Skip to content

Commit

Permalink
separar scraper de sitio
Browse files Browse the repository at this point in the history
  • Loading branch information
catdevnull committed Feb 26, 2024
1 parent d2cafe7 commit 90f860d
Show file tree
Hide file tree
Showing 92 changed files with 2,746 additions and 669 deletions.
8 changes: 4 additions & 4 deletions .github/workflows/container.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,14 +45,14 @@ jobs:
id: meta
uses: docker/metadata-action@v5
with:
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}
images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio
- name: Build and push Docker image
uses: docker/build-push-action@v5
with:
context: .
file: Dockerfile
file: Dockerfile.sitio
push: true
tags: ${{ steps.meta.outputs.tags }}
labels: ${{ steps.meta.outputs.labels }}
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}:buildcache,mode=max
cache-from: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio:buildcache
cache-to: type=registry,ref=${{ env.REGISTRY }}/${{ env.IMAGE_NAME }}/sitio:buildcache,mode=max
14 changes: 0 additions & 14 deletions .prettierrc

This file was deleted.

7 changes: 4 additions & 3 deletions Dockerfile → Dockerfile.sitio
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,11 @@ WORKDIR /usr/src/app
FROM base as build
RUN apk add --no-cache nodejs npm
RUN npm install --global pnpm
COPY package.json package.json
COPY pnpm-lock.yaml .

COPY sitio/package.json .
COPY sitio/pnpm-lock.yaml .
RUN pnpm install
COPY . .
COPY sitio/ .
RUN pnpm install && \
pnpm build

Expand Down
50 changes: 31 additions & 19 deletions scraper/index.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,25 @@
import puppeteer, { Browser, Page, type CookieParam } from "puppeteer";
import * as schema from "../src/schema.js";
import { connectDb } from "../src/lib/connectDb.js";
import * as schema from "./schema.js";
import { sql, eq } from "drizzle-orm";
import { LibSQLDatabase, drizzle } from "drizzle-orm/libsql";
import { migrate } from "drizzle-orm/libsql/migrator";
import { createClient } from "@libsql/client";
import { z } from "zod";
import { mkdir, writeFile } from "fs/promises";
import { nanoid } from "nanoid";
import { JMILEI_ID } from "../src/lib/consts.js";
import { LibSQLDatabase } from "drizzle-orm/libsql";
import { migrate } from "drizzle-orm/libsql/migrator";
import { JMILEI_ID } from "../sitio/src/lib/consts.js";

/**
 * Opens a libsql client for the given database URL and wraps it in a
 * drizzle ORM instance bound to the project schema.
 *
 * @param options.url       libsql connection URL (e.g. file: or libsql: scheme)
 * @param options.authToken optional auth token for remote Turso databases
 * @returns a drizzle database handle typed with the project schema
 */
async function connectDb(options: { url: string; authToken?: string }) {
  const { url, authToken } = options;
  // createClient accepts the same { url, authToken } shape directly.
  return drizzle(createClient({ url, authToken }), { schema });
}

import {
command,
Expand Down Expand Up @@ -104,7 +116,7 @@ const dev = process.env.NODE_ENV !== "production";
function cookiesFromAccountData(cuenta: schema.Cuenta): Array<CookieParam> {
if (!cuenta.accountDataJson) throw new Error("falta token");
const tokens = schema.zTokenAccountData.parse(
JSON.parse(cuenta.accountDataJson),
JSON.parse(cuenta.accountDataJson)
);
const cookies: Array<CookieParam> = [
{
Expand Down Expand Up @@ -205,7 +217,7 @@ const zTimelineAddEntriesEntry = z.object({
entryType: z.literal("TimelineTimelineModule"),
}),
]),
}),
})
),
});
const zUserTweetsRes = z.object({
Expand All @@ -221,7 +233,7 @@ const zUserTweetsRes = z.object({
z.object({ type: z.literal("TimelineTerminateTimeline") }),
z.object({ type: z.literal("TimelinePinEntry") }),
zTimelineAddEntriesEntry,
]),
])
),
}),
}),
Expand Down Expand Up @@ -341,7 +353,7 @@ class Scraper {
await mkdir("debug-api-responses", { recursive: true });
await writeFile(
`debug-api-responses/${+new Date()}-${nanoid()}.json`,
JSON.stringify(json, undefined, 2),
JSON.stringify(json, undefined, 2)
);
}

Expand All @@ -350,23 +362,23 @@ class Scraper {
parsed.data.user.result.timeline_v2.timeline.instructions
.filter(
(x): x is z.infer<typeof zTimelineAddEntriesEntry> =>
"entries" in x,
"entries" in x
)
.flatMap((x) =>
x.entries
.map((e) => e.content)
.filter(
(y): y is TimelineTimelineItem =>
y.entryType === "TimelineTimelineItem",
),
y.entryType === "TimelineTimelineItem"
)
)
// filtrar publicidades
.filter((e) => !e.itemContent.tweet_results.promotedMetadata)
.map((e) => e.itemContent.tweet_results.result)
// filtrar publicidades
.filter(
(e): e is z.infer<typeof zUserTweetsTweetResultTweet> =>
e.__typename === "Tweet",
e.__typename === "Tweet"
);
for (const entry of entries) {
map.set(entry.legacy.id_str, entry.legacy);
Expand Down Expand Up @@ -396,7 +408,7 @@ class Scraper {
if (tweet.user_id_str !== JMILEI_ID) {
// esto suelen ser publicidades que no me fije bien como filtrar
console.warn(
`tweet que no es de milei en feed https://twitter.com/${tweet.user_id_str}/status/${tweet.id_str}`,
`tweet que no es de milei en feed https://twitter.com/${tweet.user_id_str}/status/${tweet.id_str}`
);
return false;
}
Expand All @@ -405,7 +417,7 @@ class Scraper {
const retweets = allTweets
.filter(
(tweet): tweet is UserTweetsRetweetTweet =>
"retweeted_status_result" in tweet,
"retweeted_status_result" in tweet
)
.map(
(tweet): schema.Retweet => ({
Expand All @@ -419,7 +431,7 @@ class Scraper {
postedAt: tweet.retweeted_status_result.result.legacy.created_at,
retweetAt: tweet.created_at,
text: tweet.retweeted_status_result.result.legacy.full_text,
}),
})
);

// XXX: ojo que puede ser que estemos contando tweets aunque no estemos logeadxs.. pero son tweets viejos populares que twitter muestra cuando no estamos logeadxs
Expand Down Expand Up @@ -473,12 +485,12 @@ class Scraper {
throw new Error("no es un link");
const href = linkEl?.href;
const text = x.querySelector(
"[data-testid=tweetText]",
"[data-testid=tweetText]"
)?.textContent;

return { href, text };
}),
sel,
sel
);

for (const { href, text } of got) {
Expand All @@ -504,7 +516,7 @@ class Scraper {
async scrapLikedTweets(
n: number = 10,
cuenta: schema.Cuenta,
scrapId: number,
scrapId: number
): Promise<number> {
return await this.usePage(async (page) => {
await this.setupAccountInPage(cuenta, page);
Expand Down
25 changes: 25 additions & 0 deletions scraper/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "scraper",
"version": "1.0.0",
"description": "",
"main": "index.js",
"scripts": {
"start": "esrun index.ts"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"@libsql/client": "^0.5.2",
"cmd-ts": "^0.13.0",
"drizzle-orm": "^0.29.4",
"nanoid": "^5.0.6",
"puppeteer": "^22.3.0",
"zod": "^3.22.4"
},
"devDependencies": {
"@types/node": "^20.11.20",
"esrun": "^3.2.26",
"prettier": "^3.2.5"
}
}
Loading

0 comments on commit 90f860d

Please sign in to comment.