diff --git a/packages/content-handler/package.json b/packages/content-handler/package.json
index dc4edb4b1..6f5c18197 100644
--- a/packages/content-handler/package.json
+++ b/packages/content-handler/package.json
@@ -32,6 +32,7 @@
     "axios": "^0.27.2",
     "linkedom": "^0.14.16",
     "luxon": "^3.0.4",
+    "puppeteer-core": "^19.1.1",
     "rfc2047": "^4.0.1",
     "underscore": "^1.13.6",
     "uuid": "^9.0.0"
diff --git a/packages/content-handler/src/websites/twitter-handler.ts b/packages/content-handler/src/websites/twitter-handler.ts
index 3aa820468..06549624f 100644
--- a/packages/content-handler/src/websites/twitter-handler.ts
+++ b/packages/content-handler/src/websites/twitter-handler.ts
@@ -2,6 +2,7 @@ import { ContentHandler, PreHandleResult } from '../content-handler'
 import axios from 'axios'
 import { DateTime } from 'luxon'
 import _ from 'underscore'
+import puppeteer from 'puppeteer-core'
 
 interface TweetIncludes {
   users: {
@@ -134,87 +135,106 @@ const formatTimestamp = (timestamp: string) => {
   )
 }
 
+/**
+ * Wait for `ms` amount of milliseconds
+ * @param {number} ms
+ */
+const waitFor = async (ms: number) =>
+  new Promise((resolve) => setTimeout(resolve, ms))
+
+/**
+ * Get tweets(even older than 7 days) using puppeteer
+ * @param {string} tweetId
+ */
+const getTweetIdsFromThreadId = async (tweetId: string): Promise<string[]> => {
+  const pageURL = `https://twitter.com/anyone/status/${tweetId}`
+
+  // Modify this variable to control the size of viewport
+  const factor = 0.2
+  const height = Math.floor(2000 / factor)
+  const width = Math.floor(1700 / factor)
+
+  // NOTE(review): puppeteer-core ships without a bundled browser, so launch()
+  // presumably needs an `executablePath` or `channel` option - confirm.
+  const browser = await puppeteer.launch({
+    headless: true,
+    defaultViewport: {
+      width,
+      height,
+    },
+    args: [
+      `--force-device-scale-factor=${factor}`,
+      `--window-size=${width},${height}`,
+    ],
+  })
+
+  // Close the browser even when navigation or evaluation throws, so a failed
+  // scrape does not leave a Chromium process running.
+  try {
+    const page = await browser.newPage()
+
+    await page.goto(pageURL, {
+      waitUntil: 'networkidle2',
+      timeout: 5 * 60 * 1000,
+    })
+
+    await waitFor(4000)
+
+    /** @type {string[]} */
+    const tweetIds = (await page.evaluate(async () => {
+      const ids: string[] = []
+
+      // This callback is serialized and executed inside the page, where the
+      // module-level `waitFor` does not exist; define the delay locally.
+      const waitFor = (ms: number) =>
+        new Promise((resolve) => setTimeout(resolve, ms))
+
+      // Find the first Show replies button and click it
+      const showRepliesButton = Array.from(
+        document.querySelectorAll('div[dir="auto"]')
+      )
+        .filter(
+          (node) => node.children[0] && node.children[0].tagName === 'SPAN'
+        )
+        .find((node) => node.children[0].innerHTML === 'Show replies')
+
+      if (showRepliesButton) {
+        ;(showRepliesButton as HTMLElement).click()
+
+        await waitFor(2000)
+      }
+
+      const timeNodes = Array.from(document.querySelectorAll('time'))
+
+      for (const timeNode of timeNodes) {
+        /** @type {HTMLAnchorElement | HTMLSpanElement} */
+        const timeContainerAnchor = timeNode.parentElement
+        if (!timeContainerAnchor) continue
+
+        if (timeContainerAnchor?.tagName === 'SPAN') continue
+
+        const id = (timeContainerAnchor as HTMLAnchorElement).href
+          .split('/')
+          .reverse()[0]
+
+        ids.push(id)
+      }
+
+      return ids
+    })) as string[]
+
+    return [tweetId, ...tweetIds]
+  } finally {
+    await browser.close()
+  }
+}
+
 export class TwitterHandler extends ContentHandler {
   constructor() {
     super()
     this.name = 'Twitter'
   }
 
-  async getTweetIdsFromThreadId(tweetID: string): Promise {
-    const pageURL = `https://twitter.com/anyone/status/${tweetID}`
-
-    // Modify this variable to control the size of viewport
-    const factor = 0.2
-    const height = Math.floor(2000 / factor)
-    const width = Math.floor(1700 / factor)
-
-    const browser = await puppeteer.launch({
-      headless: true,
-      defaultViewport: {
-        width,
-        height,
-      },
-      args: [
-        `--force-device-scale-factor=${factor}`,
-        `--window-size=${width},${height}`,
-      ],
-    })
-
-    const page = await browser.newPage()
-
-    await page.goto(pageURL, {
-      waitUntil: 'networkidle2',
-      timeout: 5 * 60 * 1000,
-    })
-
-    await waitFor(4000)
-
-    /** @type {string[]} */
-    const tweetIDs = await page.evaluate(async () => {
-      const ids = []
-
-      /**
-       * Wait for `ms` amount of milliseconds
-       * @param {number} ms
-       */
-      const waitFor = (ms) => new Promise((resolve) => setTimeout(resolve, ms))
-
-      // Find the first Show thread button and click it
-      const showRepliesButton = [
-        ...document.querySelectorAll('div[dir="auto"]'),
-      ]
-        .filter(
-          (node) => node.children[0] && node.children[0].tagName === 'SPAN'
-        )
-        .find((node) => node.children[0].innerHTML === 'Show replies')
-
-      if (showRepliesButton) {
-        showRepliesButton.click()
-
-        await waitFor(2000)
-      }
-
-      const timeNodes = Array.from(document.querySelectorAll('time'))
-
-      for (const timeNode of timeNodes) {
-        /** @type {HTMLAnchorElement | HTMLSpanElement} */
-        const timeContainerAnchor = timeNode.parentElement
-
-        if (timeContainerAnchor.tagName === 'SPAN') continue
-
-        const id = timeContainerAnchor.href.split('/').reverse()[0]
-
-        ids.push(id)
-      }
-
-      return ids
-    })
-
-    await browser.close()
-
-    return [tweetID, ...tweetIDs]
-  }
-
   shouldPreHandle(url: string): boolean {
     return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
   }
diff --git a/yarn.lock b/yarn.lock
index a90c7857b..27452d1b1 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -12333,6 +12333,11 @@ devtools-protocol@0.0.1019158:
   resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
   integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
 
+devtools-protocol@0.0.1045489:
+  version "0.0.1045489"
+  resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1045489.tgz#f959ad560b05acd72d55644bc3fb8168a83abf28"
+  integrity sha512-D+PTmWulkuQW4D1NTiCRCFxF7pQPn0hgp4YyX4wAQ6xYXKOadSWPR3ENGDQ47MW/Ewc9v2rpC/UEEGahgBYpSQ==
+
 devtools-protocol@0.0.901419:
   version "0.0.901419"
   resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.901419.tgz#79b5459c48fe7e1c5563c02bd72f8fec3e0cebcd"
@@ -21372,6 +21377,22 @@ puppeteer-core@^16.1.0:
     unbzip2-stream "1.4.3"
     ws "8.8.1"
 
+puppeteer-core@^19.1.1:
+  version "19.1.1"
+  resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-19.1.1.tgz#6416ff925a9cc78523c490482a17a2998f7c0626"
+  integrity sha512-jV26Ke0VFel4MoXLjqm50uAW2uwksTP6Md1tvtXqWqXM5FyboKI6E9YYJ1qEQilUAqlhgGq8xLN5+SL8bPz/kw==
+  dependencies:
+    cross-fetch "3.1.5"
+    debug "4.3.4"
+    devtools-protocol "0.0.1045489"
+    extract-zip "2.0.1"
+    https-proxy-agent "5.0.1"
+    proxy-from-env "1.1.0"
+    rimraf "3.0.2"
+    tar-fs "2.1.1"
+    unbzip2-stream "1.4.3"
+    ws "8.9.0"
+
 puppeteer@^10.1.0:
   version "10.4.0"
   resolved "https://registry.yarnpkg.com/puppeteer/-/puppeteer-10.4.0.tgz#a6465ff97fda0576c4ac29601406f67e6fea3dc7"
@@ -25896,6 +25917,11 @@ ws@8.8.1, ws@^8.2.3, ws@^8.3.0, ws@^8.4.2:
   resolved "https://registry.yarnpkg.com/ws/-/ws-8.8.1.tgz#5dbad0feb7ade8ecc99b830c1d77c913d4955ff0"
   integrity sha512-bGy2JzvzkPowEJV++hF07hAD6niYSr0JzBNo/J29WsB57A2r7Wlc1UFcTR9IzrPvuNVO4B8LGqF8qcpsVOhJCA==
 
+ws@8.9.0:
+  version "8.9.0"
+  resolved "https://registry.yarnpkg.com/ws/-/ws-8.9.0.tgz#2a994bb67144be1b53fe2d23c53c028adeb7f45e"
+  integrity sha512-Ja7nszREasGaYUYCI2k4lCKIRTt+y7XuqVoHR44YpI49TtryyqbqvDMn5eqfW7e6HzTukDRIsXqzVHScqRcafg==
+
 "ws@^5.2.0 || ^6.0.0 || ^7.0.0", ws@^7.3.1, ws@^7.4.6:
   version "7.5.7"
   resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.7.tgz#9e0ac77ee50af70d58326ecff7e85eb3fa375e67"