Get tweet reply ids from thread id using puppeteer

This commit is contained in:
Hongbo Wu
2022-10-25 22:09:16 +08:00
parent 343c8fb5cc
commit 5f0d0ed69b
3 changed files with 109 additions and 75 deletions

View File

@ -32,6 +32,7 @@
"axios": "^0.27.2",
"linkedom": "^0.14.16",
"luxon": "^3.0.4",
"puppeteer-core": "^19.1.1",
"rfc2047": "^4.0.1",
"underscore": "^1.13.6",
"uuid": "^9.0.0"

View File

@ -2,6 +2,7 @@ import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { DateTime } from 'luxon'
import _ from 'underscore'
import puppeteer from 'puppeteer-core'
interface TweetIncludes {
users: {
@ -134,87 +135,93 @@ const formatTimestamp = (timestamp: string) => {
)
}
/**
 * Pause for a fixed delay.
 * @param ms - how long to wait, in milliseconds
 * @returns a promise that resolves once the timer has fired
 */
const waitFor = (ms: number): Promise<unknown> =>
  new Promise<unknown>((done) => {
    setTimeout(done, ms)
  })
/**
 * Get tweets (even older than 7 days) using puppeteer.
 *
 * Opens the tweet's page in a headless browser, clicks the first
 * "Show replies" button when one is present, and reads a tweet id from the
 * permalink anchor that wraps each `<time>` element on the page.
 *
 * @param tweetId - id of the tweet whose page is opened
 * @returns `tweetId` followed by every id scraped from the page, in DOM order
 */
const getTweetIdsFromThreadId = async (tweetId: string): Promise<string[]> => {
  const pageURL = `https://twitter.com/anyone/status/${tweetId}`

  // Modify this variable to control the size of viewport
  const factor = 0.2
  const height = Math.floor(2000 / factor)
  const width = Math.floor(1700 / factor)

  // NOTE(review): puppeteer-core does not bundle a browser; confirm that an
  // executable path / channel is configured for this launch call.
  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: {
      width,
      height,
    },
    args: [
      `--force-device-scale-factor=${factor}`,
      `--window-size=${width},${height}`,
    ],
  })

  // Always close the browser, even when navigation or scraping throws,
  // so a failed request does not leave a browser process behind.
  try {
    const page = await browser.newPage()
    await page.goto(pageURL, {
      waitUntil: 'networkidle2',
      timeout: 5 * 60 * 1000,
    })
    await waitFor(4000)

    // Callbacks given to `page.evaluate` run inside the page, so they must not
    // reference Node-side helpers such as `waitFor`. Only the click happens in
    // the page; the pause afterwards is done from Node.
    const clickedShowReplies = await page.evaluate(() => {
      // Find the first "Show replies" button and click it
      const showRepliesButton = Array.from(
        document.querySelectorAll<HTMLElement>('div[dir="auto"]')
      ).find((node) => {
        const label = node.children[0]
        return label?.tagName === 'SPAN' && label.innerHTML === 'Show replies'
      })
      if (!showRepliesButton) return false
      showRepliesButton.click()
      return true
    })
    if (clickedShowReplies) {
      await waitFor(2000)
    }

    const tweetIds = await page.evaluate(() => {
      const ids: string[] = []
      for (const timeNode of Array.from(document.querySelectorAll('time'))) {
        const timeContainerAnchor = timeNode.parentElement
        // Only a <time> wrapped in an anchor carries a permalink; skip a
        // missing parent, a SPAN wrapper, or any other non-anchor parent.
        if (!(timeContainerAnchor instanceof HTMLAnchorElement)) continue
        // The id is the last path segment of the permalink
        const id = timeContainerAnchor.href.split('/').reverse()[0]
        ids.push(id)
      }
      return ids
    })

    return [tweetId, ...tweetIds]
  } finally {
    await browser.close()
  }
}
export class TwitterHandler extends ContentHandler {
/**
 * Initializes the ContentHandler base class, then sets this handler's
 * `name` to 'Twitter'.
 */
constructor() {
super()
this.name = 'Twitter'
}
/**
 * Get the ids of the tweets shown on a tweet's page using puppeteer.
 *
 * Opens the tweet's page in a headless browser, clicks the first
 * "Show replies" button when one is present, and reads a tweet id from the
 * permalink anchor that wraps each `<time>` element on the page.
 *
 * @param tweetID - id of the tweet whose page is opened
 * @returns `tweetID` followed by every id scraped from the page, in DOM order
 */
async getTweetIdsFromThreadId(tweetID: string): Promise<string[]> {
  const pageURL = `https://twitter.com/anyone/status/${tweetID}`

  // Modify this variable to control the size of viewport
  const factor = 0.2
  const height = Math.floor(2000 / factor)
  const width = Math.floor(1700 / factor)

  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: {
      width,
      height,
    },
    args: [
      `--force-device-scale-factor=${factor}`,
      `--window-size=${width},${height}`,
    ],
  })

  // Always close the browser, even when navigation or scraping throws,
  // so a failed request does not leave a browser process behind.
  try {
    const page = await browser.newPage()
    await page.goto(pageURL, {
      waitUntil: 'networkidle2',
      timeout: 5 * 60 * 1000,
    })
    await waitFor(4000)

    const tweetIDs = await page.evaluate(async () => {
      const ids: string[] = []

      // This callback runs inside the page, so the delay helper has to be
      // defined here rather than taken from the surrounding module.
      const sleepInPage = (ms: number) =>
        new Promise<void>((resolve) => setTimeout(resolve, ms))

      // Find the first "Show replies" button and click it
      const showRepliesButton = Array.from(
        document.querySelectorAll<HTMLElement>('div[dir="auto"]')
      ).find((node) => {
        const label = node.children[0]
        return label?.tagName === 'SPAN' && label.innerHTML === 'Show replies'
      })
      if (showRepliesButton) {
        showRepliesButton.click()
        await sleepInPage(2000)
      }

      for (const timeNode of Array.from(document.querySelectorAll('time'))) {
        const timeContainerAnchor = timeNode.parentElement
        // Only a <time> wrapped in an anchor carries a permalink; skip a
        // missing parent, a SPAN wrapper, or any other non-anchor parent.
        if (!(timeContainerAnchor instanceof HTMLAnchorElement)) continue
        // The id is the last path segment of the permalink
        const id = timeContainerAnchor.href.split('/').reverse()[0]
        ids.push(id)
      }
      return ids
    })

    return [tweetID, ...tweetIDs]
  } finally {
    await browser.close()
  }
}
/**
 * Decide whether this handler should pre-handle the given URL.
 * @param url - URL to check
 * @returns true only when TWITTER_BEARER_TOKEN is truthy and the URL
 *   matches TWITTER_URL_MATCH
 */
shouldPreHandle(url: string): boolean {
  // Without a bearer token the URL pattern is never consulted
  if (!TWITTER_BEARER_TOKEN) {
    return false
  }
  const candidate = url.toString()
  return TWITTER_URL_MATCH.test(candidate)
}

View File

@ -12333,6 +12333,11 @@ devtools-protocol@0.0.1019158:
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1019158.tgz#4b08d06108a784a2134313149626ba55f030a86f"
integrity sha512-wvq+KscQ7/6spEV7czhnZc9RM/woz1AY+/Vpd8/h2HFMwJSdTliu7f/yr1A6vDdJfKICZsShqsYpEQbdhg8AFQ==
devtools-protocol@0.0.1045489:
version "0.0.1045489"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.1045489.tgz#f959ad560b05acd72d55644bc3fb8168a83abf28"
integrity sha512-D+PTmWulkuQW4D1NTiCRCFxF7pQPn0hgp4YyX4wAQ6xYXKOadSWPR3ENGDQ47MW/Ewc9v2rpC/UEEGahgBYpSQ==
devtools-protocol@0.0.901419:
version "0.0.901419"
resolved "https://registry.yarnpkg.com/devtools-protocol/-/devtools-protocol-0.0.901419.tgz#79b5459c48fe7e1c5563c02bd72f8fec3e0cebcd"
@ -21372,6 +21377,22 @@ puppeteer-core@^16.1.0:
unbzip2-stream "1.4.3"
ws "8.8.1"
puppeteer-core@^19.1.1:
version "19.1.1"
resolved "https://registry.yarnpkg.com/puppeteer-core/-/puppeteer-core-19.1.1.tgz#6416ff925a9cc78523c490482a17a2998f7c0626"
integrity sha512-jV26Ke0VFel4MoXLjqm50uAW2uwksTP6Md1tvtXqWqXM5FyboKI6E9YYJ1qEQilUAqlhgGq8xLN5+SL8bPz/kw==
dependencies:
cross-fetch "3.1.5"
debug "4.3.4"
devtools-protocol "0.0.1045489"
extract-zip "2.0.1"
https-proxy-agent "5.0.1"
proxy-from-env "1.1.0"
rimraf "3.0.2"
tar-fs "2.1.1"
unbzip2-stream "1.4.3"
ws "8.9.0"
puppeteer@^10.1.0:
version "10.4.0"
resolved "https://registry.yarnpkg.com/puppeteer/-/puppeteer-10.4.0.tgz#a6465ff97fda0576c4ac29601406f67e6fea3dc7"
@ -25896,6 +25917,11 @@ ws@8.8.1, ws@^8.2.3, ws@^8.3.0, ws@^8.4.2:
resolved "https://registry.yarnpkg.com/ws/-/ws-8.8.1.tgz#5dbad0feb7ade8ecc99b830c1d77c913d4955ff0"
integrity sha512-bGy2JzvzkPowEJV++hF07hAD6niYSr0JzBNo/J29WsB57A2r7Wlc1UFcTR9IzrPvuNVO4B8LGqF8qcpsVOhJCA==
ws@8.9.0:
version "8.9.0"
resolved "https://registry.yarnpkg.com/ws/-/ws-8.9.0.tgz#2a994bb67144be1b53fe2d23c53c028adeb7f45e"
integrity sha512-Ja7nszREasGaYUYCI2k4lCKIRTt+y7XuqVoHR44YpI49TtryyqbqvDMn5eqfW7e6HzTukDRIsXqzVHScqRcafg==
"ws@^5.2.0 || ^6.0.0 || ^7.0.0", ws@^7.3.1, ws@^7.4.6:
version "7.5.7"
resolved "https://registry.yarnpkg.com/ws/-/ws-7.5.7.tgz#9e0ac77ee50af70d58326ecff7e85eb3fa375e67"