Fix not saving all the tweets in a thread

This commit is contained in:
Hongbo Wu
2023-01-03 21:26:34 +08:00
parent 5c215095e4
commit ab16447c14
2 changed files with 33 additions and 25 deletions

View File

@ -215,16 +215,10 @@ const getTweetIds = async (
const page = await context.newPage()
await page.goto(pageURL, {
waitUntil: 'networkidle2',
timeout: 60000,
waitUntil: 'networkidle0',
})
await waitFor(4000)
return (await page.evaluate(async (author) => {
const MAX_THREAD_DEPTH = 100
const ids: string[] = []
/**
* Wait for `ms` amount of milliseconds
* @param {number} ms
@ -232,6 +226,8 @@ const getTweetIds = async (
const waitFor = (ms: number) =>
new Promise((resolve) => setTimeout(resolve, ms))
const ids: Set<string> = new Set()
// Find the first Show thread button and click it
const showRepliesButton = Array.from(
document.querySelectorAll('div[dir="auto"]')
@ -247,30 +243,42 @@ const getTweetIds = async (
await waitFor(2000)
}
const timeNodes = Array.from(document.querySelectorAll('time'))
const distance = 1080
const scrollHeight = document.body.scrollHeight
let currentHeight = 0
// keep scrolling until we have covered the page height measured before scrolling started
while (currentHeight < scrollHeight) {
const timeNodes = Array.from(document.querySelectorAll('time'))
for (let i = 0; i < timeNodes.length && i < MAX_THREAD_DEPTH; i++) {
const timeContainerAnchor: HTMLAnchorElement | HTMLSpanElement | null =
timeNodes[i].parentElement
if (!timeContainerAnchor) continue
for (let i = 0; i < timeNodes.length; i++) {
const timeContainerAnchor:
| HTMLAnchorElement
| HTMLSpanElement
| null = timeNodes[i].parentElement
if (!timeContainerAnchor) continue
if (timeContainerAnchor.tagName === 'SPAN') continue
if (timeContainerAnchor.tagName === 'SPAN') continue
const href = timeContainerAnchor.getAttribute('href')
if (!href) continue
const href = timeContainerAnchor.getAttribute('href')
if (!href) continue
// Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
const match = href.match(/\/([^/]+)\/status\/(\d+)/)
if (!match) continue
// Get the tweet id and username from the href: https://twitter.com/username/status/1234567890
const match = href.match(/\/([^/]+)\/status\/(\d+)/)
if (!match) continue
const id = match[2]
const username = match[1]
const id = match[2]
const username = match[1]
// skip non-author replies
username === author && ids.push(id)
// skip non-author replies
username === author && ids.add(id)
}
window.scrollBy(0, distance)
await waitFor(100)
currentHeight += distance
}
return ids
return Array.from(ids)
}, author)) as string[]
} catch (error) {
console.log(error)

View File

@ -534,7 +534,7 @@ async function retrieveHtml(page, logRecord) {
logRecord.title = title;
const pageScrollingStart = Date.now();
/* scroll with a 5 second timeout */
/* scroll with a 5-second timeout */
await Promise.race([
new Promise(resolve => {
(async function () {
@ -562,7 +562,7 @@ async function retrieveHtml(page, logRecord) {
}
})();
}),
await page.waitForTimeout(1000),
await page.waitForTimeout(5000),
]);
logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };