From ab16447c14addc0342568a4c9d3164d289bb2de5 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 3 Jan 2023 21:26:34 +0800 Subject: [PATCH] Fix not saving all the tweets in a thread --- .../src/websites/twitter-handler.ts | 54 +++++++++++-------- packages/puppeteer-parse/index.js | 4 +- 2 files changed, 33 insertions(+), 25 deletions(-) diff --git a/packages/content-handler/src/websites/twitter-handler.ts b/packages/content-handler/src/websites/twitter-handler.ts index afb8985ef..46e635980 100644 --- a/packages/content-handler/src/websites/twitter-handler.ts +++ b/packages/content-handler/src/websites/twitter-handler.ts @@ -215,16 +215,10 @@ const getTweetIds = async ( const page = await context.newPage() await page.goto(pageURL, { - waitUntil: 'networkidle2', - timeout: 60000, + waitUntil: 'networkidle0', }) - await waitFor(4000) - return (await page.evaluate(async (author) => { - const MAX_THREAD_DEPTH = 100 - const ids: string[] = [] - /** * Wait for `ms` amount of milliseconds * @param {number} ms @@ -232,6 +226,8 @@ const getTweetIds = async ( const waitFor = (ms: number) => new Promise((resolve) => setTimeout(resolve, ms)) + const ids: Set<string> = new Set() + // Find the first Show thread button and click it const showRepliesButton = Array.from( document.querySelectorAll('div[dir="auto"]') @@ -247,30 +243,42 @@ const getTweetIds = async ( await waitFor(2000) } - const timeNodes = Array.from(document.querySelectorAll('time')) + const distance = 1080 + const scrollHeight = document.body.scrollHeight + let currentHeight = 0 + // keep scrolling until there are no more elements + while (currentHeight < scrollHeight) { + const timeNodes = Array.from(document.querySelectorAll('time')) - for (let i = 0; i < timeNodes.length && i < MAX_THREAD_DEPTH; i++) { - const timeContainerAnchor: HTMLAnchorElement | HTMLSpanElement | null = - timeNodes[i].parentElement - if (!timeContainerAnchor) continue + for (let i = 0; i < timeNodes.length; i++) { + const timeContainerAnchor: + | 
HTMLAnchorElement + | HTMLSpanElement + | null = timeNodes[i].parentElement + if (!timeContainerAnchor) continue - if (timeContainerAnchor.tagName === 'SPAN') continue + if (timeContainerAnchor.tagName === 'SPAN') continue - const href = timeContainerAnchor.getAttribute('href') - if (!href) continue + const href = timeContainerAnchor.getAttribute('href') + if (!href) continue - // Get the tweet id and username from the href: https://twitter.com/username/status/1234567890 - const match = href.match(/\/([^/]+)\/status\/(\d+)/) - if (!match) continue + // Get the tweet id and username from the href: https://twitter.com/username/status/1234567890 + const match = href.match(/\/([^/]+)\/status\/(\d+)/) + if (!match) continue - const id = match[2] - const username = match[1] + const id = match[2] + const username = match[1] - // skip non-author replies - username === author && ids.push(id) + // skip non-author replies + username === author && ids.add(id) + } + + window.scrollBy(0, distance) + await waitFor(100) + currentHeight += distance } - return ids + return Array.from(ids) }, author)) as string[] } catch (error) { console.log(error) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index f2ec56f42..071dea4a2 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -534,7 +534,7 @@ async function retrieveHtml(page, logRecord) { logRecord.title = title; const pageScrollingStart = Date.now(); - /* scroll with a 5 second timeout */ + /* scroll with a 5 seconds timeout */ await Promise.race([ new Promise(resolve => { (async function () { @@ -562,7 +562,7 @@ async function retrieveHtml(page, logRecord) { } })(); }), - await page.waitForTimeout(1000), + await page.waitForTimeout(5000), ]); logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };