From 145a8b973060e3223ddfa99fdaf3d8581f1f5317 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 12 Jan 2023 22:03:29 +0800 Subject: [PATCH] Fix last two tweets not saved in a long thread by waiting for document loaded after scrolling --- .../content-handler/src/websites/twitter-handler.ts | 6 ++++-- packages/puppeteer-parse/index.js | 13 ++++++++++++- 2 files changed, 16 insertions(+), 3 deletions(-) diff --git a/packages/content-handler/src/websites/twitter-handler.ts b/packages/content-handler/src/websites/twitter-handler.ts index 46e635980..c745f6d66 100644 --- a/packages/content-handler/src/websites/twitter-handler.ts +++ b/packages/content-handler/src/websites/twitter-handler.ts @@ -274,7 +274,7 @@ const getTweetIds = async ( } window.scrollBy(0, distance) - await waitFor(100) + await waitFor(500) currentHeight += distance } @@ -364,6 +364,7 @@ export class TwitterHandler extends ContentHandler { ` const content = ` + @@ -375,7 +376,8 @@ export class TwitterHandler extends ContentHandler { ${tweetsContent} ${tweetUrl} - ` + +` return { content, url, title } } diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 2eb417901..fa25b4732 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -348,7 +348,18 @@ async function fetchContent(req, res) { const content = sbResult.domContent; logRecord.fetchContentTime = Date.now() - functionStartTime; - const readabilityResult = content ? (await getReadabilityResult(url, content)) : null; + let readabilityResult = null; + if (content) { + let document = parseHTML(content).document; + + // preParse content + const preParsedDom = await preParseContent(sbUrl, document) + if (preParsedDom) { + document = preParsedDom + } + + readabilityResult = await getReadabilityResult(url, document); + } const apiResponse = await sendSavePageMutation(userId, { url: finalUrl,