diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 431f6619e..8fd3a4d30 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -184,7 +184,8 @@ export const parsePreparedContent = async ( labels: { source: 'parsePreparedContent' }, } - let article = null + // If we have a parse result, use it + let article = parseResult || null let highlightData = undefined const { document, pageInfo } = preparedDocument @@ -205,14 +206,16 @@ export const parsePreparedContent = async ( let dom = parseHTML(document).document - // preParse content - const preParsedDom = await preParseContent(url, dom) - preParsedDom && (dom = preParsedDom) - try { - article = - parseResult || - (await getReadabilityResult(url, document, dom, isNewsletter)) + if (!article) { + // Attempt to parse the article + // preParse content + const preParsedDom = await preParseContent(url, dom) + preParsedDom && (dom = preParsedDom) + + article = await getReadabilityResult(url, document, dom, isNewsletter) + } + if (!article?.textContent && allowRetry) { const newDocument = { ...preparedDocument, diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 0ee12be6e..2eb417901 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -15,7 +15,7 @@ const signToken = promisify(jwt.sign); const os = require('os'); const { Storage } = require('@google-cloud/storage'); const { parseHTML } = require('linkedom'); -const { preHandleContent } = require("@omnivore/content-handler"); +const { preHandleContent, preParseContent } = require("@omnivore/content-handler"); const { Readability } = require("@omnivore/readability"); const puppeteer = require('puppeteer-extra'); @@ -314,7 +314,18 @@ async function fetchContent(req, res) { logRecord.fetchContentTime = Date.now() - functionStartTime; - const readabilityResult = content ? (await getReadabilityResult(url, content)) : null; + let readabilityResult = null; + if (content) { + let document = parseHTML(content).document; + + // preParse content + const preParsedDom = await preParseContent(url, document) + if (preParsedDom) { + document = preParsedDom + } + + readabilityResult = await getReadabilityResult(url, document); + } const apiResponse = await sendSavePageMutation(userId, { url: finalUrl,