Fix Readability not being run in puppeteer-parse

This commit is contained in:
Hongbo Wu
2023-01-12 18:33:47 +08:00
parent 126373a10d
commit 7bfd36e308
2 changed files with 24 additions and 10 deletions

View File

@@ -184,7 +184,8 @@ export const parsePreparedContent = async (
labels: { source: 'parsePreparedContent' },
}
let article = null
// If we have a parse result, use it
let article = parseResult || null
let highlightData = undefined
const { document, pageInfo } = preparedDocument
@@ -205,14 +206,16 @@ export const parsePreparedContent = async (
let dom = parseHTML(document).document
// preParse content
const preParsedDom = await preParseContent(url, dom)
preParsedDom && (dom = preParsedDom)
try {
article =
parseResult ||
(await getReadabilityResult(url, document, dom, isNewsletter))
if (!article) {
// Attempt to parse the article
// preParse content
const preParsedDom = await preParseContent(url, dom)
preParsedDom && (dom = preParsedDom)
article = await getReadabilityResult(url, document, dom, isNewsletter)
}
if (!article?.textContent && allowRetry) {
const newDocument = {
...preparedDocument,

View File

@@ -15,7 +15,7 @@ const signToken = promisify(jwt.sign);
const os = require('os');
const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const { preHandleContent } = require("@omnivore/content-handler");
const { preHandleContent, preParseContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability");
const puppeteer = require('puppeteer-extra');
@@ -314,7 +314,18 @@ async function fetchContent(req, res) {
logRecord.fetchContentTime = Date.now() - functionStartTime;
const readabilityResult = content ? (await getReadabilityResult(url, content)) : null;
let readabilityResult = null;
if (content) {
let document = parseHTML(content).document;
// preParse content
const preParsedDom = await preParseContent(url, document)
if (preParsedDom) {
document = preParsedDom
}
readabilityResult = await getReadabilityResult(url, document);
}
const apiResponse = await sendSavePageMutation(userId, {
url: finalUrl,