Add Readability parsing to puppeteer-parse

This commit is contained in:
Hongbo Wu
2022-12-27 21:22:49 +08:00
parent 2190acc0e0
commit 6a49689d1e
2 changed files with 16 additions and 2 deletions

View File

@ -13,6 +13,7 @@ const os = require('os');
const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const { preHandleContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability");
const puppeteer = require('puppeteer-extra');
@ -280,6 +281,8 @@ async function fetchContent(req, res) {
logRecord.fetchContentTime = Date.now() - functionStartTime;
const readabilityResult = content ? (await getReadabilityResult(url, content)) : null;
const apiResponse = await sendCreateArticleMutation(userId, {
url: finalUrl,
articleSavingRequestId,
@ -290,7 +293,8 @@ async function fetchContent(req, res) {
canonicalUrl: finalUrl,
},
},
skipParsing: !content,
skipParsing: !!readabilityResult,
readabilityResult,
});
logRecord.totalTime = Date.now() - functionStartTime;
@ -306,6 +310,8 @@ async function fetchContent(req, res) {
const content = sbResult.domContent;
logRecord.fetchContentTime = Date.now() - functionStartTime;
const readabilityResult = content ? (await getReadabilityResult(url, content)) : null;
const apiResponse = await sendCreateArticleMutation(userId, {
url: sbUrl,
articleSavingRequestId,
@ -316,7 +322,8 @@ async function fetchContent(req, res) {
canonicalUrl: sbUrl,
},
},
skipParsing: !content,
skipParsing: !!readabilityResult,
readabilityResult,
});
logRecord.totalTime = Date.now() - functionStartTime;
@ -758,6 +765,12 @@ async function preview(req, res) {
return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
}
/**
 * Run the Readability extractor over a page's HTML.
 *
 * The markup is parsed with `parseHTML`, and the resulting `document` is
 * handed to a new `Readability` instance together with the page URL.
 *
 * @param {string} url - URL of the page; forwarded to Readability as the `url` option.
 * @param {string} domContent - HTML markup to parse.
 * @returns {Promise<*>} Resolves with whatever `Readability#parse()` returns.
 *   NOTE(review): presumably a falsy value when nothing could be extracted —
 *   confirm against @omnivore/readability.
 */
async function getReadabilityResult(url, domContent) {
  const { document } = parseHTML(domContent);
  return new Readability(document, { url }).parse();
}
module.exports = {
fetchContent,
preview,

View File

@ -6,6 +6,7 @@
"dependencies": {
"@google-cloud/storage": "^5.18.1",
"@omnivore/content-handler": "1.0.0",
"@omnivore/readability": "1.0.0",
"axios": "^0.27.2",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9",