From 6a49689d1eb08f2a7f5c3ff0a21cedd6125e6726 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 27 Dec 2022 21:22:49 +0800 Subject: [PATCH] Add readability in puppeteer-parse --- packages/puppeteer-parse/index.js | 17 +++++++++++++++-- packages/puppeteer-parse/package.json | 1 + 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index efce2c6e0..ae2dd942b 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -13,6 +13,7 @@ const os = require('os'); const { Storage } = require('@google-cloud/storage'); const { parseHTML } = require('linkedom'); const { preHandleContent } = require("@omnivore/content-handler"); +const { Readability } = require("@omnivore/readability"); const puppeteer = require('puppeteer-extra'); @@ -280,6 +281,8 @@ async function fetchContent(req, res) { logRecord.fetchContentTime = Date.now() - functionStartTime; + const readabilityResult = content ? (await getReadabilityResult(url, content)) : null; + const apiResponse = await sendCreateArticleMutation(userId, { url: finalUrl, articleSavingRequestId, @@ -290,7 +293,8 @@ async function fetchContent(req, res) { canonicalUrl: finalUrl, }, }, - skipParsing: !content, + skipParsing: !!readabilityResult, + readabilityResult, }); logRecord.totalTime = Date.now() - functionStartTime; @@ -306,6 +310,8 @@ async function fetchContent(req, res) { const content = sbResult.domContent; logRecord.fetchContentTime = Date.now() - functionStartTime; + const readabilityResult = content ? (await getReadabilityResult(url, content)) : null; + const apiResponse = await sendCreateArticleMutation(userId, { url: sbUrl, articleSavingRequestId, @@ -316,7 +322,8 @@ async function fetchContent(req, res) { canonicalUrl: sbUrl, }, }, - skipParsing: !content, + skipParsing: !!readabilityResult, + readabilityResult, }); logRecord.totalTime = Date.now() - functionStartTime; @@ -758,6 +765,12 @@ async function preview(req, res) { return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`); } +async function getReadabilityResult(url, domContent) { + const document = parseHTML(domContent).document; + const readability = new Readability(document, { url }); + return readability.parse(); +} + module.exports = { fetchContent, preview, diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 42b3a10a4..8798377a9 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -6,6 +6,7 @@ "dependencies": { "@google-cloud/storage": "^5.18.1", "@omnivore/content-handler": "1.0.0", + "@omnivore/readability": "1.0.0", "axios": "^0.27.2", "jsonwebtoken": "^8.5.1", "linkedom": "^0.14.9",