Merge pull request #1577 from omnivore-app/readability-in-puppeteer

readability in puppeteer
This commit is contained in:
Hongbo Wu
2022-12-28 17:57:06 +08:00
committed by GitHub
13 changed files with 272 additions and 60 deletions

View File

@ -1612,6 +1612,21 @@ export enum PageType {
Website = 'WEBSITE'
}
export type ParseResult = {
byline?: InputMaybe<Scalars['String']>;
content: Scalars['String'];
dir?: InputMaybe<Scalars['String']>;
excerpt: Scalars['String'];
language?: InputMaybe<Scalars['String']>;
length: Scalars['Int'];
previewImage?: InputMaybe<Scalars['String']>;
publishedDate?: InputMaybe<Scalars['Date']>;
siteIcon?: InputMaybe<Scalars['String']>;
siteName?: InputMaybe<Scalars['String']>;
textContent: Scalars['String'];
title: Scalars['String'];
};
export type PreparedDocumentInput = {
document: Scalars['String'];
pageInfo: PageInfoInput;
@ -2071,6 +2086,7 @@ export type SaveFilterSuccess = {
export type SavePageInput = {
clientRequestId: Scalars['ID'];
originalContent: Scalars['String'];
parseResult?: InputMaybe<ParseResult>;
source: Scalars['String'];
title?: InputMaybe<Scalars['String']>;
url: Scalars['String'];
@ -3291,6 +3307,7 @@ export type ResolversTypes = {
PageInfo: ResolverTypeWrapper<PageInfo>;
PageInfoInput: PageInfoInput;
PageType: PageType;
ParseResult: ParseResult;
PreparedDocumentInput: PreparedDocumentInput;
Profile: ResolverTypeWrapper<Profile>;
Query: ResolverTypeWrapper<{}>;
@ -3704,6 +3721,7 @@ export type ResolversParentTypes = {
Page: Page;
PageInfo: PageInfo;
PageInfoInput: PageInfoInput;
ParseResult: ParseResult;
PreparedDocumentInput: PreparedDocumentInput;
Profile: Profile;
Query: {};

View File

@ -1169,6 +1169,21 @@ enum PageType {
WEBSITE
}
input ParseResult {
byline: String
content: String!
dir: String
excerpt: String!
language: String
length: Int!
previewImage: String
publishedDate: Date
siteIcon: String
siteName: String
textContent: String!
title: String!
}
input PreparedDocumentInput {
document: String!
pageInfo: PageInfoInput!
@ -1501,6 +1516,7 @@ type SaveFilterSuccess {
input SavePageInput {
clientRequestId: ID!
originalContent: String!
parseResult: ParseResult
source: String!
title: String
url: String!

View File

@ -145,9 +145,9 @@ declare module '@omnivore/readability' {
/** Article title */
title: string
/** Author metadata */
byline: string
byline?: string | null
/** Content direction */
dir: string
dir?: string | null
/** HTML string of processed article content */
content: string
/** non-HTML version of `content` */
@ -157,15 +157,14 @@ declare module '@omnivore/readability' {
/** Article description, or short excerpt from the content */
excerpt: string
/** Article site name */
siteName: string
siteName?: string | null
/** Article site icon */
siteIcon: string
siteIcon?: string | null
/** Article preview image */
previewImage?: string
previewImage?: string | null
/** Article published date */
publishedDate?: Date
dom?: Element
language?: string
publishedDate?: Date | null
language?: string | null
}
}

View File

@ -526,12 +526,28 @@ const schema = gql`
uploadFileId: ID!
}
input ParseResult {
title: String!
byline: String
dir: String
content: String!
textContent: String!
length: Int!
excerpt: String!
siteName: String
siteIcon: String
previewImage: String
publishedDate: Date
language: String
}
input SavePageInput {
url: String!
source: String!
clientRequestId: ID!
title: String
originalContent: String!
parseResult: ParseResult
}
input SaveUrlInput {

View File

@ -41,6 +41,7 @@ export const saveEmail = async (
// can leave this empty for now
},
},
null,
true
)
const content = parseResult.parsedContent?.content || input.originalContent
@ -62,15 +63,21 @@ export const saveEmail = async (
}),
pageType: parseResult.pageType,
hash: stringToHash(content),
image: metadata?.previewImage || parseResult.parsedContent?.previewImage,
publishedAt: validatedDate(parseResult.parsedContent?.publishedDate),
image:
metadata?.previewImage ||
parseResult.parsedContent?.previewImage ||
undefined,
publishedAt: validatedDate(
parseResult.parsedContent?.publishedDate ?? undefined
),
createdAt: new Date(),
savedAt: new Date(),
readingProgressAnchorIndex: 0,
readingProgressPercent: 0,
subscription: input.author,
state: ArticleSavingRequestStatus.Succeeded,
siteIcon: parseResult.parsedContent?.siteIcon,
siteIcon: parseResult.parsedContent?.siteIcon ?? undefined,
siteName: parseResult.parsedContent?.siteName ?? undefined,
}
const page = await getPageByParam({

View File

@ -77,13 +77,17 @@ export const savePage = async (
input: SavePageInput
): Promise<SaveResult> => {
const [slug, croppedPathname] = createSlug(input.url, input.title)
const parseResult = await parsePreparedContent(input.url, {
document: input.originalContent,
pageInfo: {
title: input.title,
canonicalUrl: input.url,
const parseResult = await parsePreparedContent(
input.url,
{
document: input.originalContent,
pageInfo: {
title: input.title,
canonicalUrl: input.url,
},
},
})
input.parseResult
)
const articleToSave = parsedContentToPage({
url: input.url,
@ -222,24 +226,24 @@ export const parsedContentToPage = ({
croppedPathname ||
parsedContent?.siteName ||
url,
author: parsedContent?.byline,
author: parsedContent?.byline ?? undefined,
url: normalizeUrl(canonicalUrl || url, {
stripHash: true,
stripWWW: false,
}),
pageType,
hash: uploadFileHash || stringToHash(parsedContent?.content || url),
image: parsedContent?.previewImage,
publishedAt: validatedDate(parsedContent?.publishedDate),
image: parsedContent?.previewImage ?? undefined,
publishedAt: validatedDate(parsedContent?.publishedDate ?? undefined),
uploadFileId: uploadFileId,
readingProgressPercent: 0,
readingProgressAnchorIndex: 0,
state: ArticleSavingRequestStatus.Succeeded,
createdAt: saveTime || new Date(),
savedAt: saveTime || new Date(),
siteName: parsedContent?.siteName,
language: parsedContent?.language,
siteIcon: parsedContent?.siteIcon,
siteName: parsedContent?.siteName ?? undefined,
language: parsedContent?.language ?? undefined,
siteIcon: parsedContent?.siteIcon ?? undefined,
wordsCount: wordsCount(parsedContent?.textContent || ''),
}
}

View File

@ -17,8 +17,8 @@ import { v4 as uuid } from 'uuid'
import addressparser from 'addressparser'
import { preParseContent } from '@omnivore/content-handler'
import {
findEmbeddedHighlight,
EmbeddedHighlightData,
findEmbeddedHighlight,
} from './highlightGenerator'
const logger = buildLogger('utils.parse')
@ -174,6 +174,7 @@ const getReadabilityResult = async (
export const parsePreparedContent = async (
url: string,
preparedDocument: PreparedDocumentInput,
parseResult?: Readability.ParseResult | null,
isNewsletter?: boolean,
allowRetry = true
): Promise<ParsedContentPuppeteer> => {
@ -208,20 +209,29 @@ export const parsePreparedContent = async (
preParsedDom && (dom = preParsedDom)
try {
article = await getReadabilityResult(url, document, dom, isNewsletter)
article =
parseResult ||
(await getReadabilityResult(url, document, dom, isNewsletter))
if (!article?.textContent && allowRetry) {
const newDocument = {
...preparedDocument,
document: '<html>' + preparedDocument.document + '</html>',
}
return parsePreparedContent(url, newDocument, isNewsletter, false)
return parsePreparedContent(
url,
newDocument,
parseResult,
isNewsletter,
false
)
}
// Format code blocks
// TODO: we probably want to move this type of thing
// to the handlers, and have some concept of postHandle
if (article?.dom) {
const codeBlocks = article.dom.querySelectorAll('code')
if (article?.content) {
const articleDom = parseHTML(article.content).document
const codeBlocks = articleDom.querySelectorAll('code')
if (codeBlocks.length > 0) {
codeBlocks.forEach((e) => {
if (e.textContent) {
@ -237,12 +247,10 @@ export const parsePreparedContent = async (
e.replaceWith(code)
}
})
article.content = article.dom.outerHTML
article.content = articleDom.documentElement.outerHTML
}
if (article?.dom) {
highlightData = findEmbeddedHighlight(article?.dom)
}
highlightData = findEmbeddedHighlight(articleDom.documentElement)
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
@ -251,7 +259,7 @@ export const parsePreparedContent = async (
]
// Get the top level element?
const pageNode = article.dom.firstElementChild as HTMLElement
const pageNode = articleDom.firstElementChild as HTMLElement
const nodesToVisitStack: [HTMLElement] = [pageNode]
const visitedNodeList = []
@ -281,7 +289,7 @@ export const parsePreparedContent = async (
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})
article.content = article.dom.outerHTML
article.content = articleDom.documentElement.outerHTML
}
const newWindow = parseHTML('')

View File

@ -9,7 +9,10 @@ RUN apk add --no-cache \
ca-certificates \
ttf-freefont \
nodejs \
yarn
yarn \
g++ \
make \
python3
# Add user so we don't need --no-sandbox.
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
@ -29,6 +32,7 @@ COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/readabilityjs/package.json ./packages/readabilityjs/package.json
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
@ -37,6 +41,7 @@ RUN yarn install --pure-lockfile
ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/content-handler ./packages/content-handler
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
ADD /packages/readabilityjs ./packages/readabilityjs
RUN yarn workspace @omnivore/content-handler build
# After building, fetch the production dependencies

View File

@ -9,7 +9,10 @@ RUN apk add --no-cache \
ca-certificates \
ttf-freefont \
nodejs \
yarn
yarn \
g++ \
make \
python3
# Add user so we don't need --no-sandbox.
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
@ -30,6 +33,7 @@ COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/readabilityjs/package.json ./packages/readabilityjs/package.json
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
@ -38,6 +42,7 @@ RUN yarn install --pure-lockfile
ADD /packages/content-handler ./packages/content-handler
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/readabilityjs ./packages/readabilityjs
RUN yarn workspace @omnivore/content-handler build
# After building, fetch the production dependencies

View File

@ -3,6 +3,9 @@
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
const { encode } = require("urlsafe-base64");
const crypto = require("crypto");
const Url = require('url');
// const puppeteer = require('puppeteer-extra');
const axios = require('axios');
@ -13,6 +16,7 @@ const os = require('os');
const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const { preHandleContent } = require("@omnivore/content-handler");
const { Readability } = require("@omnivore/readability");
const puppeteer = require('puppeteer-extra');
@ -22,6 +26,7 @@ puppeteer.use(StealthPlugin());
// Add adblocker plugin to block all ads and trackers (saves bandwidth)
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
const createDOMPurify = require("dompurify");
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const storage = new Storage();
@ -199,6 +204,35 @@ const sendCreateArticleMutation = async (userId, input) => {
return response.data.data.createArticle;
};
// Persist a fetched page for `userId` via the `savePage` GraphQL mutation.
// The input is tagged with source 'puppeteer-parse' so the backend can tell
// which fetch pipeline produced it, and the request is authenticated with a
// signed JWT sent as the `auth` cookie. Returns the raw `savePage` payload
// (a SaveSuccess or SaveError union member); callers inspect the shape.
// NOTE(review): like sendCreateArticleMutation above, this assumes
// response.data.data is present — a GraphQL transport error would throw here.
const sendSavePageMutation = async (userId, input) => {
  const data = JSON.stringify({
    query: `mutation SavePage ($input: SavePageInput!){
savePage(input:$input){
... on SaveSuccess{
url
clientRequestId
}
... on SaveError{
errorCodes
}
}
}`,
    variables: {
      // Shallow-merge so the caller-provided fields win except `source`.
      input: Object.assign({}, input , { source: 'puppeteer-parse' }),
    },
  });

  // Short-lived token scoped to this user; the backend reads it from the cookie.
  const auth = await signToken({ uid: userId }, process.env.JWT_SECRET);
  const response = await axios.post(`${process.env.REST_BACKEND_ENDPOINT}/graphql`, data,
  {
    headers: {
      Cookie: `auth=${auth};`,
      'Content-Type': 'application/json',
    },
  });
  return response.data.data.savePage;
};
const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId) => {
return sendCreateArticleMutation(userId, {
url: encodeURI(url),
@ -280,17 +314,14 @@ async function fetchContent(req, res) {
logRecord.fetchContentTime = Date.now() - functionStartTime;
const apiResponse = await sendCreateArticleMutation(userId, {
const readabilityResult = content ? (await getReadabilityResult(url, content)) : null;
const apiResponse = await sendSavePageMutation(userId, {
url: finalUrl,
articleSavingRequestId,
preparedDocument: {
document: content,
pageInfo: {
title,
canonicalUrl: finalUrl,
},
},
skipParsing: !content,
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
});
logRecord.totalTime = Date.now() - functionStartTime;
@ -306,17 +337,14 @@ async function fetchContent(req, res) {
const content = sbResult.domContent;
logRecord.fetchContentTime = Date.now() - functionStartTime;
const apiResponse = await sendCreateArticleMutation(userId, {
url: sbUrl,
articleSavingRequestId,
preparedDocument: {
document: content,
pageInfo: {
title: sbResult.title,
canonicalUrl: sbUrl,
},
},
skipParsing: !content,
const readabilityResult = content ? (await getReadabilityResult(url, content)) : null;
const apiResponse = await sendSavePageMutation(userId, {
url: finalUrl,
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
});
logRecord.totalTime = Date.now() - functionStartTime;
@ -758,6 +786,99 @@ async function preview(req, res) {
return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
}
// Shared DOMPurify options for sanitizing fetched page HTML.
// iframes are allowed through (plus the attributes they need to render) and
// are then vetted by the 'uponSanitizeElement' hook, which only keeps
// YouTube embeds. The forbidden data-* attributes look like mailing-list /
// tracking markers (data-ml*, data-xid, …) — presumably stripped to avoid
// leaking tracking state into saved content; verify against the providers.
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
function domPurifySanitizeHook(node, data) {
if (data.tagName === 'iframe') {
const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
const src = node.getAttribute('src') || ''
const dataSrc = node.getAttribute('data-src') || ''
if (src && urlRegex.test(src)) {
return
}
if (dataSrc && urlRegex.test(dataSrc)) {
node.setAttribute('src', dataSrc)
return
}
node.parentNode?.removeChild(node)
}
}
// Sanitize an HTML string with DOMPurify (configured via DOM_PURIFY_CONFIG,
// with the iframe-vetting hook installed) and return the cleaned markup
// re-parsed into a fresh Document.
function getPurifiedContent(html) {
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitized).document
}
// Produce the image-proxy signature for `url`: HMAC-SHA256 keyed with
// IMAGE_PROXY_SECRET, emitted as URL-safe base64.
function signImageProxyUrl(url) {
  const hmac = crypto.createHmac('sha256', process.env.IMAGE_PROXY_SECRET)
  const digest = hmac.update(url).digest()
  return encode(digest)
}
function createImageProxyUrl(url, width = 0, height = 0) {
if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) {
return url
}
const urlWithOptions = `${url}#${width}x${height}`
const signature = signImageProxyUrl(urlWithOptions)
return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
}
// Run Readability over the fetched document. First try the document as-is;
// if that throws or yields no article, retry on a DOMPurify-sanitized copy.
// Returns the parsed article, or null when both attempts come up empty.
async function getReadabilityResult(url, document) {
  // Candidate documents are produced lazily so the purify pass only runs
  // when the raw pass fails.
  const candidates = [() => document, () => getPurifiedContent(document)]

  for (const getCandidate of candidates) {
    const dom = getCandidate()
    if (!dom) {
      continue
    }
    try {
      const article = await new Readability(dom, {
        createImageProxyUrl,
        url,
      }).parse()
      if (article) {
        return article
      }
    } catch (error) {
      // Best-effort: log and fall through to the next candidate.
      console.log('parsing error for url', url, error)
    }
  }

  return null
}
module.exports = {
fetchContent,
preview,

View File

@ -6,14 +6,18 @@
"dependencies": {
"@google-cloud/storage": "^5.18.1",
"@omnivore/content-handler": "1.0.0",
"@omnivore/readability": "1.0.0",
"axios": "^0.27.2",
"crypto": "^1.0.1",
"dompurify": "^2.4.1",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9",
"puppeteer-core": "^16.1.0",
"puppeteer-extra": "^3.3.4",
"puppeteer-extra-plugin-adblocker": "^2.13.5",
"puppeteer-extra-plugin-stealth": "^2.11.1",
"underscore": "^1.13.4"
"underscore": "^1.13.4",
"urlsafe-base64": "^1.0.0"
},
"devDependencies": {
"chai": "^4.3.6",

View File

@ -2998,7 +2998,6 @@ Readability.prototype = {
siteIcon: metadata.siteIcon,
previewImage: metadata.previewImage,
publishedDate: metadata.publishedDate || publishedAt || this._articlePublishedDate,
dom: articleContent,
language: this._getLanguage(metadata.locale || this._languageCode),
};
}

View File

@ -12251,6 +12251,11 @@ crypto-random-string@^2.0.0:
resolved "https://registry.yarnpkg.com/crypto-random-string/-/crypto-random-string-2.0.0.tgz#ef2a7a966ec11083388369baa02ebead229b30d5"
integrity sha512-v1plID3y9r/lPhviJ1wrXpLeyUIGAZ2SHNYTEapm7/8A9nLPoyvVp3RK/EPFqn5kEznyWgYZNsRtYYIWbuG8KA==
crypto@^1.0.1:
version "1.0.1"
resolved "https://registry.yarnpkg.com/crypto/-/crypto-1.0.1.tgz#2af1b7cad8175d24c8a1b0778255794a21803037"
integrity sha512-VxBKmeNcqQdiUQUW2Tzq0t377b54N2bMtXO/qiLa+6eRRmmC4qT3D4OnTGoT/U6O9aklQ/jTwbOtRMTTY8G0Ig==
css-loader@^3.6.0:
version "3.6.0"
resolved "https://registry.yarnpkg.com/css-loader/-/css-loader-3.6.0.tgz#2e4b2c7e6e2d27f8c8f28f61bffcd2e6c91ef645"
@ -13055,6 +13060,11 @@ dompurify@^2.0.17:
resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.3.8.tgz#224fe9ae57d7ebd9a1ae1ac18c1c1ca3f532226f"
integrity sha512-eVhaWoVibIzqdGYjwsBWodIQIaXFSB+cKDf4cfxLMsK0xiud6SE+/WCVx/Xw/UwQsa4cS3T2eITcdtmTg2UKcw==
dompurify@^2.4.1:
version "2.4.1"
resolved "https://registry.yarnpkg.com/dompurify/-/dompurify-2.4.1.tgz#f9cb1a275fde9af6f2d0a2644ef648dd6847b631"
integrity sha512-ewwFzHzrrneRjxzmK6oVz/rZn9VWspGFRDb4/rRtIsM1n36t9AKma/ye8syCpcw+XJ25kOK/hOG7t1j2I2yBqA==
domutils@^2.0.0, domutils@^2.5.2:
version "2.7.0"
resolved "https://registry.yarnpkg.com/domutils/-/domutils-2.7.0.tgz#8ebaf0c41ebafcf55b0b72ec31c56323712c5442"
@ -26280,7 +26290,7 @@ url@^0.11.0:
urlsafe-base64@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/urlsafe-base64/-/urlsafe-base64-1.0.0.tgz#23f89069a6c62f46cf3a1d3b00169cefb90be0c6"
integrity sha1-I/iQaabGL0bPOh07ABac77kL4MY=
integrity sha512-RtuPeMy7c1UrHwproMZN9gN6kiZ0SvJwRaEzwZY0j9MypEkFqyBaKv176jvlPtg58Zh36bOkS0NFABXMHvvGCA==
use-callback-ref@^1.2.3:
version "1.2.5"