From 5e239d2568f82d644cce301cc43c27dd40de61e4 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 24 Jan 2024 18:57:25 +0800 Subject: [PATCH] run readability in save-page instead of puppeteer --- packages/api/src/jobs/save_page.ts | 15 +- packages/api/src/services/save_email.ts | 1 - packages/api/src/services/save_page.ts | 16 +- packages/api/src/utils/parser.ts | 203 +++++++++--------- packages/content-fetch/src/request_handler.ts | 1 - packages/puppeteer-parse/src/index.ts | 142 ++---------- 6 files changed, 126 insertions(+), 252 deletions(-) diff --git a/packages/api/src/jobs/save_page.ts b/packages/api/src/jobs/save_page.ts index 08f650f38..3d143caf8 100644 --- a/packages/api/src/jobs/save_page.ts +++ b/packages/api/src/jobs/save_page.ts @@ -1,4 +1,3 @@ -import { Readability } from '@omnivore/readability' import axios from 'axios' import jwt from 'jsonwebtoken' import { promisify } from 'util' @@ -63,7 +62,6 @@ interface FetchResult { title?: string content?: string contentType?: string - readabilityResult?: Readability.ParseResult } const isFetchResult = (obj: unknown): obj is FetchResult => { @@ -297,7 +295,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => { // get the fetch result from cache const fetchedResult = await getCachedFetchResult(url) - const { title, contentType, readabilityResult } = fetchedResult + const { title, contentType } = fetchedResult let content = fetchedResult.content // for pdf content, we need to upload the pdf @@ -350,7 +348,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => { clientRequestId: articleSavingRequestId, title, originalContent: content, - parseResult: readabilityResult, state: state ? (state as ArticleSavingRequestStatus) : undefined, labels: labels, rssFeedUrl, @@ -362,13 +359,11 @@ export const savePageJob = async (data: Data, attemptsMade: number) => { user ) - // if (result.__typename == 'SaveError') { - // logger.error('Error saving page', { userId, url, result }) - // throw new Error('Error saving page') - // } + if (result.__typename == 'SaveError') { + throw new Error(result.message || 'Failed to save page') + } - // if the readability result is not parsed, the import is failed - isImported = !!readabilityResult + isImported = true isSaved = true } catch (e) { if (e instanceof Error) { diff --git a/packages/api/src/services/save_email.ts b/packages/api/src/services/save_email.ts index 092c2b98f..0c0f4f456 100644 --- a/packages/api/src/services/save_email.ts +++ b/packages/api/src/services/save_email.ts @@ -53,7 +53,6 @@ export const saveEmail = async ( // can leave this empty for now }, }, - null, true ) diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts index a91f90549..9d55961e7 100644 --- a/packages/api/src/services/save_page.ts +++ b/packages/api/src/services/save_page.ts @@ -68,17 +68,13 @@ export const savePage = async ( input: SavePageInput, user: User ): Promise => { - const parseResult = await parsePreparedContent( - input.url, - { - document: input.originalContent, - pageInfo: { - title: input.title, - canonicalUrl: input.url, - }, + const parseResult = await parsePreparedContent(input.url, { + document: input.originalContent, + pageInfo: { + title: input.title, + canonicalUrl: input.url, }, - input.parseResult - ) + }) const [newSlug, croppedPathname] = createSlug(input.url, input.title) let slug = newSlug let clientRequestId = input.clientRequestId diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 54373baea..ae57ee6d0 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -223,7 +223,6 @@ const getReadabilityResult = async ( export const parsePreparedContent = async ( url: string, preparedDocument: PreparedDocumentInput, - parseResult?: Readability.ParseResult | null, isNewsletter?: boolean, allowRetry = true ): Promise => { @@ -232,9 +231,6 @@ export const parsePreparedContent = async ( labels: { source: 'parsePreparedContent' }, } - // If we have a parse result, use it - let article = parseResult || null - let highlightData = undefined const { document, pageInfo } = preparedDocument if (!document) { @@ -262,137 +258,134 @@ export const parsePreparedContent = async ( } } - let dom: Document | null = null + const { title: pageInfoTitle, canonicalUrl } = pageInfo + + let parsedContent = null + let pageType = PageType.Unknown + let highlightData = undefined try { - dom = parseHTML(document).document + const dom = parseHTML(document).document + pageType = parseOriginalContent(dom) - if (!article) { - // Attempt to parse the article - // preParse content - dom = (await preParseContent(url, dom)) || dom + // Run readability + await preParseContent(url, dom) - article = await getReadabilityResult(url, document, dom, isNewsletter) + parsedContent = await getReadabilityResult(url, document, dom, isNewsletter) + + if (!parsedContent || !parsedContent.content) { + logger.info('No parsed content') + + if (allowRetry) { + logger.info('Retrying with content wrapped in html body') + + const newDocument = { + ...preparedDocument, + document: '' + document + '', // wrap in body + } + return parsePreparedContent(url, newDocument, isNewsletter, false) + } + + return { + canonicalUrl, + parsedContent, + domContent: document, + pageType, + } } - if (!article?.textContent && allowRetry) { - const newDocument = { - ...preparedDocument, - document: '' + document + '', // wrap in body - } - return parsePreparedContent( - url, - newDocument, - parseResult, - isNewsletter, - false - ) + // use title if not found after running readability + if (!parsedContent.title && pageInfoTitle) { + parsedContent.title = pageInfoTitle } // Format code blocks // TODO: we probably want to move this type of thing // to the handlers, and have some concept of postHandle - if (article?.content) { - const articleDom = parseHTML(article.content).document - const codeBlocks = articleDom.querySelectorAll( - 'code, pre[class^="prism-"], pre[class^="language-"]' - ) - if (codeBlocks.length > 0) { - codeBlocks.forEach((e) => { - if (e.textContent) { - const att = hljs.highlightAuto(e.textContent) - const code = articleDom.createElement('code') - const langClass = - `hljs language-${att.language}` + - (att.second_best?.language - ? ` language-${att.second_best?.language}` - : '') - code.setAttribute('class', langClass) - code.innerHTML = att.value - e.replaceWith(code) - } - }) - article.content = articleDom.documentElement.outerHTML - } - - highlightData = findEmbeddedHighlight(articleDom.documentElement) - - const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [ - 'omnivore-highlight-id', - 'data-twitter-tweet-id', - 'data-instagram-id', - ] - - // Get the top level element? - const pageNode = articleDom.firstElementChild as HTMLElement - const nodesToVisitStack: [HTMLElement] = [pageNode] - const visitedNodeList = [] - - while (nodesToVisitStack.length > 0) { - const currentNode = nodesToVisitStack.pop() - if ( - currentNode?.nodeType !== 1 || - // Avoiding dynamic elements from being counted as anchor-allowed elements - ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) => - currentNode.hasAttribute(attrib) - ) - ) { - continue + const newDom = parseHTML(parsedContent.content).document + const codeBlocks = newDom.querySelectorAll( + 'code, pre[class^="prism-"], pre[class^="language-"]' + ) + if (codeBlocks.length > 0) { + codeBlocks.forEach((e) => { + if (e.textContent) { + const att = hljs.highlightAuto(e.textContent) + const code = newDom.createElement('code') + const langClass = + `hljs language-${att.language}` + + (att.second_best?.language + ? ` language-${att.second_best?.language}` + : '') + code.setAttribute('class', langClass) + code.innerHTML = att.value + e.replaceWith(code) } - visitedNodeList.push(currentNode) - ;[].slice - .call(currentNode.childNodes) - .reverse() - .forEach(function (node) { - nodesToVisitStack.push(node) - }) - } - - visitedNodeList.shift() - visitedNodeList.forEach((node, index) => { - // start from index 1, index 0 reserved for anchor unknown. - node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString()) }) - - article.content = articleDom.documentElement.outerHTML } + highlightData = findEmbeddedHighlight(newDom.documentElement) + + const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [ + 'omnivore-highlight-id', + 'data-twitter-tweet-id', + 'data-instagram-id', + ] + + // Get the top level element? + const pageNode = newDom.firstElementChild as HTMLElement + const nodesToVisitStack: [HTMLElement] = [pageNode] + const visitedNodeList = [] + + while (nodesToVisitStack.length > 0) { + const currentNode = nodesToVisitStack.pop() + if ( + currentNode?.nodeType !== 1 || + // Avoiding dynamic elements from being counted as anchor-allowed elements + ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) => + currentNode.hasAttribute(attrib) + ) + ) { + continue + } + visitedNodeList.push(currentNode) + ;[].slice + .call(currentNode.childNodes) + .reverse() + .forEach(function (node) { + nodesToVisitStack.push(node) + }) + } + + visitedNodeList.shift() + visitedNodeList.forEach((node, index) => { + // start from index 1, index 0 reserved for anchor unknown. + node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString()) + }) + + const newHtml = newDom.documentElement.outerHTML const newWindow = parseHTML('') const DOMPurify = createDOMPurify(newWindow) DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook) - const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG) + const cleanHtml = DOMPurify.sanitize(newHtml, DOM_PURIFY_CONFIG) + parsedContent.content = cleanHtml - Object.assign(article || {}, { - content: clean, - title: article?.title, - previewImage: article?.previewImage, - siteName: article?.siteName, - siteIcon: article?.siteIcon, - byline: article?.byline, - language: article?.language, - }) logRecord.parseSuccess = true } catch (error) { - logger.info('Error parsing content', error) + logger.error('Error parsing content', error) + Object.assign(logRecord, { parseSuccess: false, parseError: error, }) } - const { title, canonicalUrl } = pageInfo - - Object.assign(article || {}, { - title: article?.title || title, - }) - - logger.info('parse-article completed') + logger.info('parse-article completed', logRecord) return { - domContent: document, - parsedContent: article, canonicalUrl, - pageType: dom ? parseOriginalContent(dom) : PageType.Unknown, + parsedContent, + domContent: document, + pageType, highlightData, } } diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts index 0106d8058..56356ec1e 100644 --- a/packages/content-fetch/src/request_handler.ts +++ b/packages/content-fetch/src/request_handler.ts @@ -50,7 +50,6 @@ interface FetchResult { title?: string content?: string contentType?: string - readabilityResult?: unknown } export const cacheFetchResult = async (fetchResult: FetchResult) => { diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index 2629689c2..a02a3595b 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -1,10 +1,7 @@ /* eslint-disable @typescript-eslint/no-unsafe-member-access */ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ -import { preHandleContent, preParseContent } from '@omnivore/content-handler' -import { Readability } from '@omnivore/readability' +import { preHandleContent } from '@omnivore/content-handler' import axios from 'axios' -import crypto from 'crypto' -import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify' // const { Storage } = require('@google-cloud/storage'); import { parseHTML } from 'linkedom' import path from 'path' @@ -13,7 +10,6 @@ import puppeteer from 'puppeteer-extra' import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker' import StealthPlugin from 'puppeteer-extra-plugin-stealth' import Url from 'url' -import { encode } from 'urlsafe-base64' // Add stealth plugin to hide puppeteer usage puppeteer.use(StealthPlugin()) @@ -28,12 +24,12 @@ puppeteer.use(AdblockerPlugin({ blockTrackers: true })) // const filePath = `${os.tmpdir()}/previewImage.png` -const MOBILE_USER_AGENT = - 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' +// const MOBILE_USER_AGENT = +// 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' -const BOT_DESKTOP_USER_AGENT = - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' +// const BOT_DESKTOP_USER_AGENT = +// 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com'] @@ -156,8 +152,8 @@ export const fetchContent = async ( page: Page | undefined, title: string | undefined, content: string | undefined, - contentType: string | undefined, - readabilityResult: Readability.ParseResult | null | undefined + contentType: string | undefined + try { url = getUrl(url) if (!url) { @@ -230,34 +226,26 @@ export const fetchContent = async ( console.info('fallback to scrapingbee', url) const sbResult = await fetchContentWithScrapingBee(url) - content = sbResult.domContent - title = sbResult.title - } else { - throw e + + return { + finalUrl: url, + title: sbResult.title, + content: sbResult.domContent, + contentType, + } } + + throw e } finally { // close browser context if it was opened if (context) { await context.close() } - // save non pdf content - if (url && contentType !== 'application/pdf') { - // parse content if it is not empty - if (content) { - let document = parseHTML(content).document - // preParse content - const preParsedDom = await preParseContent(url, document) - if (preParsedDom) { - document = preParsedDom - } - readabilityResult = await getReadabilityResult(url, document) - } - } console.info(`content-fetch result`, logRecord) } - return { finalUrl: url, title, content, readabilityResult, contentType } + return { finalUrl: url, title, content, contentType } } function validateUrlString(url: string) { @@ -741,99 +729,3 @@ async function retrieveHtml(page: Page, logRecord: Record) { // console.info(`preview-image`, logRecord); // return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`); // } - -const DOM_PURIFY_CONFIG = { - ADD_TAGS: ['iframe'], - ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'], - FORBID_ATTR: [ - 'data-ml-dynamic', - 'data-ml-dynamic-type', - 'data-orig-url', - 'data-ml-id', - 'data-ml', - 'data-xid', - 'data-feature', - ], -} - -function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) { - if (data.tagName === 'iframe') { - const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i - const src = node.getAttribute('src') || '' - const dataSrc = node.getAttribute('data-src') || '' - - if (src && urlRegex.test(src)) { - return - } - - if (dataSrc && urlRegex.test(dataSrc)) { - node.setAttribute('src', dataSrc) - return - } - - node.parentNode?.removeChild(node) - } -} - -function getPurifiedContent(html: Document) { - const newWindow = parseHTML('') - const DOMPurify = createDOMPurify(newWindow) - DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook) - const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG) - return parseHTML(clean).document -} - -function signImageProxyUrl(url: string) { - return encode( - crypto - .createHmac('sha256', process.env.IMAGE_PROXY_SECRET || '') - .update(url) - .digest() - ) -} - -function createImageProxyUrl(url: string, width = 0, height = 0) { - if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) { - return url - } - - const urlWithOptions = `${url}#${width}x${height}` - const signature = signImageProxyUrl(urlWithOptions) - - return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}` -} - -async function getReadabilityResult(url: string, document: Document) { - // First attempt to read the article as is. - // if that fails attempt to purify then read - const sources = [ - () => { - return document - }, - () => { - return getPurifiedContent(document) - }, - ] - - for (const source of sources) { - const document = source() - if (!document) { - continue - } - - try { - const article = await new Readability(document, { - createImageProxyUrl, - url, - }).parse() - - if (article) { - return article - } - } catch (error) { - console.log('parsing error for url', url, error) - } - } - - return null -}