Run Readability in save-page instead of Puppeteer

This commit is contained in:
Hongbo Wu
2024-01-24 18:57:25 +08:00
parent 7567e00e51
commit 5e239d2568
6 changed files with 126 additions and 252 deletions

View File

@ -1,4 +1,3 @@
import { Readability } from '@omnivore/readability'
import axios from 'axios'
import jwt from 'jsonwebtoken'
import { promisify } from 'util'
@ -63,7 +62,6 @@ interface FetchResult {
title?: string
content?: string
contentType?: string
readabilityResult?: Readability.ParseResult
}
const isFetchResult = (obj: unknown): obj is FetchResult => {
@ -297,7 +295,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
// get the fetch result from cache
const fetchedResult = await getCachedFetchResult(url)
const { title, contentType, readabilityResult } = fetchedResult
const { title, contentType } = fetchedResult
let content = fetchedResult.content
// for pdf content, we need to upload the pdf
@ -350,7 +348,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
clientRequestId: articleSavingRequestId,
title,
originalContent: content,
parseResult: readabilityResult,
state: state ? (state as ArticleSavingRequestStatus) : undefined,
labels: labels,
rssFeedUrl,
@ -362,13 +359,11 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
user
)
// if (result.__typename == 'SaveError') {
// logger.error('Error saving page', { userId, url, result })
// throw new Error('Error saving page')
// }
if (result.__typename == 'SaveError') {
throw new Error(result.message || 'Failed to save page')
}
// if the readability result is not parsed, the import is failed
isImported = !!readabilityResult
isImported = true
isSaved = true
} catch (e) {
if (e instanceof Error) {

View File

@ -53,7 +53,6 @@ export const saveEmail = async (
// can leave this empty for now
},
},
null,
true
)

View File

@ -68,17 +68,13 @@ export const savePage = async (
input: SavePageInput,
user: User
): Promise<SaveResult> => {
const parseResult = await parsePreparedContent(
input.url,
{
document: input.originalContent,
pageInfo: {
title: input.title,
canonicalUrl: input.url,
},
const parseResult = await parsePreparedContent(input.url, {
document: input.originalContent,
pageInfo: {
title: input.title,
canonicalUrl: input.url,
},
input.parseResult
)
})
const [newSlug, croppedPathname] = createSlug(input.url, input.title)
let slug = newSlug
let clientRequestId = input.clientRequestId

View File

@ -223,7 +223,6 @@ const getReadabilityResult = async (
export const parsePreparedContent = async (
url: string,
preparedDocument: PreparedDocumentInput,
parseResult?: Readability.ParseResult | null,
isNewsletter?: boolean,
allowRetry = true
): Promise<ParsedContentPuppeteer> => {
@ -232,9 +231,6 @@ export const parsePreparedContent = async (
labels: { source: 'parsePreparedContent' },
}
// If we have a parse result, use it
let article = parseResult || null
let highlightData = undefined
const { document, pageInfo } = preparedDocument
if (!document) {
@ -262,137 +258,134 @@ export const parsePreparedContent = async (
}
}
let dom: Document | null = null
const { title: pageInfoTitle, canonicalUrl } = pageInfo
let parsedContent = null
let pageType = PageType.Unknown
let highlightData = undefined
try {
dom = parseHTML(document).document
const dom = parseHTML(document).document
pageType = parseOriginalContent(dom)
if (!article) {
// Attempt to parse the article
// preParse content
dom = (await preParseContent(url, dom)) || dom
// Run readability
await preParseContent(url, dom)
article = await getReadabilityResult(url, document, dom, isNewsletter)
parsedContent = await getReadabilityResult(url, document, dom, isNewsletter)
if (!parsedContent || !parsedContent.content) {
logger.info('No parsed content')
if (allowRetry) {
logger.info('Retrying with content wrapped in html body')
const newDocument = {
...preparedDocument,
document: '<html><body>' + document + '</body></html>', // wrap in body
}
return parsePreparedContent(url, newDocument, isNewsletter, false)
}
return {
canonicalUrl,
parsedContent,
domContent: document,
pageType,
}
}
if (!article?.textContent && allowRetry) {
const newDocument = {
...preparedDocument,
document: '<html><body>' + document + '</body></html>', // wrap in body
}
return parsePreparedContent(
url,
newDocument,
parseResult,
isNewsletter,
false
)
// use title if not found after running readability
if (!parsedContent.title && pageInfoTitle) {
parsedContent.title = pageInfoTitle
}
// Format code blocks
// TODO: we probably want to move this type of thing
// to the handlers, and have some concept of postHandle
if (article?.content) {
const articleDom = parseHTML(article.content).document
const codeBlocks = articleDom.querySelectorAll(
'code, pre[class^="prism-"], pre[class^="language-"]'
)
if (codeBlocks.length > 0) {
codeBlocks.forEach((e) => {
if (e.textContent) {
const att = hljs.highlightAuto(e.textContent)
const code = articleDom.createElement('code')
const langClass =
`hljs language-${att.language}` +
(att.second_best?.language
? ` language-${att.second_best?.language}`
: '')
code.setAttribute('class', langClass)
code.innerHTML = att.value
e.replaceWith(code)
}
})
article.content = articleDom.documentElement.outerHTML
}
highlightData = findEmbeddedHighlight(articleDom.documentElement)
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]
// Get the top level element?
const pageNode = articleDom.firstElementChild as HTMLElement
const nodesToVisitStack: [HTMLElement] = [pageNode]
const visitedNodeList = []
while (nodesToVisitStack.length > 0) {
const currentNode = nodesToVisitStack.pop()
if (
currentNode?.nodeType !== 1 ||
// Avoiding dynamic elements from being counted as anchor-allowed elements
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
currentNode.hasAttribute(attrib)
)
) {
continue
const newDom = parseHTML(parsedContent.content).document
const codeBlocks = newDom.querySelectorAll(
'code, pre[class^="prism-"], pre[class^="language-"]'
)
if (codeBlocks.length > 0) {
codeBlocks.forEach((e) => {
if (e.textContent) {
const att = hljs.highlightAuto(e.textContent)
const code = newDom.createElement('code')
const langClass =
`hljs language-${att.language}` +
(att.second_best?.language
? ` language-${att.second_best?.language}`
: '')
code.setAttribute('class', langClass)
code.innerHTML = att.value
e.replaceWith(code)
}
visitedNodeList.push(currentNode)
;[].slice
.call(currentNode.childNodes)
.reverse()
.forEach(function (node) {
nodesToVisitStack.push(node)
})
}
visitedNodeList.shift()
visitedNodeList.forEach((node, index) => {
// start from index 1, index 0 reserved for anchor unknown.
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})
article.content = articleDom.documentElement.outerHTML
}
highlightData = findEmbeddedHighlight(newDom.documentElement)
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
'omnivore-highlight-id',
'data-twitter-tweet-id',
'data-instagram-id',
]
// Get the top level element?
const pageNode = newDom.firstElementChild as HTMLElement
const nodesToVisitStack: [HTMLElement] = [pageNode]
const visitedNodeList = []
while (nodesToVisitStack.length > 0) {
const currentNode = nodesToVisitStack.pop()
if (
currentNode?.nodeType !== 1 ||
// Avoiding dynamic elements from being counted as anchor-allowed elements
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
currentNode.hasAttribute(attrib)
)
) {
continue
}
visitedNodeList.push(currentNode)
;[].slice
.call(currentNode.childNodes)
.reverse()
.forEach(function (node) {
nodesToVisitStack.push(node)
})
}
visitedNodeList.shift()
visitedNodeList.forEach((node, index) => {
// start from index 1, index 0 reserved for anchor unknown.
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
})
const newHtml = newDom.documentElement.outerHTML
const newWindow = parseHTML('')
const DOMPurify = createDOMPurify(newWindow)
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
const cleanHtml = DOMPurify.sanitize(newHtml, DOM_PURIFY_CONFIG)
parsedContent.content = cleanHtml
Object.assign(article || {}, {
content: clean,
title: article?.title,
previewImage: article?.previewImage,
siteName: article?.siteName,
siteIcon: article?.siteIcon,
byline: article?.byline,
language: article?.language,
})
logRecord.parseSuccess = true
} catch (error) {
logger.info('Error parsing content', error)
logger.error('Error parsing content', error)
Object.assign(logRecord, {
parseSuccess: false,
parseError: error,
})
}
const { title, canonicalUrl } = pageInfo
Object.assign(article || {}, {
title: article?.title || title,
})
logger.info('parse-article completed')
logger.info('parse-article completed', logRecord)
return {
domContent: document,
parsedContent: article,
canonicalUrl,
pageType: dom ? parseOriginalContent(dom) : PageType.Unknown,
parsedContent,
domContent: document,
pageType,
highlightData,
}
}

View File

@ -50,7 +50,6 @@ interface FetchResult {
title?: string
content?: string
contentType?: string
readabilityResult?: unknown
}
export const cacheFetchResult = async (fetchResult: FetchResult) => {

View File

@ -1,10 +1,7 @@
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
import { preHandleContent, preParseContent } from '@omnivore/content-handler'
import { Readability } from '@omnivore/readability'
import { preHandleContent } from '@omnivore/content-handler'
import axios from 'axios'
import crypto from 'crypto'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
// const { Storage } = require('@google-cloud/storage');
import { parseHTML } from 'linkedom'
import path from 'path'
@ -13,7 +10,6 @@ import puppeteer from 'puppeteer-extra'
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
import Url from 'url'
import { encode } from 'urlsafe-base64'
// Add stealth plugin to hide puppeteer usage
puppeteer.use(StealthPlugin())
@ -28,12 +24,12 @@ puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
// const filePath = `${os.tmpdir()}/previewImage.png`
const MOBILE_USER_AGENT =
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
// const MOBILE_USER_AGENT =
// 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// const BOT_DESKTOP_USER_AGENT =
// 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT =
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
@ -156,8 +152,8 @@ export const fetchContent = async (
page: Page | undefined,
title: string | undefined,
content: string | undefined,
contentType: string | undefined,
readabilityResult: Readability.ParseResult | null | undefined
contentType: string | undefined
try {
url = getUrl(url)
if (!url) {
@ -230,34 +226,26 @@ export const fetchContent = async (
console.info('fallback to scrapingbee', url)
const sbResult = await fetchContentWithScrapingBee(url)
content = sbResult.domContent
title = sbResult.title
} else {
throw e
return {
finalUrl: url,
title: sbResult.title,
content: sbResult.domContent,
contentType,
}
}
throw e
} finally {
// close browser context if it was opened
if (context) {
await context.close()
}
// save non pdf content
if (url && contentType !== 'application/pdf') {
// parse content if it is not empty
if (content) {
let document = parseHTML(content).document
// preParse content
const preParsedDom = await preParseContent(url, document)
if (preParsedDom) {
document = preParsedDom
}
readabilityResult = await getReadabilityResult(url, document)
}
}
console.info(`content-fetch result`, logRecord)
}
return { finalUrl: url, title, content, readabilityResult, contentType }
return { finalUrl: url, title, content, contentType }
}
function validateUrlString(url: string) {
@ -741,99 +729,3 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
// console.info(`preview-image`, logRecord);
// return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
// }
// Shared DOMPurify options for sanitizing fetched article HTML.
// - iframes are explicitly allowed so video embeds can survive sanitization
//   (non-whitelisted iframes are removed by domPurifySanitizeHook instead)
// - ADD_ATTR keeps the attributes those embeds need to render
// - FORBID_ATTR strips dynamic-content / tracking data-* attributes
//   (presumably injected by email-marketing tooling — TODO confirm)
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
/**
 * DOMPurify `uponSanitizeElement` hook: only YouTube embed iframes survive
 * sanitization; every other iframe is removed. Non-iframe elements are left
 * untouched.
 */
function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
  if (data.tagName !== 'iframe') {
    return
  }
  // Matches http(s) and protocol-relative YouTube embed URLs, including the
  // privacy-enhanced youtube-nocookie.com host
  const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
  const src = node.getAttribute('src') || ''
  const dataSrc = node.getAttribute('data-src') || ''
  if (urlRegex.test(src)) {
    return
  }
  if (urlRegex.test(dataSrc)) {
    // Lazy-loaded embed: promote data-src to src so the iframe still renders
    node.setAttribute('src', dataSrc)
    return
  }
  // Anything else embedded via iframe is dropped entirely
  node.parentNode?.removeChild(node)
}
/**
 * Sanitize a document through DOMPurify (with the iframe whitelist hook
 * installed) and re-parse the cleaned markup into a fresh Document.
 */
function getPurifiedContent(html: Document) {
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitized).document
}
/**
 * Sign a proxy payload with HMAC-SHA256 keyed by IMAGE_PROXY_SECRET and
 * return the raw digest encoded as URL-safe base64.
 */
function signImageProxyUrl(url: string) {
  const secret = process.env.IMAGE_PROXY_SECRET || ''
  const digest = crypto.createHmac('sha256', secret).update(url).digest()
  return encode(digest)
}
/**
 * Build a signed image-proxy URL of the form
 * `<proxy>/<width>x<height>,s<signature>/<original-url>`.
 * Returns the original URL unchanged when the proxy is not configured.
 */
function createImageProxyUrl(url: string, width = 0, height = 0) {
  const proxy = process.env.IMAGE_PROXY_URL
  const secret = process.env.IMAGE_PROXY_SECRET
  if (!proxy || !secret) {
    // Proxy not configured — serve the image directly
    return url
  }
  // The requested dimensions are folded into the signed payload via a fragment
  const signature = signImageProxyUrl(`${url}#${width}x${height}`)
  return `${proxy}/${width}x${height},s${signature}/${url}`
}
/**
 * Run Readability over the document and return the parsed article.
 *
 * Parses the raw document first; only if that attempt yields nothing does it
 * fall back to a DOMPurify-cleaned copy (built lazily, so purification cost
 * is paid only on the retry). Parse errors are logged and treated as a miss.
 * Returns null when neither attempt produces an article.
 */
async function getReadabilityResult(url: string, document: Document) {
  // Thunks keep the purified copy from being built unless it is needed
  const candidateSources = [
    () => document,
    () => getPurifiedContent(document),
  ]
  for (const makeCandidate of candidateSources) {
    const candidate = makeCandidate()
    if (!candidate) {
      continue
    }
    try {
      const article = await new Readability(candidate, {
        createImageProxyUrl,
        url,
      }).parse()
      if (article) {
        return article
      }
    } catch (error) {
      // A failed candidate is not fatal — log and try the next source
      console.log('parsing error for url', url, error)
    }
  }
  return null
}