run readability in save-page instead of puppeteer
This commit is contained in:
@ -1,4 +1,3 @@
|
||||
import { Readability } from '@omnivore/readability'
|
||||
import axios from 'axios'
|
||||
import jwt from 'jsonwebtoken'
|
||||
import { promisify } from 'util'
|
||||
@ -63,7 +62,6 @@ interface FetchResult {
|
||||
title?: string
|
||||
content?: string
|
||||
contentType?: string
|
||||
readabilityResult?: Readability.ParseResult
|
||||
}
|
||||
|
||||
const isFetchResult = (obj: unknown): obj is FetchResult => {
|
||||
@ -297,7 +295,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
|
||||
|
||||
// get the fetch result from cache
|
||||
const fetchedResult = await getCachedFetchResult(url)
|
||||
const { title, contentType, readabilityResult } = fetchedResult
|
||||
const { title, contentType } = fetchedResult
|
||||
let content = fetchedResult.content
|
||||
|
||||
// for pdf content, we need to upload the pdf
|
||||
@ -350,7 +348,6 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
|
||||
clientRequestId: articleSavingRequestId,
|
||||
title,
|
||||
originalContent: content,
|
||||
parseResult: readabilityResult,
|
||||
state: state ? (state as ArticleSavingRequestStatus) : undefined,
|
||||
labels: labels,
|
||||
rssFeedUrl,
|
||||
@ -362,13 +359,11 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
|
||||
user
|
||||
)
|
||||
|
||||
// if (result.__typename == 'SaveError') {
|
||||
// logger.error('Error saving page', { userId, url, result })
|
||||
// throw new Error('Error saving page')
|
||||
// }
|
||||
if (result.__typename == 'SaveError') {
|
||||
throw new Error(result.message || 'Failed to save page')
|
||||
}
|
||||
|
||||
// if the readability result is not parsed, the import is failed
|
||||
isImported = !!readabilityResult
|
||||
isImported = true
|
||||
isSaved = true
|
||||
} catch (e) {
|
||||
if (e instanceof Error) {
|
||||
|
||||
@ -53,7 +53,6 @@ export const saveEmail = async (
|
||||
// can leave this empty for now
|
||||
},
|
||||
},
|
||||
null,
|
||||
true
|
||||
)
|
||||
|
||||
|
||||
@ -68,17 +68,13 @@ export const savePage = async (
|
||||
input: SavePageInput,
|
||||
user: User
|
||||
): Promise<SaveResult> => {
|
||||
const parseResult = await parsePreparedContent(
|
||||
input.url,
|
||||
{
|
||||
document: input.originalContent,
|
||||
pageInfo: {
|
||||
title: input.title,
|
||||
canonicalUrl: input.url,
|
||||
},
|
||||
const parseResult = await parsePreparedContent(input.url, {
|
||||
document: input.originalContent,
|
||||
pageInfo: {
|
||||
title: input.title,
|
||||
canonicalUrl: input.url,
|
||||
},
|
||||
input.parseResult
|
||||
)
|
||||
})
|
||||
const [newSlug, croppedPathname] = createSlug(input.url, input.title)
|
||||
let slug = newSlug
|
||||
let clientRequestId = input.clientRequestId
|
||||
|
||||
@ -223,7 +223,6 @@ const getReadabilityResult = async (
|
||||
export const parsePreparedContent = async (
|
||||
url: string,
|
||||
preparedDocument: PreparedDocumentInput,
|
||||
parseResult?: Readability.ParseResult | null,
|
||||
isNewsletter?: boolean,
|
||||
allowRetry = true
|
||||
): Promise<ParsedContentPuppeteer> => {
|
||||
@ -232,9 +231,6 @@ export const parsePreparedContent = async (
|
||||
labels: { source: 'parsePreparedContent' },
|
||||
}
|
||||
|
||||
// If we have a parse result, use it
|
||||
let article = parseResult || null
|
||||
let highlightData = undefined
|
||||
const { document, pageInfo } = preparedDocument
|
||||
|
||||
if (!document) {
|
||||
@ -262,137 +258,134 @@ export const parsePreparedContent = async (
|
||||
}
|
||||
}
|
||||
|
||||
let dom: Document | null = null
|
||||
const { title: pageInfoTitle, canonicalUrl } = pageInfo
|
||||
|
||||
let parsedContent = null
|
||||
let pageType = PageType.Unknown
|
||||
let highlightData = undefined
|
||||
|
||||
try {
|
||||
dom = parseHTML(document).document
|
||||
const dom = parseHTML(document).document
|
||||
pageType = parseOriginalContent(dom)
|
||||
|
||||
if (!article) {
|
||||
// Attempt to parse the article
|
||||
// preParse content
|
||||
dom = (await preParseContent(url, dom)) || dom
|
||||
// Run readability
|
||||
await preParseContent(url, dom)
|
||||
|
||||
article = await getReadabilityResult(url, document, dom, isNewsletter)
|
||||
parsedContent = await getReadabilityResult(url, document, dom, isNewsletter)
|
||||
|
||||
if (!parsedContent || !parsedContent.content) {
|
||||
logger.info('No parsed content')
|
||||
|
||||
if (allowRetry) {
|
||||
logger.info('Retrying with content wrapped in html body')
|
||||
|
||||
const newDocument = {
|
||||
...preparedDocument,
|
||||
document: '<html><body>' + document + '</body></html>', // wrap in body
|
||||
}
|
||||
return parsePreparedContent(url, newDocument, isNewsletter, false)
|
||||
}
|
||||
|
||||
return {
|
||||
canonicalUrl,
|
||||
parsedContent,
|
||||
domContent: document,
|
||||
pageType,
|
||||
}
|
||||
}
|
||||
|
||||
if (!article?.textContent && allowRetry) {
|
||||
const newDocument = {
|
||||
...preparedDocument,
|
||||
document: '<html><body>' + document + '</body></html>', // wrap in body
|
||||
}
|
||||
return parsePreparedContent(
|
||||
url,
|
||||
newDocument,
|
||||
parseResult,
|
||||
isNewsletter,
|
||||
false
|
||||
)
|
||||
// use title if not found after running readability
|
||||
if (!parsedContent.title && pageInfoTitle) {
|
||||
parsedContent.title = pageInfoTitle
|
||||
}
|
||||
|
||||
// Format code blocks
|
||||
// TODO: we probably want to move this type of thing
|
||||
// to the handlers, and have some concept of postHandle
|
||||
if (article?.content) {
|
||||
const articleDom = parseHTML(article.content).document
|
||||
const codeBlocks = articleDom.querySelectorAll(
|
||||
'code, pre[class^="prism-"], pre[class^="language-"]'
|
||||
)
|
||||
if (codeBlocks.length > 0) {
|
||||
codeBlocks.forEach((e) => {
|
||||
if (e.textContent) {
|
||||
const att = hljs.highlightAuto(e.textContent)
|
||||
const code = articleDom.createElement('code')
|
||||
const langClass =
|
||||
`hljs language-${att.language}` +
|
||||
(att.second_best?.language
|
||||
? ` language-${att.second_best?.language}`
|
||||
: '')
|
||||
code.setAttribute('class', langClass)
|
||||
code.innerHTML = att.value
|
||||
e.replaceWith(code)
|
||||
}
|
||||
})
|
||||
article.content = articleDom.documentElement.outerHTML
|
||||
}
|
||||
|
||||
highlightData = findEmbeddedHighlight(articleDom.documentElement)
|
||||
|
||||
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
|
||||
'omnivore-highlight-id',
|
||||
'data-twitter-tweet-id',
|
||||
'data-instagram-id',
|
||||
]
|
||||
|
||||
// Get the top level element?
|
||||
const pageNode = articleDom.firstElementChild as HTMLElement
|
||||
const nodesToVisitStack: [HTMLElement] = [pageNode]
|
||||
const visitedNodeList = []
|
||||
|
||||
while (nodesToVisitStack.length > 0) {
|
||||
const currentNode = nodesToVisitStack.pop()
|
||||
if (
|
||||
currentNode?.nodeType !== 1 ||
|
||||
// Avoiding dynamic elements from being counted as anchor-allowed elements
|
||||
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
|
||||
currentNode.hasAttribute(attrib)
|
||||
)
|
||||
) {
|
||||
continue
|
||||
const newDom = parseHTML(parsedContent.content).document
|
||||
const codeBlocks = newDom.querySelectorAll(
|
||||
'code, pre[class^="prism-"], pre[class^="language-"]'
|
||||
)
|
||||
if (codeBlocks.length > 0) {
|
||||
codeBlocks.forEach((e) => {
|
||||
if (e.textContent) {
|
||||
const att = hljs.highlightAuto(e.textContent)
|
||||
const code = newDom.createElement('code')
|
||||
const langClass =
|
||||
`hljs language-${att.language}` +
|
||||
(att.second_best?.language
|
||||
? ` language-${att.second_best?.language}`
|
||||
: '')
|
||||
code.setAttribute('class', langClass)
|
||||
code.innerHTML = att.value
|
||||
e.replaceWith(code)
|
||||
}
|
||||
visitedNodeList.push(currentNode)
|
||||
;[].slice
|
||||
.call(currentNode.childNodes)
|
||||
.reverse()
|
||||
.forEach(function (node) {
|
||||
nodesToVisitStack.push(node)
|
||||
})
|
||||
}
|
||||
|
||||
visitedNodeList.shift()
|
||||
visitedNodeList.forEach((node, index) => {
|
||||
// start from index 1, index 0 reserved for anchor unknown.
|
||||
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
|
||||
})
|
||||
|
||||
article.content = articleDom.documentElement.outerHTML
|
||||
}
|
||||
|
||||
highlightData = findEmbeddedHighlight(newDom.documentElement)
|
||||
|
||||
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
|
||||
'omnivore-highlight-id',
|
||||
'data-twitter-tweet-id',
|
||||
'data-instagram-id',
|
||||
]
|
||||
|
||||
// Get the top level element?
|
||||
const pageNode = newDom.firstElementChild as HTMLElement
|
||||
const nodesToVisitStack: [HTMLElement] = [pageNode]
|
||||
const visitedNodeList = []
|
||||
|
||||
while (nodesToVisitStack.length > 0) {
|
||||
const currentNode = nodesToVisitStack.pop()
|
||||
if (
|
||||
currentNode?.nodeType !== 1 ||
|
||||
// Avoiding dynamic elements from being counted as anchor-allowed elements
|
||||
ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
|
||||
currentNode.hasAttribute(attrib)
|
||||
)
|
||||
) {
|
||||
continue
|
||||
}
|
||||
visitedNodeList.push(currentNode)
|
||||
;[].slice
|
||||
.call(currentNode.childNodes)
|
||||
.reverse()
|
||||
.forEach(function (node) {
|
||||
nodesToVisitStack.push(node)
|
||||
})
|
||||
}
|
||||
|
||||
visitedNodeList.shift()
|
||||
visitedNodeList.forEach((node, index) => {
|
||||
// start from index 1, index 0 reserved for anchor unknown.
|
||||
node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
|
||||
})
|
||||
|
||||
const newHtml = newDom.documentElement.outerHTML
|
||||
const newWindow = parseHTML('')
|
||||
const DOMPurify = createDOMPurify(newWindow)
|
||||
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
|
||||
const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
|
||||
const cleanHtml = DOMPurify.sanitize(newHtml, DOM_PURIFY_CONFIG)
|
||||
parsedContent.content = cleanHtml
|
||||
|
||||
Object.assign(article || {}, {
|
||||
content: clean,
|
||||
title: article?.title,
|
||||
previewImage: article?.previewImage,
|
||||
siteName: article?.siteName,
|
||||
siteIcon: article?.siteIcon,
|
||||
byline: article?.byline,
|
||||
language: article?.language,
|
||||
})
|
||||
logRecord.parseSuccess = true
|
||||
} catch (error) {
|
||||
logger.info('Error parsing content', error)
|
||||
logger.error('Error parsing content', error)
|
||||
|
||||
Object.assign(logRecord, {
|
||||
parseSuccess: false,
|
||||
parseError: error,
|
||||
})
|
||||
}
|
||||
|
||||
const { title, canonicalUrl } = pageInfo
|
||||
|
||||
Object.assign(article || {}, {
|
||||
title: article?.title || title,
|
||||
})
|
||||
|
||||
logger.info('parse-article completed')
|
||||
logger.info('parse-article completed', logRecord)
|
||||
|
||||
return {
|
||||
domContent: document,
|
||||
parsedContent: article,
|
||||
canonicalUrl,
|
||||
pageType: dom ? parseOriginalContent(dom) : PageType.Unknown,
|
||||
parsedContent,
|
||||
domContent: document,
|
||||
pageType,
|
||||
highlightData,
|
||||
}
|
||||
}
|
||||
|
||||
@ -50,7 +50,6 @@ interface FetchResult {
|
||||
title?: string
|
||||
content?: string
|
||||
contentType?: string
|
||||
readabilityResult?: unknown
|
||||
}
|
||||
|
||||
export const cacheFetchResult = async (fetchResult: FetchResult) => {
|
||||
|
||||
@ -1,10 +1,7 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
import { preHandleContent, preParseContent } from '@omnivore/content-handler'
|
||||
import { Readability } from '@omnivore/readability'
|
||||
import { preHandleContent } from '@omnivore/content-handler'
|
||||
import axios from 'axios'
|
||||
import crypto from 'crypto'
|
||||
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
|
||||
// const { Storage } = require('@google-cloud/storage');
|
||||
import { parseHTML } from 'linkedom'
|
||||
import path from 'path'
|
||||
@ -13,7 +10,6 @@ import puppeteer from 'puppeteer-extra'
|
||||
import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'
|
||||
import StealthPlugin from 'puppeteer-extra-plugin-stealth'
|
||||
import Url from 'url'
|
||||
import { encode } from 'urlsafe-base64'
|
||||
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
puppeteer.use(StealthPlugin())
|
||||
@ -28,12 +24,12 @@ puppeteer.use(AdblockerPlugin({ blockTrackers: true }))
|
||||
|
||||
// const filePath = `${os.tmpdir()}/previewImage.png`
|
||||
|
||||
const MOBILE_USER_AGENT =
|
||||
'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
|
||||
// const MOBILE_USER_AGENT =
|
||||
// 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
|
||||
const DESKTOP_USER_AGENT =
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
const BOT_DESKTOP_USER_AGENT =
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
// const BOT_DESKTOP_USER_AGENT =
|
||||
// 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
const NON_BOT_DESKTOP_USER_AGENT =
|
||||
'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
|
||||
@ -156,8 +152,8 @@ export const fetchContent = async (
|
||||
page: Page | undefined,
|
||||
title: string | undefined,
|
||||
content: string | undefined,
|
||||
contentType: string | undefined,
|
||||
readabilityResult: Readability.ParseResult | null | undefined
|
||||
contentType: string | undefined
|
||||
|
||||
try {
|
||||
url = getUrl(url)
|
||||
if (!url) {
|
||||
@ -230,34 +226,26 @@ export const fetchContent = async (
|
||||
console.info('fallback to scrapingbee', url)
|
||||
|
||||
const sbResult = await fetchContentWithScrapingBee(url)
|
||||
content = sbResult.domContent
|
||||
title = sbResult.title
|
||||
} else {
|
||||
throw e
|
||||
|
||||
return {
|
||||
finalUrl: url,
|
||||
title: sbResult.title,
|
||||
content: sbResult.domContent,
|
||||
contentType,
|
||||
}
|
||||
}
|
||||
|
||||
throw e
|
||||
} finally {
|
||||
// close browser context if it was opened
|
||||
if (context) {
|
||||
await context.close()
|
||||
}
|
||||
// save non pdf content
|
||||
if (url && contentType !== 'application/pdf') {
|
||||
// parse content if it is not empty
|
||||
if (content) {
|
||||
let document = parseHTML(content).document
|
||||
// preParse content
|
||||
const preParsedDom = await preParseContent(url, document)
|
||||
if (preParsedDom) {
|
||||
document = preParsedDom
|
||||
}
|
||||
readabilityResult = await getReadabilityResult(url, document)
|
||||
}
|
||||
}
|
||||
|
||||
console.info(`content-fetch result`, logRecord)
|
||||
}
|
||||
|
||||
return { finalUrl: url, title, content, readabilityResult, contentType }
|
||||
return { finalUrl: url, title, content, contentType }
|
||||
}
|
||||
|
||||
function validateUrlString(url: string) {
|
||||
@ -741,99 +729,3 @@ async function retrieveHtml(page: Page, logRecord: Record<string, any>) {
|
||||
// console.info(`preview-image`, logRecord);
|
||||
// return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
|
||||
// }
|
||||
|
||||
const DOM_PURIFY_CONFIG = {
|
||||
ADD_TAGS: ['iframe'],
|
||||
ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
|
||||
FORBID_ATTR: [
|
||||
'data-ml-dynamic',
|
||||
'data-ml-dynamic-type',
|
||||
'data-orig-url',
|
||||
'data-ml-id',
|
||||
'data-ml',
|
||||
'data-xid',
|
||||
'data-feature',
|
||||
],
|
||||
}
|
||||
|
||||
function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
|
||||
if (data.tagName === 'iframe') {
|
||||
const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
|
||||
const src = node.getAttribute('src') || ''
|
||||
const dataSrc = node.getAttribute('data-src') || ''
|
||||
|
||||
if (src && urlRegex.test(src)) {
|
||||
return
|
||||
}
|
||||
|
||||
if (dataSrc && urlRegex.test(dataSrc)) {
|
||||
node.setAttribute('src', dataSrc)
|
||||
return
|
||||
}
|
||||
|
||||
node.parentNode?.removeChild(node)
|
||||
}
|
||||
}
|
||||
|
||||
function getPurifiedContent(html: Document) {
|
||||
const newWindow = parseHTML('')
|
||||
const DOMPurify = createDOMPurify(newWindow)
|
||||
DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
|
||||
const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG)
|
||||
return parseHTML(clean).document
|
||||
}
|
||||
|
||||
function signImageProxyUrl(url: string) {
|
||||
return encode(
|
||||
crypto
|
||||
.createHmac('sha256', process.env.IMAGE_PROXY_SECRET || '')
|
||||
.update(url)
|
||||
.digest()
|
||||
)
|
||||
}
|
||||
|
||||
function createImageProxyUrl(url: string, width = 0, height = 0) {
|
||||
if (!process.env.IMAGE_PROXY_URL || !process.env.IMAGE_PROXY_SECRET) {
|
||||
return url
|
||||
}
|
||||
|
||||
const urlWithOptions = `${url}#${width}x${height}`
|
||||
const signature = signImageProxyUrl(urlWithOptions)
|
||||
|
||||
return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
|
||||
}
|
||||
|
||||
async function getReadabilityResult(url: string, document: Document) {
|
||||
// First attempt to read the article as is.
|
||||
// if that fails attempt to purify then read
|
||||
const sources = [
|
||||
() => {
|
||||
return document
|
||||
},
|
||||
() => {
|
||||
return getPurifiedContent(document)
|
||||
},
|
||||
]
|
||||
|
||||
for (const source of sources) {
|
||||
const document = source()
|
||||
if (!document) {
|
||||
continue
|
||||
}
|
||||
|
||||
try {
|
||||
const article = await new Readability(document, {
|
||||
createImageProxyUrl,
|
||||
url,
|
||||
}).parse()
|
||||
|
||||
if (article) {
|
||||
return article
|
||||
}
|
||||
} catch (error) {
|
||||
console.log('parsing error for url', url, error)
|
||||
}
|
||||
}
|
||||
|
||||
return null
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user