Files
omnivore/packages/api/src/utils/parser.ts

545 lines
15 KiB
TypeScript

/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unused-vars */
import { Readability } from '@omnivore/readability'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { PageType, PreparedDocumentInput } from '../generated/graphql'
import { buildLogger, LogRecord } from './logger'
import { createImageProxyUrl } from './imageproxy'
import axios from 'axios'
import { WikipediaHandler } from './wikipedia-handler'
import { SubstackHandler } from './substack-handler'
import { AxiosHandler } from './axios-handler'
import { BloombergHandler } from './bloomberg-handler'
import { GolangHandler } from './golang-handler'
import * as hljs from 'highlightjs'
import { decode } from 'html-entities'
import { parseHTML } from 'linkedom'
// Logger scoped to this parsing module.
const logger = buildLogger('utils.parse')
// Content types we will attempt to parse; a request advertising any other
// explicit contentType is returned unparsed (see parsePreparedContent).
// NOTE(review): octet-stream presumably covers clients that send a generic
// binary type for HTML — confirm against extension behavior.
export const ALLOWED_CONTENT_TYPES = [
'text/html',
'application/octet-stream',
'text/plain',
]
// DOMPurify configuration shared by every sanitize call in this file.
const DOM_PURIFY_CONFIG = {
// Keep iframes in the output; the uponSanitizeElement hook below then
// removes any iframe that is not a YouTube embed.
ADD_TAGS: ['iframe'],
ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
// Always strip these attributes — they appear to be mailing-list /
// tracking data attributes (NOTE(review): assumption; verify on samples).
FORBID_ATTR: [
'data-ml-dynamic',
'data-ml-dynamic-type',
'data-orig-url',
'data-ml-id',
'data-ml',
'data-xid',
'data-feature',
],
}
// Contract for the site-specific handlers below.
// shouldPrehandle: whether this handler applies to the given URL/document.
// prehandle: transforms the document (may mutate in place) before parsing.
// NOTE(review): applyHandlers also logs a `name` property that is not
// declared on this interface — consider adding `name: string` here.
interface ContentHandler {
shouldPrehandle: (url: URL, dom: Document) => boolean
prehandle: (url: URL, document: Document) => Promise<Document>
}
// Site-specific pre-processing handlers; applyHandlers applies the first
// one whose shouldPrehandle(url, dom) returns true.
const HANDLERS = [
new WikipediaHandler(),
new SubstackHandler(),
new AxiosHandler(),
new BloombergHandler(),
new GolangHandler(),
]
/**
 * DOMPurify `uponSanitizeElement` hook: keeps iframes only when they point
 * at a YouTube embed (directly or via a lazy-load `data-src`); every other
 * iframe is removed from the document.
 */
const domPurifySanitizeHook = (
  node: Element,
  data: SanitizeElementHookEvent
): void => {
  if (data.tagName !== 'iframe') {
    return
  }
  const youtubeEmbed = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
  const src = node.getAttribute('src') || ''
  const dataSrc = node.getAttribute('data-src') || ''
  // Already a direct YouTube embed — keep as-is.
  if (src && youtubeEmbed.test(src)) {
    return
  }
  // Lazy-loaded player: promote data-src to src so it works after sanitize.
  if (dataSrc && youtubeEmbed.test(dataSrc)) {
    node.setAttribute('src', dataSrc)
    return
  }
  // Anything else is dropped entirely.
  node.parentNode?.removeChild(node)
}
// Result shape returned by parsePreparedContent.
export type ParsedContentPuppeteer = {
// Raw HTML exactly as received from the client
domContent: string
// Readability output, or null when parsing failed / content was rejected
parsedContent: Readability.ParseResult | null
canonicalUrl?: string | null
// Derived from the page's og:type meta tag (see parseOriginalContent)
pageType: PageType
}
/* eslint-disable @typescript-eslint/no-explicit-any */
// Structured log record accumulated while parsing an article.
// NOTE(review): several fields (blockedByClient, puppeteerSuccess, ...) are
// never populated in this file — presumably filled by other callers; verify.
type ArticleParseLogRecord = LogRecord & {
url: string
userAgent?: string
pageInfo?: { [key: string]: any }
blockedByClient?: boolean
parsedOrigin?: boolean
origin?: string
puppeteerSuccess?: boolean
puppeteerError?: { [key: string]: any }
parseSuccess?: boolean
parseError?: { [key: string]: any }
scrollError?: boolean
isAllowedContentType?: boolean
}
/* eslint-enable @typescript-eslint/no-explicit-any */
// Enables Readability's verbose debug output when DEBUG=true is set in the
// environment. (The previous trailing `|| false` was redundant — the `===`
// comparison already yields a boolean.)
const DEBUG_MODE = process.env.DEBUG === 'true'
/**
 * Maps the document's `og:type` meta tag to a PageType.
 * Returns PageType.Unknown when the tag is absent, unrecognized, or the
 * lookup throws.
 */
const parseOriginalContent = (document: Document): PageType => {
  // Recognized og:type values, keyed by their lowercase form.
  const byOgType: Record<string, PageType> = {
    article: PageType.Article,
    book: PageType.Book,
    profile: PageType.Profile,
    website: PageType.Website,
  }
  try {
    const ogType = document
      .querySelector("head meta[property='og:type']")
      ?.getAttribute('content')
    if (ogType) {
      return byOgType[ogType.toLowerCase()] ?? PageType.Unknown
    }
  } catch (error) {
    logger.error('Error extracting og:type from content', error)
  }
  return PageType.Unknown
}
/**
 * Sanitizes an HTML string with DOMPurify (keeping YouTube iframes via the
 * hook above) and re-parses the clean markup into a fresh Document.
 */
const getPurifiedContent = (html: string): Document => {
  const window = parseHTML('')
  const purifier = createDOMPurify(window)
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitized).document
}
/**
 * Runs Readability over the document, and — if that yields nothing — over a
 * DOMPurify-sanitized re-parse of the raw HTML.
 *
 * Fixes vs. previous version:
 * - the loop variable shadowed the `document` parameter (renamed to `doc`);
 * - the candidate document is now built inside the try/catch, so a throw
 *   from getPurifiedContent is logged and skipped instead of propagating.
 *
 * @param url - page URL, passed to Readability and used in error logs
 * @param html - raw HTML string (used for the purified second attempt)
 * @param document - already-parsed Document for the first attempt
 * @param isNewsletter - when true, Readability keeps tables intact
 * @returns the first successful parse result, or null if both attempts fail
 */
const getReadabilityResult = async (
  url: string,
  html: string,
  document: Document,
  isNewsletter?: boolean
): Promise<Readability.ParseResult | null> => {
  // First attempt to read the article as is.
  // if that fails attempt to purify then read
  const sources = [
    () => {
      return document
    },
    () => {
      return getPurifiedContent(html)
    },
  ]
  for (const source of sources) {
    try {
      const doc = source()
      if (!doc) {
        continue
      }
      const article = await new Readability(doc, {
        debug: DEBUG_MODE,
        createImageProxyUrl,
        keepTables: isNewsletter,
        url,
      }).parse()
      if (article) {
        return article
      }
    } catch (error) {
      console.log('parsing error for url', url, error)
    }
  }
  return null
}
const applyHandlers = async (
url: string,
document: Document
): Promise<void> => {
try {
const u = new URL(url)
const handler = HANDLERS.find((h) => {
try {
return h.shouldPrehandle(u, document)
} catch (e) {
console.log('error with handler: ', h.name, e)
}
return false
})
if (handler) {
try {
console.log('pre-handling url or content with handler: ', handler.name)
await handler.prehandle(u, document)
} catch (e) {
console.log('error with handler: ', handler, e)
}
}
} catch (error) {
logger.error('Error prehandling url', url, error)
}
}
/**
 * Parses a pre-fetched document (from the extension or puppeteer) into a
 * Readability result: applies site handlers, runs Readability (with one
 * retry wrapping bare fragments in <html>), highlights code blocks,
 * sanitizes the output, and backfills metadata from a JSON-LD oembed link.
 *
 * Fixes vs. previous version:
 * - the JSON-LD metadata was wrapped in a pointless async IIFE whose
 *   promise was awaited three times; it is now awaited once (the helper
 *   catches internally and never rejects, so behavior is unchanged);
 * - removed a leftover `console.log(e.textContent)` that dumped the text of
 *   every <code> element to the log.
 *
 * @param url - the page's URL
 * @param preparedDocument - raw HTML plus page info captured by the client
 * @param isNewsletter - when true, Readability preserves tables
 * @param allowRetry - internal: permits one retry with <html>-wrapped input
 */
export const parsePreparedContent = async (
  url: string,
  preparedDocument: PreparedDocumentInput,
  isNewsletter?: boolean,
  allowRetry = true
): Promise<ParsedContentPuppeteer> => {
  const logRecord: ArticleParseLogRecord = {
    url: url,
    labels: { source: 'parsePreparedContent' },
  }
  let article = null
  const { document, pageInfo } = preparedDocument
  // Checking for content type acceptance or if there are no contentType
  // at all (backward extension versions compatibility)
  if (
    pageInfo.contentType &&
    !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType)
  ) {
    console.log('Not allowed content type', pageInfo.contentType)
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent: preparedDocument.document,
      pageType: PageType.Unknown,
    }
  }
  const dom = parseHTML(document).document
  await applyHandlers(url, dom)
  try {
    article = await getReadabilityResult(url, document, dom, isNewsletter)
    // Bare fragments sometimes fail to parse; wrap in <html> and retry once.
    if (!article?.textContent && allowRetry) {
      const newDocument = {
        ...preparedDocument,
        document: '<html>' + preparedDocument.document + '</html>',
      }
      return parsePreparedContent(url, newDocument, isNewsletter, false)
    }
    // Format code blocks with highlight.js
    // TODO: we probably want to move this type of thing
    // to the handlers, and have some concept of postHandle
    if (article?.dom) {
      const codeBlocks = article.dom.querySelectorAll('code')
      if (codeBlocks.length > 0) {
        codeBlocks.forEach((e) => {
          if (e.textContent) {
            const att = hljs.highlightAuto(e.textContent)
            const code = dom.createElement('code')
            const langClass =
              `hljs language-${att.language}` +
              (att.second_best?.language
                ? ` language-${att.second_best?.language}`
                : '')
            code.setAttribute('class', langClass)
            code.innerHTML = att.value
            e.replaceWith(code)
          }
        })
        article.content = article.dom.outerHTML
      }
    }
    // Sanitize the parsed content; the hook keeps YouTube iframes only.
    const newWindow = parseHTML('')
    const DOMPurify = createDOMPurify(newWindow)
    DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
    const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
    // Backfill fields Readability could not extract from the page's JSON-LD
    // oembed endpoint (best-effort; helper returns {} on any failure).
    const jsonLdLinkMetadata = await getJSONLdLinkMetadata(dom)
    // NOTE(review): when `article` is null this assigns into a throwaway
    // object and the metadata is discarded — preserved as-is, since the
    // function then returns parsedContent: null anyway.
    Object.assign(article || {}, {
      content: clean,
      title: article?.title || jsonLdLinkMetadata.title,
      previewImage: article?.previewImage || jsonLdLinkMetadata.previewImage,
      siteName: article?.siteName || jsonLdLinkMetadata.siteName,
      siteIcon: article?.siteIcon,
      byline: article?.byline || jsonLdLinkMetadata.byline,
      language: article?.language,
    })
    logRecord.parseSuccess = true
  } catch (error) {
    console.log('Error parsing content', error)
    Object.assign(logRecord, {
      parseSuccess: false,
      parseError: error,
    })
  }
  const { title, canonicalUrl } = pageInfo
  // Fall back to the title captured by the client if parsing found none.
  Object.assign(article || {}, {
    title: article?.title || title,
  })
  logger.info('parse-article completed')
  return {
    domContent: preparedDocument.document,
    parsedContent: article,
    canonicalUrl,
    pageType: parseOriginalContent(dom),
  }
}
/**
 * Fetches the page's oembed (JSON-LD) endpoint, when advertised via a
 * <link type="application/json+oembed"> tag, and extracts article metadata.
 *
 * Example article: https://thoughtsofstone.com/the-great-feminization/
 *
 * JSONLD Link example: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F
 * @param document - Document object of the content to parse the link from
 * @returns Partial parse result with whatever fields the endpoint provided
 *   (empty object when the link is absent or the request fails)
 */
const getJSONLdLinkMetadata = async (
  document: Document
): Promise<Partial<Readability.ParseResult>> => {
  const metadata: Partial<Readability.ParseResult> = {}
  try {
    const oembedLink = document.querySelector<HTMLLinkElement>(
      "link[type='application/json+oembed']"
    )
    if (!oembedLink?.href) {
      return metadata
    }
    const response = await axios.get(oembedLink.href, { timeout: 5000 })
    const jsonLd = response.data || {}
    metadata.byline = decode(jsonLd['author_name'])
    metadata.previewImage = decode(jsonLd['thumbnail_url'])
    metadata.siteName = decode(jsonLd['provider_name'])
    metadata.title = decode(jsonLd['title'])
  } catch (error) {
    logger.warning(`Unable to get JSONLD link of the article`, error)
  }
  return metadata
}
// Open Graph-derived page metadata (see parsePageMetadata). description and
// previewImage default to '' when absent; title/author stay undefined.
type Metadata = {
title?: string
author?: string
description: string
previewImage: string
}
/**
 * Extracts Open Graph and author metadata from an HTML string.
 * Returns undefined only if parsing the HTML itself throws.
 */
export const parsePageMetadata = (html: string): Metadata | undefined => {
  try {
    const document = parseHTML(html).document
    // Reads a meta tag's content attribute; undefined when tag is absent.
    const metaContent = (selector: string): string | undefined =>
      document.querySelector(selector)?.getAttribute('content') ?? undefined
    // TODO: we should be able to apply the JSONLD metadata
    // here too
    return {
      title: metaContent("head meta[property='og:title']") || undefined,
      author: metaContent("head meta[name='author']") || undefined,
      description: metaContent("head meta[property='og:description']") || '',
      previewImage: metaContent("head meta[property='og:image']") || '',
    }
  } catch (e) {
    console.log('failed to parse page:', html, e)
    return undefined
  }
}
/**
 * Fetches a URL and extracts its Open Graph metadata.
 * Returns undefined when the request or the parse fails.
 */
export const parseUrlMetadata = async (
  url: string
): Promise<Metadata | undefined> => {
  try {
    const response = await axios.get(url)
    return parsePageMetadata(response.data)
  } catch (e) {
    console.log('failed to get:', url, e)
    return undefined
  }
}
// Attempt to determine if an HTML blob is a newsletter
// based on its contents.
// TODO: when we consolidate the handlers we could include this
// as a utility method on each one.
export const isProbablyNewsletter = async (html: string): Promise<boolean> => {
  const dom = parseHTML(html).document
  // Parse a copy so Readability's mutations don't touch `dom`.
  const domCopy = parseHTML(dom.documentElement.outerHTML).document
  const article = await new Readability(domCopy, {
    debug: false,
    keepTables: true,
  }).parse()
  // Not readable at all — definitely not a newsletter we can handle.
  if (!article?.content) {
    return false
  }
  // substack newsletter emails have tables with a *post-meta class
  if (dom.querySelector('table[class$="post-meta"]')) {
    return true
  }
  // A header link together with substack heart/recommend icons is a strong
  // signal for a newsletter.
  const headerHref = findNewsletterHeaderHref(dom)
  if (headerHref) {
    const hasSubstackIcon =
      dom.querySelector('table tbody td span a img[src*="HeartIcon"]') ||
      dom.querySelector(
        'table tbody td span a img[src*="RecommendIconRounded"]'
      )
    if (hasSubstackIcon) {
      return true
    }
  }
  // beehiiv.net newsletters carry beehiiv-hosted images plus a Read Online link.
  if (
    dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0 &&
    beehiivNewsletterHref(dom)
  ) {
    return true
  }
  // revue newsletters carry getrevue.co images plus a View online link.
  if (
    dom.querySelectorAll('img[src*="getrevue.co"]').length > 0 &&
    revueNewsletterHref(dom)
  ) {
    return true
  }
  // convertkit newsletters are identified by their image hosts alone.
  return (
    dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
    ).length > 0
  )
}
/**
 * Finds the "Read Online" link beehiiv emails embed.
 * Keeps the LAST matching anchor's href, mirroring the original
 * forEach-overwrite behavior; undefined when no anchor matches.
 */
const beehiivNewsletterHref = (dom: Document): string | undefined => {
  let href: string | undefined = undefined
  const anchors = dom.querySelectorAll('table tr td div a[class*="link"]')
  for (const anchor of anchors) {
    if (anchor.textContent === 'Read Online') {
      href = anchor.getAttribute('href') || undefined
    }
  }
  return href
}
/**
 * Finds the "View this email in your browser" link convertkit emails embed.
 * Keeps the LAST matching anchor's href (same as the original forEach
 * overwrite); undefined when no anchor matches.
 */
const convertkitNewsletterHref = (dom: Document): string | undefined => {
  let href: string | undefined = undefined
  const anchors = dom.querySelectorAll('table tr td div a')
  for (const anchor of anchors) {
    if (anchor.textContent === 'View this email in your browser') {
      href = anchor.getAttribute('href') || undefined
    }
  }
  return href
}
/**
 * Finds the "View online" link revue emails embed.
 * Keeps the LAST matching anchor's href (same as the original forEach
 * overwrite); undefined when no anchor matches.
 */
const revueNewsletterHref = (dom: Document): string | undefined => {
  let href: string | undefined = undefined
  const anchors = dom.querySelectorAll('table tr td div a[target="_blank"]')
  for (const anchor of anchors) {
    if (anchor.textContent === 'View online') {
      href = anchor.getAttribute('href') || undefined
    }
  }
  return href
}
/**
 * Finds a newsletter's header/"view online" link: substack's post-header
 * anchor first, then beehiiv, revue, and convertkit fallbacks in order.
 */
const findNewsletterHeaderHref = (dom: Document): string | undefined => {
  // Substack post-header link (note: original selector has trailing space).
  const substackLink = dom.querySelector('h1 a ')
  if (substackLink) {
    return substackLink.getAttribute('href') || undefined
  }
  // Provider-specific fallbacks, first hit wins.
  return (
    beehiivNewsletterHref(dom) ??
    revueNewsletterHref(dom) ??
    convertkitNewsletterHref(dom)
  )
}
// Given an HTML blob tries to find a URL to use for
// a canonical URL.
export const findNewsletterUrl = async (
  html: string
): Promise<string | undefined> => {
  const dom = parseHTML(html).document
  // Check if this is a substack newsletter
  const href = findNewsletterHeaderHref(dom)
  if (!href) {
    return undefined
  }
  // Try a HEAD request so we get the redirected URL, since these links are
  // usually behind tracking-URL redirects; fall back to the raw href.
  try {
    const res = await axios({
      method: 'HEAD',
      url: href,
    })
    return res.request.res.responseUrl as string | undefined
  } catch {
    return href
  }
}