488 lines
14 KiB
TypeScript
488 lines
14 KiB
TypeScript
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
|
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
|
/* eslint-disable @typescript-eslint/no-unused-vars */
|
|
import { Readability } from '@omnivore/readability'
|
|
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
|
|
import { PageType, PreparedDocumentInput } from '../generated/graphql'
|
|
import { buildLogger, LogRecord } from './logger'
|
|
import { createImageProxyUrl } from './imageproxy'
|
|
import axios from 'axios'
|
|
import * as hljs from 'highlightjs'
|
|
import { decode } from 'html-entities'
|
|
import { parseHTML } from 'linkedom'
|
|
import { getRepository } from '../entity/utils'
|
|
import { User } from '../entity/user'
|
|
import { ILike } from 'typeorm'
|
|
import { v4 as uuid } from 'uuid'
|
|
import addressparser from 'addressparser'
|
|
import { preParseContent } from '@omnivore/content-handler'
|
|
import {
|
|
EmbeddedHighlightData,
|
|
findEmbeddedHighlight,
|
|
} from './highlightGenerator'
|
|
import { NodeHtmlMarkdown } from 'node-html-markdown'
|
|
|
|
// Module-level logger, created through buildLogger with the 'utils.parse' name.
const logger = buildLogger('utils.parse')
|
|
|
|
/**
 * Content types that parsePreparedContent is willing to parse. A prepared
 * document whose pageInfo.contentType is set but not listed here is returned
 * unparsed.
 */
export const ALLOWED_CONTENT_TYPES = [
  'text/html',
  'application/octet-stream',
  'text/plain',
]
|
|
|
|
/**
 * Options handed to DOMPurify.sanitize in this module: iframes and a few
 * iframe-related attributes are added to the allow list, and a set of
 * `data-*` attributes is always stripped.
 */
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
|
|
// Marker looked for in (and stripped from) email subjects by the helpers below.
const ARTICLE_PREFIX = 'omnivore:'
|
|
|
|
// Base of the placeholder URLs produced by generateUniqueUrl (a uuid is appended).
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
|
|
|
|
/** Matches YouTube / YouTube-nocookie embed URLs (absolute or protocol-relative). */
const YOUTUBE_EMBED_URL_REGEX =
  /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i

/**
 * DOMPurify `uponSanitizeElement` hook that keeps YouTube iframes and removes
 * every other iframe.
 *
 * - An iframe whose `src` is a YouTube embed URL is left untouched.
 * - An iframe whose `data-src` is a YouTube embed URL gets that value copied
 *   into `src` and is kept.
 * - Any other iframe is detached from its parent.
 *
 * Non-iframe elements are ignored.
 *
 * @param node - element currently being sanitized
 * @param data - hook event; only `tagName` is read
 */
const domPurifySanitizeHook = (
  node: Element,
  data: SanitizeElementHookEvent
): void => {
  if (data.tagName !== 'iframe') {
    return
  }

  const src = node.getAttribute('src') || ''
  if (src && YOUTUBE_EMBED_URL_REGEX.test(src)) {
    return
  }

  const dataSrc = node.getAttribute('data-src') || ''
  if (dataSrc && YOUTUBE_EMBED_URL_REGEX.test(dataSrc)) {
    // Lazy-loaded embed: promote data-src so the iframe actually renders.
    node.setAttribute('src', dataSrc)
    return
  }

  node.parentNode?.removeChild(node)
}
|
|
|
|
/** Result returned by parsePreparedContent. */
export type ParsedContentPuppeteer = {
  /** The document string that was handed to the parser. */
  domContent: string
  /** Readability output, or null when nothing could be parsed. */
  parsedContent: Readability.ParseResult | null
  canonicalUrl?: string | null
  /** Page type derived from the document's `og:type` meta tag. */
  pageType: PageType
  /** Value returned by findEmbeddedHighlight for the parsed article, if any. */
  highlightData?: EmbeddedHighlightData
}
|
|
|
|
/* eslint-disable @typescript-eslint/no-explicit-any */
/** Shape of the structured log record built while parsing an article. */
type ArticleParseLogRecord = LogRecord & {
  url: string
  userAgent?: string
  pageInfo?: { [key: string]: any }
  blockedByClient?: boolean
  parsedOrigin?: boolean
  origin?: string
  puppeteerSuccess?: boolean
  puppeteerError?: { [key: string]: any }
  parseSuccess?: boolean
  parseError?: { [key: string]: any }
  scrollError?: boolean
  isAllowedContentType?: boolean
}
/* eslint-enable @typescript-eslint/no-explicit-any */
|
|
|
|
// True only when the DEBUG environment variable is exactly the string 'true'.
const DEBUG_MODE = process.env.DEBUG === 'true'
|
|
|
|
/**
 * Derives the page type from the document's Open Graph `og:type` meta tag.
 *
 * The tag value is compared case-insensitively against `article`, `book`,
 * `profile` and `website`. A missing tag, an unrecognised value, or an error
 * while querying the document all yield `PageType.Unknown` (errors are logged).
 *
 * @param document - parsed document to inspect
 * @returns the matching page type, or `PageType.Unknown`
 */
const parseOriginalContent = (document: Document): PageType => {
  try {
    const ogType = document
      .querySelector("head meta[property='og:type']")
      ?.getAttribute('content')
    if (!ogType) {
      return PageType.Unknown
    }

    switch (ogType.toLowerCase()) {
      case 'article':
        return PageType.Article
      case 'book':
        return PageType.Book
      case 'profile':
        return PageType.Profile
      case 'website':
        return PageType.Website
      default:
        return PageType.Unknown
    }
  } catch (error) {
    logger.error('Error extracting og:type from content', error)
  }

  return PageType.Unknown
}
|
|
|
|
/**
 * Sanitizes raw HTML with DOMPurify (using DOM_PURIFY_CONFIG and the iframe
 * hook) and parses the cleaned markup into a new document.
 *
 * @param html - raw HTML to sanitize
 * @returns document built from the sanitized HTML
 */
const getPurifiedContent = (html: string): Document => {
  // DOMPurify needs a window object; an empty linkedom window is used for that.
  const purifierWindow = parseHTML('')
  const purifier = createDOMPurify(purifierWindow)
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)

  const sanitizedHtml = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitizedHtml).document
}
|
|
|
|
/**
 * Runs Readability over the page, first on the document as given and, if that
 * yields nothing (or throws), on a DOMPurify-sanitized copy of the raw HTML.
 *
 * @param url - page URL, forwarded to Readability
 * @param html - raw HTML, used to build the sanitized fallback document
 * @param document - already-parsed document tried first
 * @param isNewsletter - forwarded to Readability as `keepTables`
 * @returns the first non-empty Readability result, or null if every attempt fails
 */
const getReadabilityResult = async (
  url: string,
  html: string,
  document: Document,
  isNewsletter?: boolean
): Promise<Readability.ParseResult | null> => {
  // Candidates are thunks so the sanitized copy is only built when the first
  // attempt did not produce an article.
  const candidateLoaders: Array<() => Document> = [
    () => document,
    () => getPurifiedContent(html),
  ]

  for (const loadCandidate of candidateLoaders) {
    const candidate = loadCandidate()
    if (!candidate) {
      continue
    }

    try {
      const article = await new Readability(candidate, {
        debug: DEBUG_MODE,
        createImageProxyUrl,
        keepTables: isNewsletter,
        url,
      }).parse()

      if (article) {
        return article
      }
    } catch (error) {
      // A failure on one candidate must not prevent trying the next one.
      console.log('parsing error for url', url, error)
    }
  }

  return null
}
|
|
|
|
/**
 * Replaces every non-empty `<code>` element with a syntax-highlighted copy.
 * The copy is created on `articleDom` itself so it belongs to the document it
 * is inserted into.
 */
const highlightCodeBlocks = (articleDom: Document): void => {
  articleDom.querySelectorAll('code').forEach((original) => {
    if (!original.textContent) {
      return
    }

    const highlighted = hljs.highlightAuto(original.textContent)
    // Only emit language classes that were actually detected, so the class
    // list never contains "language-undefined".
    const classNames = ['hljs']
    if (highlighted.language) {
      classNames.push(`language-${highlighted.language}`)
    }
    if (highlighted.second_best?.language) {
      classNames.push(`language-${highlighted.second_best.language}`)
    }

    const code = articleDom.createElement('code')
    code.setAttribute('class', classNames.join(' '))
    code.innerHTML = highlighted.value
    original.replaceWith(code)
  })
}

/**
 * Stamps `data-omnivore-anchor-idx` on the elements of `articleDom` in
 * document order, skipping the root element and any subtree whose root
 * carries one of the blocked attributes.
 */
const assignAnchorIndexes = (articleDom: Document): void => {
  const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
    'omnivore-highlight-id',
    'data-twitter-tweet-id',
    'data-instagram-id',
  ]

  const root = articleDom.firstElementChild
  const nodesToVisit: Array<Node | null> = [root]
  const visited: Element[] = []

  while (nodesToVisit.length > 0) {
    const current = nodesToVisit.pop()
    if (!current || current.nodeType !== 1) {
      continue
    }

    const element = current as Element
    // Avoid dynamic elements being counted as anchor-allowed elements.
    if (
      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
        element.hasAttribute(attrib)
      )
    ) {
      continue
    }

    visited.push(element)
    // Push children last-to-first so pop() yields them in document order.
    const children = Array.from(element.childNodes)
    for (let i = children.length - 1; i >= 0; i--) {
      nodesToVisit.push(children[i])
    }
  }

  // The first visited element is the root itself; it does not get an index.
  visited.shift()
  visited.forEach((element, index) => {
    // start from index 1, index 0 reserved for anchor unknown.
    element.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
  })
}

/** Sanitizes article HTML with DOMPurify, DOM_PURIFY_CONFIG and the iframe hook. */
const sanitizeArticleContent = (html: string): string => {
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  return purifier.sanitize(html, DOM_PURIFY_CONFIG)
}

/**
 * Parses a prepared document into readable article content.
 *
 * Steps:
 * 1. Reject documents whose content type is set and not in
 *    ALLOWED_CONTENT_TYPES (returned unparsed with `PageType.Unknown`).
 * 2. Unless a parse result was supplied, run the content pre-parser and then
 *    Readability.
 * 3. If no text was extracted and `allowRetry` is true, retry once with the
 *    document wrapped in `<html>` tags.
 * 4. Highlight code blocks, look for an embedded highlight, assign anchor
 *    indexes, sanitize the article HTML, and fill missing title / preview
 *    image / site name / byline from the page's oEmbed link.
 *
 * Errors raised in steps 2-4 are caught and logged; the function then returns
 * whatever article state was reached.
 *
 * @param url - URL of the page
 * @param preparedDocument - raw document string plus page info
 * @param parseResult - existing Readability result to reuse instead of parsing
 * @param isNewsletter - forwarded to Readability (keeps tables)
 * @param allowRetry - whether the `<html>`-wrapped retry may be attempted
 * @returns parsed content, the original document string, canonical URL, page
 *   type and embedded highlight data
 */
export const parsePreparedContent = async (
  url: string,
  preparedDocument: PreparedDocumentInput,
  parseResult?: Readability.ParseResult | null,
  isNewsletter?: boolean,
  allowRetry = true
): Promise<ParsedContentPuppeteer> => {
  // NOTE(review): logRecord is filled in below but never passed to the
  // logger — confirm whether it should accompany the completion log line.
  const logRecord: ArticleParseLogRecord = {
    url: url,
    labels: { source: 'parsePreparedContent' },
  }

  // If we have a parse result, use it
  let article = parseResult || null
  let highlightData = undefined
  const { document, pageInfo } = preparedDocument

  // Checking for content type acceptance or if there are no contentType
  // at all (backward extension versions compatibility)
  if (
    pageInfo.contentType &&
    !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType)
  ) {
    console.log('Not allowed content type', pageInfo.contentType)
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent: preparedDocument.document,
      pageType: PageType.Unknown,
    }
  }

  let dom = parseHTML(document).document

  try {
    if (!article) {
      // Let the content handlers pre-process the DOM before Readability runs.
      const preParsedDom = await preParseContent(url, dom)
      if (preParsedDom) {
        dom = preParsedDom
      }

      article = await getReadabilityResult(url, document, dom, isNewsletter)
    }

    if (!article?.textContent && allowRetry) {
      // Retry once with the document wrapped in <html>; allowRetry=false
      // guarantees the recursion stops after this attempt.
      const newDocument = {
        ...preparedDocument,
        document: '<html>' + preparedDocument.document + '</html>',
      }
      return parsePreparedContent(
        url,
        newDocument,
        parseResult,
        isNewsletter,
        false
      )
    }

    // Format code blocks
    // TODO: we probably want to move this type of thing
    // to the handlers, and have some concept of postHandle
    if (article?.content) {
      const articleDom = parseHTML(article.content).document

      highlightCodeBlocks(articleDom)
      highlightData = findEmbeddedHighlight(articleDom.documentElement)
      assignAnchorIndexes(articleDom)

      article.content = articleDom.documentElement.outerHTML
    }

    if (article) {
      const clean = sanitizeArticleContent(article.content || '')

      // Only hit the network for oEmbed metadata when a field is missing.
      const needsFallbackMetadata =
        !article.title ||
        !article.previewImage ||
        !article.siteName ||
        !article.byline
      const fallback: Partial<Readability.ParseResult> = needsFallbackMetadata
        ? await getJSONLdLinkMetadata(dom)
        : {}

      Object.assign(article, {
        content: clean,
        title: article.title || fallback.title,
        previewImage: article.previewImage || fallback.previewImage,
        siteName: article.siteName || fallback.siteName,
        siteIcon: article.siteIcon,
        byline: article.byline || fallback.byline,
        language: article.language,
      })
    }

    logRecord.parseSuccess = true
  } catch (error) {
    console.log('Error parsing content', error)
    Object.assign(logRecord, {
      parseSuccess: false,
      parseError: error,
    })
  }

  const { title, canonicalUrl } = pageInfo

  // Last-resort title: the one reported in the page info.
  if (article) {
    Object.assign(article, {
      title: article.title || title,
    })
  }

  logger.info('parse-article completed')

  return {
    domContent: preparedDocument.document,
    parsedContent: article,
    canonicalUrl,
    pageType: parseOriginalContent(dom),
    highlightData,
  }
}
|
|
|
|
/**
 * Fetches the page's JSON oEmbed link, if one is present, and extracts article
 * metadata from the response.
 *
 * Example article: https://thoughtsofstone.com/the-great-feminization/
 *
 * oEmbed link example: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F
 *
 * Never throws: a missing link yields an empty result, and any error while
 * fetching or reading the response is logged as a warning and yields whatever
 * was collected so far.
 *
 * @param document - parsed document to look for the
 *   `link[type='application/json+oembed']` element in
 * @returns byline, preview image, site name and title taken from the oEmbed
 *   response (HTML entities decoded), or an empty object
 */
const getJSONLdLinkMetadata = async (
  document: Document
): Promise<Partial<Readability.ParseResult>> => {
  const result: Partial<Readability.ParseResult> = {}
  try {
    const jsonLdLink = document.querySelector<HTMLLinkElement>(
      "link[type='application/json+oembed']"
    )
    if (!jsonLdLink || !jsonLdLink.href) return result

    const response = await axios.get(jsonLdLink.href, { timeout: 5000 })
    const jsonLd = response.data || {}

    result.byline = decode(jsonLd['author_name'])
    result.previewImage = decode(jsonLd['thumbnail_url'])
    result.siteName = decode(jsonLd['provider_name'])
    result.title = decode(jsonLd['title'])

    return result
  } catch (error) {
    logger.warning(`Unable to get JSONLD link of the article`, error)
    return result
  }
}
|
|
|
|
/** Page metadata extracted from a document's `<head>` meta tags. */
type Metadata = {
  title?: string
  author?: string
  /** Empty string when the page has no `og:description`. */
  description: string
  /** Empty string when the page has no `og:image`. */
  previewImage: string
}
|
|
|
|
/**
 * Extracts Open Graph / author metadata from an HTML string.
 *
 * `description` and `previewImage` default to an empty string, `title` and
 * `author` to undefined, when the corresponding meta tag is missing or empty.
 *
 * @param html - HTML source of the page
 * @returns the extracted metadata, or undefined if parsing throws (the
 *   failure is logged)
 */
export const parsePageMetadata = (html: string): Metadata | undefined => {
  try {
    const document = parseHTML(html).document

    // Reads the `content` attribute of the first element matching `selector`.
    const metaContent = (selector: string): string | null | undefined =>
      document.querySelector(selector)?.getAttribute('content')

    // get open graph metadata
    const description = metaContent("head meta[property='og:description']") || ''
    const previewImage = metaContent("head meta[property='og:image']") || ''
    const title = metaContent("head meta[property='og:title']") || undefined
    const author = metaContent("head meta[name='author']") || undefined

    // TODO: we should be able to apply the JSONLD metadata
    // here too

    return { title, author, description, previewImage }
  } catch (e) {
    console.log('failed to parse page:', html, e)
    return undefined
  }
}
|
|
|
|
/**
 * Downloads `url` with axios and extracts its page metadata via
 * parsePageMetadata.
 *
 * @param url - address of the page to fetch
 * @returns the extracted metadata, or undefined when the request fails (the
 *   failure is logged) or the page cannot be parsed
 */
export const parseUrlMetadata = async (
  url: string
): Promise<Metadata | undefined> => {
  try {
    const response = await axios.get(url)
    return parsePageMetadata(response.data)
  } catch (e) {
    console.log('failed to get:', url, e)
    return undefined
  }
}
|
|
|
|
/**
 * Decides whether an incoming email should be treated as an article: either
 * its subject contains ARTICLE_PREFIX, or a user exists whose email matches
 * `email` case-insensitively.
 *
 * @param email - email address to look up
 * @param subject - subject line of the email
 * @returns true when either condition holds
 */
export const isProbablyArticle = async (
  email: string,
  subject: string
): Promise<boolean> => {
  // The subject check is free, so do it first and skip the lookup when it hits.
  if (subject.includes(ARTICLE_PREFIX)) {
    return true
  }

  // ILike is a pattern match: escape `\`, `%` and `_` so the address is
  // compared literally (otherwise `a_b@x.com` would also match `aXb@x.com`).
  // NOTE(review): assumes backslash is the LIKE escape character of the
  // database in use (the PostgreSQL default) — confirm.
  const literalEmail = email.replace(/[\\%_]/g, '\\$&')
  const user = await getRepository(User).findOneBy({
    email: ILike(literalEmail),
  })
  return !!user
}
|
|
|
|
/** Builds a placeholder URL: FAKE_URL_PREFIX followed by a freshly generated uuid. */
export const generateUniqueUrl = (): string => `${FAKE_URL_PREFIX}${uuid()}`
|
|
|
|
/**
 * Turns an email subject into a title by removing the first occurrence of
 * ARTICLE_PREFIX and trimming surrounding whitespace.
 *
 * @param subject - subject line of the email
 * @returns the cleaned-up title
 */
export const getTitleFromEmailSubject = (subject: string): string => {
  return subject.replace(ARTICLE_PREFIX, '').trim()
}
|
|
|
|
/**
 * Parses a "From" header value into a name / address pair using
 * addressparser, e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
 * or 'Mike Allen <mike@axios.com>'.
 *
 * @param from - raw sender string
 * @returns the first parsed address, or `{ name: '', address: from }` when
 *   addressparser returns no entries
 */
export const parseEmailAddress = (from: string): addressparser.EmailAddress => {
  const [firstAddress] = addressparser(from)
  // Nothing parsed: fall back to treating the whole input as the address.
  return firstAddress ?? { name: '', address: from }
}
|
|
|
|
/**
 * Builds a faviconkit URL for the site that `url` finally resolves to.
 *
 * A HEAD request (5 s timeout) is issued first so that redirects are
 * followed and the hostname of the final response URL is used.
 *
 * @param url - page URL to resolve
 * @returns the favicon URL, or undefined when the request or URL parsing
 *   fails (the failure is logged)
 */
export const fetchFavicon = async (
  url: string
): Promise<string | undefined> => {
  try {
    // get the correct url if it's a redirect
    const response = await axios.head(url, { timeout: 5000 })
    const realUrl = response.request.res.responseUrl
    const { hostname } = new URL(realUrl)
    return `https://api.faviconkit.com/${hostname}/128`
  } catch (e) {
    console.log('Error fetching favicon', e)
    return undefined
  }
}
|
|
|
|
/* ********************************************************* *
 * Re-use
 * If using it several times, creating an instance saves time
 * ********************************************************* */
const nhm = new NodeHtmlMarkdown(
  /* options (optional) */ {},
  /* customTransformers (optional) */ undefined,
  /* customCodeBlockTranslators (optional) */ undefined
)
|
|
|
|
/**
 * Converts an HTML string to Markdown using the shared NodeHtmlMarkdown
 * instance.
 *
 * @param html - HTML to convert
 * @returns the result of `nhm.translate(html)`
 */
export const htmlToMarkdown = (html: string) => {
  return nhm.translate(/* html */ html)
}
|