/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unused-vars */
import { Readability } from '@omnivore/readability'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { PageType, PreparedDocumentInput } from '../generated/graphql'
import { buildLogger, LogRecord } from './logger'
import { createImageProxyUrl } from './imageproxy'
import axios from 'axios'
import * as hljs from 'highlightjs'
import { decode } from 'html-entities'
import { parseHTML } from 'linkedom'
import { getRepository } from '../entity/utils'
import { User } from '../entity/user'
import { ILike } from 'typeorm'
import { v4 as uuid } from 'uuid'
import addressparser from 'addressparser'
import { preParseContent } from '@omnivore/content-handler'
import {
  EmbeddedHighlightData,
  findEmbeddedHighlight,
} from './highlightGenerator'
import { NodeHtmlMarkdown } from 'node-html-markdown'

const logger = buildLogger('utils.parse')

/** Content types that `parsePreparedContent` will attempt to parse. */
export const ALLOWED_CONTENT_TYPES = [
  'text/html',
  'application/octet-stream',
  'text/plain',
]

/** DOMPurify configuration shared by every sanitisation pass in this module. */
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}

const ARTICLE_PREFIX = 'omnivore:'

export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

/** Matches YouTube (and youtube-nocookie) embed URLs, with or without a scheme. */
const YOUTUBE_EMBED_URL_REGEX =
  /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i

/** Attributes that exclude an element (and its subtree) from anchor indexing. */
const ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES = [
  'omnivore-highlight-id',
  'data-twitter-tweet-id',
  'data-instagram-id',
]

/**
 * Hook that prevents DOMPurify from removing youtube iframes.
 *
 * An iframe is kept when its `src` is a YouTube embed URL. When only its
 * `data-src` is one, that value is copied into `src` and the iframe is kept.
 * Every other iframe is detached from its parent.
 */
const domPurifySanitizeHook = (
  node: Element,
  data: SanitizeElementHookEvent
): void => {
  if (data.tagName !== 'iframe') {
    return
  }

  const src = node.getAttribute('src') || ''
  if (src && YOUTUBE_EMBED_URL_REGEX.test(src)) {
    return
  }

  const dataSrc = node.getAttribute('data-src') || ''
  if (dataSrc && YOUTUBE_EMBED_URL_REGEX.test(dataSrc)) {
    node.setAttribute('src', dataSrc)
    return
  }

  node.parentNode?.removeChild(node)
}

export type ParsedContentPuppeteer = {
  domContent: string
  parsedContent: Readability.ParseResult | null
  canonicalUrl?: string | null
  pageType: PageType
  highlightData?: EmbeddedHighlightData
}

/* eslint-disable @typescript-eslint/no-explicit-any */
type ArticleParseLogRecord = LogRecord & {
  url: string
  userAgent?: string
  pageInfo?: { [key: string]: any }
  blockedByClient?: boolean
  parsedOrigin?: boolean
  origin?: string
  puppeteerSuccess?: boolean
  puppeteerError?: { [key: string]: any }
  parseSuccess?: boolean
  parseError?: { [key: string]: any }
  scrollError?: boolean
  isAllowedContentType?: boolean
}
/* eslint-enable @typescript-eslint/no-explicit-any */

const DEBUG_MODE = process.env.DEBUG === 'true'

/**
 * Derives the page type from the document's `og:type` meta tag.
 *
 * @param document - parsed document to inspect
 * @returns the matching `PageType`, or `PageType.Unknown` when the tag is
 *   missing, has an unrecognised value, or the lookup throws
 */
const parseOriginalContent = (document: Document): PageType => {
  try {
    const e = document.querySelector("head meta[property='og:type']")
    const content = e?.getAttribute('content')
    if (!content) {
      return PageType.Unknown
    }

    switch (content.toLowerCase()) {
      case 'article':
        return PageType.Article
      case 'book':
        return PageType.Book
      case 'profile':
        return PageType.Profile
      case 'website':
        return PageType.Website
    }
  } catch (error) {
    logger.error('Error extracting og:type from content', error)
  }

  return PageType.Unknown
}

/**
 * Runs DOMPurify over `html` using the module-wide config and the YouTube
 * iframe hook, on a fresh linkedom window.
 *
 * @param html - markup to sanitise
 * @returns the sanitised markup
 */
const sanitizeHtml = (html: string): string => {
  const newWindow = parseHTML('')
  const DOMPurify = createDOMPurify(newWindow)
  DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
  return DOMPurify.sanitize(html, DOM_PURIFY_CONFIG)
}

/** Sanitises `html` and parses the result into a document. */
const getPurifiedContent = (html: string): Document => {
  return parseHTML(sanitizeHtml(html)).document
}

/**
 * Runs Readability over the page, first on the document as given and, if that
 * yields nothing, on a purified copy of the raw HTML.
 *
 * @param url - page URL, forwarded to Readability
 * @param html - raw HTML, used only for the purified second attempt
 * @param document - already-parsed document for the first attempt
 * @param isNewsletter - forwarded to Readability as `keepTables`
 * @returns the first non-empty parse result, or `null` if both attempts fail
 */
const getReadabilityResult = async (
  url: string,
  html: string,
  document: Document,
  isNewsletter?: boolean
): Promise<Readability.ParseResult | null> => {
  // Sources are thunks so the purification cost is only paid when the first
  // attempt did not produce an article.
  const sources: Array<() => Document> = [
    () => document,
    () => getPurifiedContent(html),
  ]

  for (const source of sources) {
    const candidate = source()
    if (!candidate) {
      continue
    }

    try {
      const article = await new Readability(candidate, {
        debug: DEBUG_MODE,
        createImageProxyUrl,
        keepTables: isNewsletter,
        url,
      }).parse()

      if (article) {
        return article
      }
    } catch (error) {
      logger.error(`parsing error for url ${url}`, error)
    }
  }

  return null
}

/**
 * Replaces every non-empty `<code>` element in `articleDom` with a
 * highlight.js-rendered copy carrying `hljs language-*` classes.
 */
const highlightCodeBlocks = (articleDom: Document): void => {
  articleDom.querySelectorAll('code').forEach((block) => {
    if (!block.textContent) {
      return
    }

    const highlighted = hljs.highlightAuto(block.textContent)

    // Only emit language classes that were actually detected, so the class
    // list never contains `language-undefined`.
    const classes = ['hljs']
    const language = highlighted.language
    if (language) {
      classes.push(`language-${language}`)
    }
    const secondBest = highlighted.second_best?.language
    if (secondBest) {
      classes.push(`language-${secondBest}`)
    }

    // Create the replacement in the document it is inserted into.
    const code = articleDom.createElement('code')
    code.setAttribute('class', classes.join(' '))
    code.innerHTML = highlighted.value
    block.replaceWith(code)
  })
}

/**
 * Stamps `data-omnivore-anchor-idx` on the elements of `articleDom` in
 * depth-first document order. The top-level element is not numbered, and any
 * element carrying a blocked attribute is skipped together with its subtree.
 */
const assignAnchorIndexes = (articleDom: Document): void => {
  const pageNode = articleDom.firstElementChild as HTMLElement | null
  if (!pageNode) {
    return
  }

  const nodesToVisit: HTMLElement[] = [pageNode]
  const visitedNodes: HTMLElement[] = []

  while (nodesToVisit.length > 0) {
    const currentNode = nodesToVisit.pop()
    if (!currentNode || currentNode.nodeType !== 1) {
      continue
    }
    // Avoiding dynamic elements from being counted as anchor-allowed elements
    if (
      ANCHOR_ELEMENTS_BLOCKED_ATTRIBUTES.some((attrib) =>
        currentNode.hasAttribute(attrib)
      )
    ) {
      continue
    }

    visitedNodes.push(currentNode)

    // Children are pushed in reverse so they are popped in document order.
    // Non-element nodes are filtered out by the nodeType check above.
    for (const child of Array.from(currentNode.childNodes).reverse()) {
      nodesToVisit.push(child as HTMLElement)
    }
  }

  // Drop the top-level element; only its descendants receive an index.
  visitedNodes.shift()
  visitedNodes.forEach((node, index) => {
    // start from index 1, index 0 reserved for anchor unknown.
    node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
  })
}

/**
 * Parses a prepared document into article content.
 *
 * Uses `parseResult` when supplied; otherwise pre-parses the DOM with the
 * content handlers and runs Readability. The resulting article content gets
 * syntax-highlighted code blocks and anchor indexes, is sanitised, and has
 * missing title/byline/site name/preview image filled in from the page's
 * oEmbed link and finally from `pageInfo.title`.
 *
 * Errors raised while parsing are logged and recorded, not thrown; the
 * function then returns whatever article state was reached.
 *
 * @param url - URL of the page being parsed
 * @param preparedDocument - raw document HTML plus page info
 * @param parseResult - an existing Readability result to reuse, if any
 * @param isNewsletter - forwarded to Readability as `keepTables`
 * @param allowRetry - when true, an empty result triggers one retry with the
 *   document wrapped in `<html><body>`
 * @returns the original DOM content, the parsed article (or `null`), the
 *   canonical URL, the detected page type, and any embedded highlight data
 */
export const parsePreparedContent = async (
  url: string,
  preparedDocument: PreparedDocumentInput,
  parseResult?: Readability.ParseResult | null,
  isNewsletter?: boolean,
  allowRetry = true
): Promise<ParsedContentPuppeteer> => {
  const logRecord: ArticleParseLogRecord = {
    url: url,
    labels: { source: 'parsePreparedContent' },
  }

  // If we have a parse result, use it
  let article = parseResult || null
  let highlightData: EmbeddedHighlightData | undefined = undefined
  const { document, pageInfo } = preparedDocument

  // Checking for content type acceptance or if there are no contentType
  // at all (backward extension versions compatibility)
  if (
    pageInfo.contentType &&
    !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType)
  ) {
    logger.info(`Not allowed content type: ${pageInfo.contentType}`)
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent: preparedDocument.document,
      pageType: PageType.Unknown,
    }
  }

  let dom = parseHTML(document).document

  try {
    if (!article) {
      // Attempt to parse the article
      // preParse content
      const preParsedDom = await preParseContent(url, dom)
      if (preParsedDom) {
        dom = preParsedDom
      }

      article = await getReadabilityResult(url, document, dom, isNewsletter)
    }

    if (!article?.textContent && allowRetry) {
      // Retry once with the markup wrapped in a full document skeleton, so a
      // bare fragment gets a root element to parse under.
      // NOTE(review): the wrapper tags were missing from the source as
      // received (it concatenated empty strings, making the retry identical
      // to the first attempt) - confirm `<html><body>` is the intended wrapper.
      const newDocument = {
        ...preparedDocument,
        document:
          '<html><body>' + preparedDocument.document + '</body></html>',
      }
      return parsePreparedContent(
        url,
        newDocument,
        parseResult,
        isNewsletter,
        false
      )
    }

    // Format code blocks
    // TODO: we probably want to move this type of thing
    // to the handlers, and have some concept of postHandle
    if (article?.content) {
      const articleDom = parseHTML(article.content).document

      highlightCodeBlocks(articleDom)
      highlightData = findEmbeddedHighlight(articleDom.documentElement)
      assignAnchorIndexes(articleDom)

      article.content = articleDom.documentElement.outerHTML
    }

    if (article) {
      const clean = sanitizeHtml(article.content || '')

      // Only hit the network for oEmbed metadata when something is missing.
      const needsLinkMetadata =
        !article.title ||
        !article.previewImage ||
        !article.siteName ||
        !article.byline
      const linkMetadata: Partial<Readability.ParseResult> = needsLinkMetadata
        ? await getJSONLdLinkMetadata(dom)
        : {}

      Object.assign(article, {
        content: clean,
        title: article.title || linkMetadata.title,
        previewImage: article.previewImage || linkMetadata.previewImage,
        siteName: article.siteName || linkMetadata.siteName,
        byline: article.byline || linkMetadata.byline,
      })
    }

    logRecord.parseSuccess = true
  } catch (error) {
    logger.error('Error parsing content', error)
    Object.assign(logRecord, {
      parseSuccess: false,
      parseError: error,
    })
  }

  const { title, canonicalUrl } = pageInfo

  // Last-resort title: the one reported alongside the prepared document.
  if (article) {
    Object.assign(article, { title: article.title || title })
  }

  logger.info('parse-article completed', logRecord)

  return {
    domContent: preparedDocument.document,
    parsedContent: article,
    canonicalUrl,
    pageType: parseOriginalContent(dom),
    highlightData,
  }
}

/**
 * Fetches the oEmbed JSON link if found and parses article metadata from it
 *
 * Example article: https://thoughtsofstone.com/the-great-feminization/
 *
 * Link example: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F
 * @param document - parsed document of the content to read the link from
 * @returns Partial article result built from the link's `author_name`,
 *   `thumbnail_url`, `provider_name` and `title`; empty when there is no link
 *   or the request fails (failures are logged, never thrown)
 */
const getJSONLdLinkMetadata = async (
  document: Document
): Promise<Partial<Readability.ParseResult>> => {
  const result: Partial<Readability.ParseResult> = {}
  try {
    const jsonLdLink = document.querySelector<HTMLLinkElement>(
      "link[type='application/json+oembed']"
    )
    if (!jsonLdLink || !jsonLdLink.href) return result

    const jsonLd =
      (await axios.get(jsonLdLink.href, { timeout: 5000 })).data || {}

    result.byline = decode(jsonLd['author_name'])
    result.previewImage = decode(jsonLd['thumbnail_url'])
    result.siteName = decode(jsonLd['provider_name'])
    result.title = decode(jsonLd['title'])

    return result
  } catch (error) {
    logger.warning(`Unable to get JSONLD link of the article`, error)
    return result
  }
}

type Metadata = {
  title?: string
  author?: string
  description: string
  previewImage: string
}

/**
 * Extracts Open Graph metadata (plus the `author` meta tag) from raw HTML.
 *
 * @param html - page markup
 * @returns the metadata, with `''` for a missing description/preview image
 *   and `undefined` for a missing title/author; `undefined` if parsing throws
 */
export const parsePageMetadata = (html: string): Metadata | undefined => {
  try {
    const document = parseHTML(html).document

    /** Reads the `content` attribute of the first element matching `selector`. */
    const metaContent = (selector: string): string | null | undefined =>
      document.querySelector(selector)?.getAttribute('content')

    // get open graph metadata
    const description = metaContent("head meta[property='og:description']") || ''
    const previewImage = metaContent("head meta[property='og:image']") || ''
    const title = metaContent("head meta[property='og:title']") || undefined
    const author = metaContent("head meta[name='author']") || undefined

    // TODO: we should be able to apply the JSONLD metadata
    // here too

    return { title, author, description, previewImage }
  } catch (e) {
    // The markup itself is deliberately not logged: it can be arbitrarily large.
    logger.error('failed to parse page', e)
    return undefined
  }
}

/**
 * Downloads `url` and extracts its page metadata.
 *
 * @returns the metadata, or `undefined` when the request or parsing fails
 */
export const parseUrlMetadata = async (
  url: string
): Promise<Metadata | undefined> => {
  try {
    const res = await axios.get(url)
    return parsePageMetadata(res.data)
  } catch (e) {
    logger.error(`failed to get: ${url}`, e)
    return undefined
  }
}

/**
 * Escapes the LIKE wildcards (`%`, `_`) and the escape character itself so a
 * value is matched literally.
 * NOTE(review): assumes backslash is the database's LIKE escape character.
 */
const escapeLikePattern = (value: string): string =>
  value.replace(/[\\%_]/g, '\\$&')

/**
 * Decides whether an inbound email should be treated as an article: true when
 * the sender address belongs to a user (case-insensitive match) or the
 * subject contains the `omnivore:` prefix.
 */
export const isProbablyArticle = async (
  email: string,
  subject: string
): Promise<boolean> => {
  // The address comes from inbound mail, so wildcard characters are escaped
  // to keep the comparison an exact case-insensitive match.
  const user = await getRepository(User).findOneBy({
    email: ILike(escapeLikePattern(email)),
  })
  return !!user || subject.includes(ARTICLE_PREFIX)
}

/** Builds a placeholder URL for content that has no URL of its own. */
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()

/** Removes the first `omnivore:` prefix from a subject and trims the result. */
export const getTitleFromEmailSubject = (subject: string) => {
  const title = subject.replace(ARTICLE_PREFIX, '')
  return title.trim()
}

/**
 * Parses a `From` header into a name and an address.
 *
 * @param from - raw header value
 * @returns the first parsed address, or an empty name with `from` as the
 *   address when nothing could be parsed
 */
export const parseEmailAddress = (from: string): addressparser.EmailAddress => {
  // get author name from email
  // e.g. 'Jane Doe from Example App <jane@example.com>'
  // or 'John Smith <john@example.com>'
  const parsed = addressparser(from)
  if (parsed.length > 0) {
    return parsed[0]
  }
  return { name: '', address: from }
}

/**
 * Builds a favicon URL for the host that `url` resolves to.
 *
 * @returns the faviconkit URL, or `undefined` when the HEAD request fails or
 *   the resulting URL cannot be parsed
 */
export const fetchFavicon = async (
  url: string
): Promise<string | undefined> => {
  try {
    // get the correct url if it's a redirect
    const response = await axios.head(url, { timeout: 5000 })
    // Fall back to the requested URL when the response does not expose the
    // final URL.
    const realUrl: string = response.request?.res?.responseUrl ?? url
    const domain = new URL(realUrl).hostname
    return `https://api.faviconkit.com/${domain}/128`
  } catch (e) {
    logger.error('Error fetching favicon', e)
    return undefined
  }
}

/* ********************************************************* *
 * Re-use
 * If using it several times, creating an instance saves time
 * ********************************************************* */
const nhm = new NodeHtmlMarkdown(
  /* options (optional) */ {},
  /* customTransformers (optional) */ undefined,
  /* customCodeBlockTranslators (optional) */ undefined
)

/** Converts an HTML string to Markdown using the shared translator instance. */
export const htmlToMarkdown = (html: string) => {
  return nhm.translate(/* html */ html)
}