/* eslint-disable @typescript-eslint/no-unsafe-member-access */ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ /* eslint-disable @typescript-eslint/no-unused-vars */ import { Readability } from '@omnivore/readability' import { DOMWindow, JSDOM, VirtualConsole } from 'jsdom' import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify' import { PageType, PreparedDocumentInput } from '../generated/graphql' import { buildLogger, LogRecord } from './logger' import { createImageProxyUrl } from './imageproxy' import axios from 'axios' import { WikipediaHandler } from './wikipedia-handler' import { SubstackHandler } from './substack-handler' import { AxiosHandler } from './axios-handler' import { BloombergHandler } from './bloomberg-handler' import * as hljs from 'highlightjs' const logger = buildLogger('utils.parse') const virtualConsole = new VirtualConsole() export const ALLOWED_CONTENT_TYPES = [ 'text/html', 'application/octet-stream', 'text/plain', ] const DOM_PURIFY_CONFIG = { ADD_TAGS: ['iframe'], ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'], FORBID_ATTR: [ 'data-ml-dynamic', 'data-ml-dynamic-type', 'data-orig-url', 'data-ml-id', 'data-ml', 'data-xid', 'data-feature', ], } interface ContentHandler { shouldPrehandle: (url: URL, dom: DOMWindow) => boolean prehandle: (url: URL, document: DOMWindow) => Promise } const HANDLERS = [ new WikipediaHandler(), new SubstackHandler(), new AxiosHandler(), new BloombergHandler(), ] /** Hook that prevents DOMPurify from removing youtube iframes */ const domPurifySanitizeHook = ( node: Element, data: SanitizeElementHookEvent ): void => { if (data.tagName === 'iframe') { const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i const src = node.getAttribute('src') || '' const dataSrc = node.getAttribute('data-src') || '' if (src && urlRegex.test(src)) { return } if (dataSrc && urlRegex.test(dataSrc)) { node.setAttribute('src', dataSrc) return } node.parentNode?.removeChild(node) } } export type ParsedContentPuppeteer = { domContent: string parsedContent: Readability.ParseResult | null canonicalUrl?: string | null } /* eslint-disable @typescript-eslint/no-explicit-any */ type ArticleParseLogRecord = LogRecord & { url: string userAgent?: string pageInfo?: { [key: string]: any } blockedByClient?: boolean parsedOrigin?: boolean origin?: string puppeteerSuccess?: boolean puppeteerError?: { [key: string]: any } parseSuccess?: boolean parseError?: { [key: string]: any } scrollError?: boolean isAllowedContentType?: boolean } /* eslint-enable @typescript-eslint/no-explicit-any */ const DEBUG_MODE = process.env.DEBUG === 'true' || false export const parseOriginalContent = (url: string, html: string): PageType => { try { const { window } = new JSDOM(html, { url }) const e = window.document.querySelector("head meta[property='og:type']") const content = e?.getAttribute('content') if (!content) { return PageType.Unknown } switch (content.toLowerCase()) { case 'article': return PageType.Article case 'book': return PageType.Book case 'profile': return PageType.Profile case 'website': return PageType.Website } } catch (error) { logger.error('Error extracting og:type from content for url', url, error) } return PageType.Unknown } const getPurifiedContent = (html: string): Document => { const newWindow = new JSDOM('').window const DOMPurify = createDOMPurify(newWindow as unknown as Window) DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook) const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG) return new JSDOM(clean).window.document } const getReadabilityResult = ( url: string, html: string, window: DOMWindow ): Readability.ParseResult | null => { virtualConsole.removeAllListeners('jsdomError') virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => { logger.warning(`JSDOM error occurred`, { errorMsg: message, ...details, }) }) // First attempt to read the article as is. // if that fails attempt to purify then read const sources = [ () => { return window.document }, () => { return getPurifiedContent(html) }, ] for (const source of sources) { const document = source() if (!document) { continue } try { const article = new Readability(document, { debug: DEBUG_MODE, createImageProxyUrl, }).parse() if (article) { return article } } catch (error) { console.log('parsing error for url', url, error) } } return null } const applyHandlers = async (url: string, window: DOMWindow): Promise => { try { const u = new URL(url) const handler = HANDLERS.find((h) => { try { return h.shouldPrehandle(u, window) } catch (e) { console.log('error with handler: ', h.name, e) } return false }) if (handler) { try { // The only handler we have now can modify the URL, but in the // future maybe we let it modify content. In that case // we might exit the request early. console.log('pre-handling url or content with handler: ', handler.name) await handler.prehandle(u, window) } catch (e) { console.log('error with handler: ', handler, e) } } } catch (error) { logger.error('Error prehandling url', url, error) } } export const parsePreparedContent = async ( url: string, preparedDocument: PreparedDocumentInput ): Promise => { const logRecord: ArticleParseLogRecord = { url: url, labels: { source: 'parsePreparedContent' }, } let article = null const { document, pageInfo } = preparedDocument // Checking for content type acceptance or if there are no contentType // at all (backward extension versions compatibility) if ( pageInfo.contentType && !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType) ) { console.log('Not allowed content type', pageInfo.contentType) return { canonicalUrl: url, parsedContent: null, domContent: preparedDocument.document, } } virtualConsole.removeAllListeners('jsdomError') virtualConsole.on('jsdomError', ({ message, stack: _stack, ...details }) => { logger.warning(`JSDOM error occurred`, { ...logRecord, errorMsg: message, ...details, }) }) const { window } = new JSDOM(document, { url, virtualConsole }) await applyHandlers(url, window) try { article = getReadabilityResult(url, document, window) const newWindow = new JSDOM('').window const DOMPurify = createDOMPurify(newWindow as unknown as Window) DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook) const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG) const jsonLdLinkMetadata = await getJSONLdLinkMetadata(window.document) logRecord.JSONLdParsed = jsonLdLinkMetadata if (article?.content) { const cWindow = new JSDOM(article?.content).window cWindow.document.querySelectorAll('code').forEach((e) => { console.log(e.textContent) if (e.textContent) { const att = hljs.highlightAuto(e.textContent) const code = window.document.createElement('code') const langClass = `hljs language-${att.language}` + att.second_best?.language ? ` language-${att.second_best?.language}` : '' code.setAttribute('class', langClass) code.innerHTML = att.value e.replaceWith(code) } }) article.content = cWindow.document.body.outerHTML } Object.assign(article, { content: clean, title: article?.title || jsonLdLinkMetadata.title, previewImage: article?.previewImage || jsonLdLinkMetadata.previewImage, siteName: article?.siteName || jsonLdLinkMetadata.siteName, byline: article?.byline || jsonLdLinkMetadata.byline, }) logRecord.parseSuccess = true } catch (error) { console.log('Error parsing content', error) Object.assign(logRecord, { parseSuccess: false, parseError: error, }) } const { title, canonicalUrl } = pageInfo Object.assign(article || {}, { title: article?.title || title, }) logger.info('parse-article completed') return { domContent: preparedDocument.document, parsedContent: article, canonicalUrl, } } /** * Fetches the JSONLD link if found and parses an article metadata if presented * * Example article: https://thoughtsofstone.com/the-great-feminization/ * * JSONLD Link example: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F * @param document - JSDOM Document object of the content to parse link from * @returns Parsed article partial result from the JSONLD link if found (possibly not) */ const getJSONLdLinkMetadata = async ( document: Document ): Promise> => { const result: Partial = {} try { const jsonLdLink = document.querySelector( "link[type='application/json+oembed']" ) if (!jsonLdLink || !jsonLdLink.href) return result const jsonLd = (await axios.get(jsonLdLink.href, { timeout: 5000 })).data || {} result.byline = jsonLd['author_name'] result.previewImage = jsonLd['thumbnail_url'] result.siteName = jsonLd['provider_name'] result.title = jsonLd['title'] return result } catch (error) { logger.warning(`Unable to get JSONLD link of the article`, error) return result } } type Metadata = { description: string previewImage: string } export const parseMetadata = async ( url: string ): Promise => { try { const res = await axios.get(url) const window = new JSDOM(res.data).window // get open graph metadata const description = window.document .querySelector("head meta[property='og:description']") ?.getAttribute('content') || '' const previewImage = window.document .querySelector("head meta[property='og:image']") ?.getAttribute('content') || '' return { description: description, previewImage: previewImage } } catch (e) { console.log('failed to got:', url, e) return undefined } }