Files
omnivore/packages/api/src/utils/parser.ts

545 lines
15 KiB
TypeScript

/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unused-vars */
import { Readability } from '@omnivore/readability'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { PageType, PreparedDocumentInput } from '../generated/graphql'
import { buildLogger, LogRecord } from './logger'
import { createImageProxyUrl } from './imageproxy'
import axios from 'axios'
import { WikipediaHandler } from './wikipedia-handler'
import { SubstackHandler } from './substack-handler'
import { AxiosHandler } from './axios-handler'
import { BloombergHandler } from './bloomberg-handler'
import { GolangHandler } from './golang-handler'
import * as hljs from 'highlightjs'
import { decode } from 'html-entities'
import { parseHTML } from 'linkedom'
// Logger scoped to this parsing module.
const logger = buildLogger('utils.parse')
// Content types we will attempt to parse; a request advertising any other
// explicit contentType is returned unparsed (see parsePreparedContent).
// NOTE(review): octet-stream presumably covers clients that send a generic
// binary type for HTML — confirm against extension behavior.
export const ALLOWED_CONTENT_TYPES = [
'text/html',
'application/octet-stream',
'text/plain',
]
// DOMPurify configuration shared by every sanitize call in this file.
const DOM_PURIFY_CONFIG = {
// Keep iframes in the output; the uponSanitizeElement hook below then
// removes any iframe that is not a YouTube embed.
ADD_TAGS: ['iframe'],
ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
// Always strip these attributes — they appear to be mailing-list /
// tracking data attributes (NOTE(review): assumption; verify on samples).
FORBID_ATTR: [
'data-ml-dynamic',
'data-ml-dynamic-type',
'data-orig-url',
'data-ml-id',
'data-ml',
'data-xid',
'data-feature',
],
}
// Contract for the site-specific handlers below.
// shouldPrehandle: whether this handler applies to the given URL/document.
// prehandle: transforms the document (may mutate in place) before parsing.
// NOTE(review): applyHandlers also logs a `name` property that is not
// declared on this interface — consider adding `name: string` here.
interface ContentHandler {
shouldPrehandle: (url: URL, dom: Document) => boolean
prehandle: (url: URL, document: Document) => Promise<Document>
}
// Site-specific pre-processing handlers; applyHandlers applies the first
// one whose shouldPrehandle(url, dom) returns true.
const HANDLERS = [
new WikipediaHandler(),
new SubstackHandler(),
new AxiosHandler(),
new BloombergHandler(),
new GolangHandler(),
]
/**
 * DOMPurify `uponSanitizeElement` hook: keeps iframes only when they point
 * at a YouTube embed (directly or via a lazy-load `data-src`); every other
 * iframe is removed from the document.
 */
const domPurifySanitizeHook = (
  node: Element,
  data: SanitizeElementHookEvent
): void => {
  if (data.tagName !== 'iframe') {
    return
  }
  const youtubeEmbed = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
  const src = node.getAttribute('src') || ''
  const dataSrc = node.getAttribute('data-src') || ''
  // Already a direct YouTube embed — keep as-is.
  if (src && youtubeEmbed.test(src)) {
    return
  }
  // Lazy-loaded player: promote data-src to src so it works after sanitize.
  if (dataSrc && youtubeEmbed.test(dataSrc)) {
    node.setAttribute('src', dataSrc)
    return
  }
  // Anything else is dropped entirely.
  node.parentNode?.removeChild(node)
}
// Result shape returned by parsePreparedContent.
export type ParsedContentPuppeteer = {
// Raw HTML exactly as received from the client
domContent: string
// Readability output, or null when parsing failed / content was rejected
parsedContent: Readability.ParseResult | null
canonicalUrl?: string | null
// Derived from the page's og:type meta tag (see parseOriginalContent)
pageType: PageType
}
/* eslint-disable @typescript-eslint/no-explicit-any */
// Structured log record accumulated while parsing an article.
// NOTE(review): several fields (blockedByClient, puppeteerSuccess, ...) are
// never populated in this file — presumably filled by other callers; verify.
type ArticleParseLogRecord = LogRecord & {
url: string
userAgent?: string
pageInfo?: { [key: string]: any }
blockedByClient?: boolean
parsedOrigin?: boolean
origin?: string
puppeteerSuccess?: boolean
puppeteerError?: { [key: string]: any }
parseSuccess?: boolean
parseError?: { [key: string]: any }
scrollError?: boolean
isAllowedContentType?: boolean
}
/* eslint-enable @typescript-eslint/no-explicit-any */
// Enables Readability's verbose debug output when DEBUG=true is set in the
// environment. (The previous trailing `|| false` was redundant — the `===`
// comparison already yields a boolean.)
const DEBUG_MODE = process.env.DEBUG === 'true'
/**
 * Maps the document's `og:type` meta tag to a PageType.
 * Returns PageType.Unknown when the tag is absent, unrecognized, or the
 * lookup throws.
 */
const parseOriginalContent = (document: Document): PageType => {
  // Recognized og:type values, keyed by their lowercase form.
  const byOgType: Record<string, PageType> = {
    article: PageType.Article,
    book: PageType.Book,
    profile: PageType.Profile,
    website: PageType.Website,
  }
  try {
    const ogType = document
      .querySelector("head meta[property='og:type']")
      ?.getAttribute('content')
    if (ogType) {
      return byOgType[ogType.toLowerCase()] ?? PageType.Unknown
    }
  } catch (error) {
    logger.error('Error extracting og:type from content', error)
  }
  return PageType.Unknown
}
/**
 * Sanitizes an HTML string with DOMPurify (keeping YouTube iframes via the
 * hook above) and re-parses the clean markup into a fresh Document.
 */
const getPurifiedContent = (html: string): Document => {
  const window = parseHTML('')
  const purifier = createDOMPurify(window)
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitized).document
}
/**
 * Runs Readability over the document, and — if that yields nothing — over a
 * DOMPurify-sanitized re-parse of the raw HTML.
 *
 * Fixes vs. previous version:
 * - the loop variable shadowed the `document` parameter (renamed to `doc`);
 * - the candidate document is now built inside the try/catch, so a throw
 *   from getPurifiedContent is logged and skipped instead of propagating.
 *
 * @param url - page URL, passed to Readability and used in error logs
 * @param html - raw HTML string (used for the purified second attempt)
 * @param document - already-parsed Document for the first attempt
 * @param isNewsletter - when true, Readability keeps tables intact
 * @returns the first successful parse result, or null if both attempts fail
 */
const getReadabilityResult = async (
  url: string,
  html: string,
  document: Document,
  isNewsletter?: boolean
): Promise<Readability.ParseResult | null> => {
  // First attempt to read the article as is.
  // if that fails attempt to purify then read
  const sources = [
    () => {
      return document
    },
    () => {
      return getPurifiedContent(html)
    },
  ]
  for (const source of sources) {
    try {
      const doc = source()
      if (!doc) {
        continue
      }
      const article = await new Readability(doc, {
        debug: DEBUG_MODE,
        createImageProxyUrl,
        keepTables: isNewsletter,
        url,
      }).parse()
      if (article) {
        return article
      }
    } catch (error) {
      console.log('parsing error for url', url, error)
    }
  }
  return null
}
const applyHandlers = async (
url: string,
document: Document
): Promise<void> => {
try {
const u = new URL(url)
const handler = HANDLERS.find((h) => {
try {
return h.shouldPrehandle(u, document)
} catch (e) {
console.log('error with handler: ', h.name, e)
}
return false
})
if (handler) {
try {
console.log('pre-handling url or content with handler: ', handler.name)
await handler.prehandle(u, document)
} catch (e) {
console.log('error with handler: ', handler, e)
}
}
} catch (error) {
logger.error('Error prehandling url', url, error)
}
}
/**
 * Parses a pre-fetched document (from the extension or puppeteer) into a
 * Readability result: applies site handlers, runs Readability (with one
 * retry wrapping bare fragments in <html>), highlights code blocks,
 * sanitizes the output, and backfills metadata from a JSON-LD oembed link.
 *
 * Fixes vs. previous version:
 * - the JSON-LD metadata was wrapped in a pointless async IIFE whose
 *   promise was awaited three times; it is now awaited once (the helper
 *   catches internally and never rejects, so behavior is unchanged);
 * - removed a leftover `console.log(e.textContent)` that dumped the text of
 *   every <code> element to the log.
 *
 * @param url - the page's URL
 * @param preparedDocument - raw HTML plus page info captured by the client
 * @param isNewsletter - when true, Readability preserves tables
 * @param allowRetry - internal: permits one retry with <html>-wrapped input
 */
export const parsePreparedContent = async (
  url: string,
  preparedDocument: PreparedDocumentInput,
  isNewsletter?: boolean,
  allowRetry = true
): Promise<ParsedContentPuppeteer> => {
  const logRecord: ArticleParseLogRecord = {
    url: url,
    labels: { source: 'parsePreparedContent' },
  }
  let article = null
  const { document, pageInfo } = preparedDocument
  // Checking for content type acceptance or if there are no contentType
  // at all (backward extension versions compatibility)
  if (
    pageInfo.contentType &&
    !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType)
  ) {
    console.log('Not allowed content type', pageInfo.contentType)
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent: preparedDocument.document,
      pageType: PageType.Unknown,
    }
  }
  const dom = parseHTML(document).document
  await applyHandlers(url, dom)
  try {
    article = await getReadabilityResult(url, document, dom, isNewsletter)
    // Bare fragments sometimes fail to parse; wrap in <html> and retry once.
    if (!article?.textContent && allowRetry) {
      const newDocument = {
        ...preparedDocument,
        document: '<html>' + preparedDocument.document + '</html>',
      }
      return parsePreparedContent(url, newDocument, isNewsletter, false)
    }
    // Format code blocks with highlight.js
    // TODO: we probably want to move this type of thing
    // to the handlers, and have some concept of postHandle
    if (article?.dom) {
      const codeBlocks = article.dom.querySelectorAll('code')
      if (codeBlocks.length > 0) {
        codeBlocks.forEach((e) => {
          if (e.textContent) {
            const att = hljs.highlightAuto(e.textContent)
            const code = dom.createElement('code')
            const langClass =
              `hljs language-${att.language}` +
              (att.second_best?.language
                ? ` language-${att.second_best?.language}`
                : '')
            code.setAttribute('class', langClass)
            code.innerHTML = att.value
            e.replaceWith(code)
          }
        })
        article.content = article.dom.outerHTML
      }
    }
    // Sanitize the parsed content; the hook keeps YouTube iframes only.
    const newWindow = parseHTML('')
    const DOMPurify = createDOMPurify(newWindow)
    DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
    const clean = DOMPurify.sanitize(article?.content || '', DOM_PURIFY_CONFIG)
    // Backfill fields Readability could not extract from the page's JSON-LD
    // oembed endpoint (best-effort; helper returns {} on any failure).
    const jsonLdLinkMetadata = await getJSONLdLinkMetadata(dom)
    // NOTE(review): when `article` is null this assigns into a throwaway
    // object and the metadata is discarded — preserved as-is, since the
    // function then returns parsedContent: null anyway.
    Object.assign(article || {}, {
      content: clean,
      title: article?.title || jsonLdLinkMetadata.title,
      previewImage: article?.previewImage || jsonLdLinkMetadata.previewImage,
      siteName: article?.siteName || jsonLdLinkMetadata.siteName,
      siteIcon: article?.siteIcon,
      byline: article?.byline || jsonLdLinkMetadata.byline,
      language: article?.language,
    })
    logRecord.parseSuccess = true
  } catch (error) {
    console.log('Error parsing content', error)
    Object.assign(logRecord, {
      parseSuccess: false,
      parseError: error,
    })
  }
  const { title, canonicalUrl } = pageInfo
  // Fall back to the title captured by the client if parsing found none.
  Object.assign(article || {}, {
    title: article?.title || title,
  })
  logger.info('parse-article completed')
  return {
    domContent: preparedDocument.document,
    parsedContent: article,
    canonicalUrl,
    pageType: parseOriginalContent(dom),
  }
}
/**
 * Fetches the page's oembed (JSON-LD) endpoint, when advertised via a
 * <link type="application/json+oembed"> tag, and extracts article metadata.
 *
 * Example article: https://thoughtsofstone.com/the-great-feminization/
 *
 * JSONLD Link example: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F
 * @param document - Document object of the content to parse the link from
 * @returns Partial parse result with whatever fields the endpoint provided
 *   (empty object when the link is absent or the request fails)
 */
const getJSONLdLinkMetadata = async (
  document: Document
): Promise<Partial<Readability.ParseResult>> => {
  const metadata: Partial<Readability.ParseResult> = {}
  try {
    const oembedLink = document.querySelector<HTMLLinkElement>(
      "link[type='application/json+oembed']"
    )
    if (!oembedLink?.href) {
      return metadata
    }
    const response = await axios.get(oembedLink.href, { timeout: 5000 })
    const jsonLd = response.data || {}
    metadata.byline = decode(jsonLd['author_name'])
    metadata.previewImage = decode(jsonLd['thumbnail_url'])
    metadata.siteName = decode(jsonLd['provider_name'])
    metadata.title = decode(jsonLd['title'])
  } catch (error) {
    logger.warning(`Unable to get JSONLD link of the article`, error)
  }
  return metadata
}
// Open Graph-derived page metadata (see parsePageMetadata). description and
// previewImage default to '' when absent; title/author stay undefined.
type Metadata = {
title?: string
author?: string
description: string
previewImage: string
}
/**
 * Extracts Open Graph and author metadata from an HTML string.
 * Returns undefined only if parsing the HTML itself throws.
 */
export const parsePageMetadata = (html: string): Metadata | undefined => {
  try {
    const document = parseHTML(html).document
    // Reads a meta tag's content attribute; undefined when tag is absent.
    const metaContent = (selector: string): string | undefined =>
      document.querySelector(selector)?.getAttribute('content') ?? undefined
    // TODO: we should be able to apply the JSONLD metadata
    // here too
    return {
      title: metaContent("head meta[property='og:title']") || undefined,
      author: metaContent("head meta[name='author']") || undefined,
      description: metaContent("head meta[property='og:description']") || '',
      previewImage: metaContent("head meta[property='og:image']") || '',
    }
  } catch (e) {
    console.log('failed to parse page:', html, e)
    return undefined
  }
}
/**
 * Fetches a URL and extracts its Open Graph metadata.
 * Returns undefined when the request or the parse fails.
 */
export const parseUrlMetadata = async (
  url: string
): Promise<Metadata | undefined> => {
  try {
    const response = await axios.get(url)
    return parsePageMetadata(response.data)
  } catch (e) {
    console.log('failed to get:', url, e)
    return undefined
  }
}
// Attempt to determine if an HTML blob is a newsletter
// based on its contents.
// TODO: when we consolidate the handlers we could include this
// as a utility method on each one.
export const isProbablyNewsletter = async (html: string): Promise<boolean> => {
  const dom = parseHTML(html).document
  // Parse a copy so Readability's mutations don't touch `dom`.
  const domCopy = parseHTML(dom.documentElement.outerHTML).document
  const article = await new Readability(domCopy, {
    debug: false,
    keepTables: true,
  }).parse()
  // Not readable at all — definitely not a newsletter we can handle.
  if (!article?.content) {
    return false
  }
  // substack newsletter emails have tables with a *post-meta class
  if (dom.querySelector('table[class$="post-meta"]')) {
    return true
  }
  // A header link together with substack heart/recommend icons is a strong
  // signal for a newsletter.
  const headerHref = findNewsletterHeaderHref(dom)
  if (headerHref) {
    const hasSubstackIcon =
      dom.querySelector('table tbody td span a img[src*="HeartIcon"]') ||
      dom.querySelector(
        'table tbody td span a img[src*="RecommendIconRounded"]'
      )
    if (hasSubstackIcon) {
      return true
    }
  }
  // beehiiv.net newsletters carry beehiiv-hosted images plus a Read Online link.
  if (
    dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0 &&
    beehiivNewsletterHref(dom)
  ) {
    return true
  }
  // revue newsletters carry getrevue.co images plus a View online link.
  if (
    dom.querySelectorAll('img[src*="getrevue.co"]').length > 0 &&
    revueNewsletterHref(dom)
  ) {
    return true
  }
  // convertkit newsletters are identified by their image hosts alone.
  return (
    dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
    ).length > 0
  )
}
/**
 * Finds the "Read Online" link beehiiv emails embed.
 * Keeps the LAST matching anchor's href, mirroring the original
 * forEach-overwrite behavior; undefined when no anchor matches.
 */
const beehiivNewsletterHref = (dom: Document): string | undefined => {
  let href: string | undefined = undefined
  const anchors = dom.querySelectorAll('table tr td div a[class*="link"]')
  for (const anchor of anchors) {
    if (anchor.textContent === 'Read Online') {
      href = anchor.getAttribute('href') || undefined
    }
  }
  return href
}
/**
 * Finds the "View this email in your browser" link convertkit emails embed.
 * Keeps the LAST matching anchor's href (same as the original forEach
 * overwrite); undefined when no anchor matches.
 */
const convertkitNewsletterHref = (dom: Document): string | undefined => {
  let href: string | undefined = undefined
  const anchors = dom.querySelectorAll('table tr td div a')
  for (const anchor of anchors) {
    if (anchor.textContent === 'View this email in your browser') {
      href = anchor.getAttribute('href') || undefined
    }
  }
  return href
}
/**
 * Finds the "View online" link revue emails embed.
 * Keeps the LAST matching anchor's href (same as the original forEach
 * overwrite); undefined when no anchor matches.
 */
const revueNewsletterHref = (dom: Document): string | undefined => {
  let href: string | undefined = undefined
  const anchors = dom.querySelectorAll('table tr td div a[target="_blank"]')
  for (const anchor of anchors) {
    if (anchor.textContent === 'View online') {
      href = anchor.getAttribute('href') || undefined
    }
  }
  return href
}
/**
 * Finds a newsletter's header/"view online" link: substack's post-header
 * anchor first, then beehiiv, revue, and convertkit fallbacks in order.
 */
const findNewsletterHeaderHref = (dom: Document): string | undefined => {
  // Substack post-header link (note: original selector has trailing space).
  const substackLink = dom.querySelector('h1 a ')
  if (substackLink) {
    return substackLink.getAttribute('href') || undefined
  }
  // Provider-specific fallbacks, first hit wins.
  return (
    beehiivNewsletterHref(dom) ??
    revueNewsletterHref(dom) ??
    convertkitNewsletterHref(dom)
  )
}
// Given an HTML blob tries to find a URL to use for
// a canonical URL.
export const findNewsletterUrl = async (
  html: string
): Promise<string | undefined> => {
  const dom = parseHTML(html).document
  // Check if this is a substack newsletter
  const href = findNewsletterHeaderHref(dom)
  if (!href) {
    return undefined
  }
  // Try a HEAD request so we get the redirected URL, since these links are
  // usually behind tracking-URL redirects; fall back to the raw href.
  try {
    const res = await axios({
      method: 'HEAD',
      url: href,
    })
    return res.request.res.responseUrl as string | undefined
  } catch {
    return href
  }
}