// omnivore/packages/api/src/utils/parser.ts

/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unused-vars */
import { Readability } from '@omnivore/readability'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { PageType, PreparedDocumentInput } from '../generated/graphql'
import { buildLogger, LogRecord } from './logger'
import { createImageProxyUrl } from './imageproxy'
import axios from 'axios'
import * as hljs from 'highlightjs'
import { decode } from 'html-entities'
import { parseHTML } from 'linkedom'
import { getRepository } from '../entity/utils'
import { User } from '../entity/user'
import { ILike } from 'typeorm'
import { v4 as uuid } from 'uuid'
import addressparser from 'addressparser'
import { preParseContent } from '@omnivore/content-handler'
import {
  EmbeddedHighlightData,
  findEmbeddedHighlight,
} from './highlightGenerator'
import { NodeHtmlMarkdown } from 'node-html-markdown'

// Module-level logger shared by the parsing helpers in this file.
const logger = buildLogger('utils.parse')

/**
 * Content types that `parsePreparedContent` is willing to parse. A prepared
 * document that reports any other content type is returned unparsed.
 */
export const ALLOWED_CONTENT_TYPES: string[] = [
  'text/html',
  'application/octet-stream',
  'text/plain',
]

/**
 * Options passed to every `DOMPurify.sanitize` call in this module.
 */
const DOM_PURIFY_CONFIG = {
  // iframes are let through here; `domPurifySanitizeHook` then removes every
  // iframe that is not a YouTube embed.
  ADD_TAGS: ['iframe'],
  // Extra attributes permitted on sanitized elements.
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  // Attributes that are always stripped from the sanitized output.
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
// Marker looked for in email subjects by `isProbablyArticle` and stripped
// from them by `getTitleFromEmailSubject`.
const ARTICLE_PREFIX = 'omnivore:'

// Base of the placeholder URLs produced by `generateUniqueUrl`; a UUID is
// appended as the `q` query value.
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

/**
 * DOMPurify `uponSanitizeElement` hook that keeps YouTube embed iframes and
 * detaches every other iframe from its parent.
 *
 * An iframe is kept when its `src` points at a YouTube (or
 * youtube-nocookie) `/embed/` URL. When only `data-src` matches, that value
 * is copied into `src` and the iframe is kept. Non-iframe elements are left
 * untouched.
 *
 * @param node - element currently being sanitized
 * @param data - hook event; only `tagName` is inspected
 */
const domPurifySanitizeHook = (
  node: Element,
  data: SanitizeElementHookEvent
): void => {
  if (data.tagName !== 'iframe') {
    return
  }

  const youtubeEmbedPattern =
    /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
  const isYoutubeEmbed = (candidate: string | null): candidate is string =>
    !!candidate && youtubeEmbedPattern.test(candidate)

  if (isYoutubeEmbed(node.getAttribute('src'))) {
    return
  }

  // Lazily-loaded embeds carry the real URL in data-src; promote it to src.
  const lazySource = node.getAttribute('data-src')
  if (isYoutubeEmbed(lazySource)) {
    node.setAttribute('src', lazySource)
    return
  }

  node.parentNode?.removeChild(node)
}

/** Result returned by `parsePreparedContent`. */
export type ParsedContentPuppeteer = {
  // The prepared document's HTML, as it was handed to the parser.
  domContent: string
  // Readability output, or null when nothing could be parsed.
  parsedContent: Readability.ParseResult | null
  // Canonical URL reported in the page info (the requested URL when the
  // content type is rejected).
  canonicalUrl?: string | null
  // Page type derived from the document's `og:type` meta tag.
  pageType: PageType
  // Result of `findEmbeddedHighlight` on the parsed article, if it ran.
  highlightData?: EmbeddedHighlightData
}

/* eslint-disable @typescript-eslint/no-explicit-any */
/**
 * Structured log payload describing one article parse attempt.
 *
 * In this file only `url`, `labels` (from `LogRecord`), `parseSuccess` and
 * `parseError` are populated; the remaining fields are presumably filled by
 * other producers of this record — verify against callers.
 */
type ArticleParseLogRecord = LogRecord & {
  url: string
  userAgent?: string
  pageInfo?: { [key: string]: any }
  blockedByClient?: boolean
  parsedOrigin?: boolean
  origin?: string
  puppeteerSuccess?: boolean
  puppeteerError?: { [key: string]: any }
  parseSuccess?: boolean
  parseError?: { [key: string]: any }
  scrollError?: boolean
  isAllowedContentType?: boolean
}
/* eslint-enable @typescript-eslint/no-explicit-any */

// Readability debug output is enabled only when the DEBUG environment
// variable is exactly the string 'true'.
const DEBUG_MODE = process.env.DEBUG === 'true'

/**
 * Maps the document's `og:type` meta tag to a `PageType`.
 *
 * @param document - parsed page whose `<head>` is inspected
 * @returns the matching page type, or `PageType.Unknown` when the tag is
 * missing, empty, unrecognised, or reading it throws
 */
const parseOriginalContent = (document: Document): PageType => {
  try {
    const ogType = document
      .querySelector("head meta[property='og:type']")
      ?.getAttribute('content')
      ?.toLowerCase()

    if (ogType === 'article') {
      return PageType.Article
    }
    if (ogType === 'book') {
      return PageType.Book
    }
    if (ogType === 'profile') {
      return PageType.Profile
    }
    if (ogType === 'website') {
      return PageType.Website
    }
  } catch (error) {
    logger.error('Error extracting og:type from content', error)
  }

  return PageType.Unknown
}

/**
 * Sanitizes raw HTML with DOMPurify (using the module's config and iframe
 * hook) and parses the cleaned markup into a fresh document.
 *
 * @param html - raw HTML to sanitize
 * @returns document built from the sanitized HTML
 */
const getPurifiedContent = (html: string): Document => {
  // A new DOMPurify instance bound to an empty window is created per call.
  const purifier = createDOMPurify(parseHTML(''))
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)

  const sanitizedHtml = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  const { document: sanitizedDocument } = parseHTML(sanitizedHtml)
  return sanitizedDocument
}

/**
 * Runs Readability over the page, first on the document as given and, if
 * that yields nothing, on a DOMPurify-sanitized copy of the raw HTML.
 *
 * @param url - page URL, forwarded to Readability
 * @param html - raw HTML used to build the sanitized fallback document
 * @param document - already-parsed document tried first
 * @param isNewsletter - forwarded to Readability as `keepTables`
 * @returns the first non-empty parse result, or null when both attempts
 * fail or throw
 */
const getReadabilityResult = async (
  url: string,
  html: string,
  document: Document,
  isNewsletter?: boolean
): Promise<Readability.ParseResult | null> => {
  // Candidates are thunks so the sanitized copy is only built when the
  // first attempt did not produce an article.
  const candidates: Array<() => Document> = [
    () => document,
    () => getPurifiedContent(html),
  ]

  for (const loadCandidate of candidates) {
    const candidate = loadCandidate()
    if (candidate) {
      try {
        const parsed = await new Readability(candidate, {
          debug: DEBUG_MODE,
          createImageProxyUrl,
          keepTables: isNewsletter,
          url,
        }).parse()

        if (parsed) {
          return parsed
        }
      } catch (error) {
        console.log('parsing error for url', url, error)
      }
    }
  }

  return null
}

/**
 * Re-renders every non-empty `<code>` element of `articleDom` with
 * highlight.js markup, replacing the original element in place.
 */
const highlightCodeBlocks = (articleDom: Document): void => {
  articleDom.querySelectorAll('code').forEach((block) => {
    if (!block.textContent) {
      return
    }

    const detected = hljs.highlightAuto(block.textContent)
    // Guard against an undetected language so the class list never
    // contains "language-undefined".
    const classNames = ['hljs']
    if (detected.language) {
      classNames.push(`language-${detected.language}`)
    }
    if (detected.second_best?.language) {
      classNames.push(`language-${detected.second_best.language}`)
    }

    // Create the replacement in the same document it is inserted into.
    const highlighted = articleDom.createElement('code')
    highlighted.setAttribute('class', classNames.join(' '))
    highlighted.innerHTML = detected.value
    block.replaceWith(highlighted)
  })
}

/**
 * Tags the elements below the document's top-level element with a
 * `data-omnivore-anchor-idx` attribute, numbered from 1 in depth-first
 * document order. Elements carrying a blocked attribute are skipped together
 * with their whole subtree.
 */
const assignAnchorIndexes = (articleDom: Document): void => {
  const blockedAttributes = [
    'omnivore-highlight-id',
    'data-twitter-tweet-id',
    'data-instagram-id',
  ]
  const isElement = (node: Node): node is Element => node.nodeType === 1

  const root = articleDom.firstElementChild
  const pending: Node[] = root ? [root] : []
  const visited: Element[] = []

  while (pending.length > 0) {
    const current = pending.pop()
    if (!current || !isElement(current)) {
      continue
    }
    // Avoiding dynamic elements from being counted as anchor-allowed elements
    if (blockedAttributes.some((attrib) => current.hasAttribute(attrib))) {
      continue
    }

    visited.push(current)
    // Push children in reverse so that pop() yields them in document order.
    Array.from(current.childNodes)
      .reverse()
      .forEach((child) => pending.push(child))
  }

  // The first visited node is the top-level element itself; it is not
  // indexed.
  visited.shift()
  visited.forEach((node, index) => {
    // start from index 1, index 0 reserved for anchor unknown.
    node.setAttribute('data-omnivore-anchor-idx', (index + 1).toString())
  })
}

/**
 * Parses a prepared (already fetched) document into a readable article.
 *
 * Steps: reject disallowed content types, run the content pre-parser and
 * Readability (unless `parseResult` is supplied), highlight code blocks,
 * extract embedded highlight data, assign anchor indexes, sanitize the
 * article HTML, and fill missing title/preview image/site name/byline from
 * the page's oEmbed link. A supplied `parseResult` object is updated in
 * place. Errors raised while parsing are logged and swallowed.
 *
 * @param url - URL of the page being parsed
 * @param preparedDocument - raw HTML plus page info for the page
 * @param parseResult - existing Readability result to reuse instead of
 * parsing again
 * @param isNewsletter - forwarded to Readability via `getReadabilityResult`
 * @param allowRetry - when true and no text content was extracted, parsing
 * is retried once with the HTML wrapped in `<html>` tags
 * @returns the original HTML, the parsed article (null if parsing failed),
 * the canonical URL, the page type and any embedded highlight data
 */
export const parsePreparedContent = async (
  url: string,
  preparedDocument: PreparedDocumentInput,
  parseResult?: Readability.ParseResult | null,
  isNewsletter?: boolean,
  allowRetry = true
): Promise<ParsedContentPuppeteer> => {
  const logRecord: ArticleParseLogRecord = {
    url: url,
    labels: { source: 'parsePreparedContent' },
  }

  // If we have a parse result, use it
  let article = parseResult || null
  let highlightData = undefined
  const { document, pageInfo } = preparedDocument

  // Checking for content type acceptance or if there are no contentType
  // at all (backward extension versions compatibility)
  if (
    pageInfo.contentType &&
    !ALLOWED_CONTENT_TYPES.includes(pageInfo.contentType)
  ) {
    console.log('Not allowed content type', pageInfo.contentType)
    return {
      canonicalUrl: url,
      parsedContent: null,
      domContent: preparedDocument.document,
      pageType: PageType.Unknown,
    }
  }

  let dom = parseHTML(document).document

  try {
    if (!article) {
      // Attempt to parse the article
      // preParse content
      const preParsedDom = await preParseContent(url, dom)
      if (preParsedDom) {
        dom = preParsedDom
      }

      article = await getReadabilityResult(url, document, dom, isNewsletter)
    }

    if (!article?.textContent && allowRetry) {
      // Retry once with the markup wrapped in <html>; the retry itself is
      // not allowed to retry again.
      const newDocument = {
        ...preparedDocument,
        document: '<html>' + preparedDocument.document + '</html>',
      }
      return parsePreparedContent(
        url,
        newDocument,
        parseResult,
        isNewsletter,
        false
      )
    }

    // Format code blocks
    // TODO: we probably want to move this type of thing
    // to the handlers, and have some concept of postHandle
    if (article?.content) {
      const articleDom = parseHTML(article.content).document

      highlightCodeBlocks(articleDom)
      highlightData = findEmbeddedHighlight(articleDom.documentElement)
      assignAnchorIndexes(articleDom)

      // Serialize once, after all DOM post-processing is done.
      article.content = articleDom.documentElement.outerHTML
    }

    if (article) {
      const newWindow = parseHTML('')
      const DOMPurify = createDOMPurify(newWindow)
      DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
      const clean = DOMPurify.sanitize(article.content || '', DOM_PURIFY_CONFIG)

      // Only hit the network for oEmbed metadata when Readability left at
      // least one of the fields it can fill empty.
      const needsFallback =
        !article.title ||
        !article.previewImage ||
        !article.siteName ||
        !article.byline
      const fallback: Partial<Readability.ParseResult> = needsFallback
        ? await getJSONLdLinkMetadata(dom)
        : {}

      Object.assign(article, {
        content: clean,
        title: article.title || fallback.title,
        previewImage: article.previewImage || fallback.previewImage,
        siteName: article.siteName || fallback.siteName,
        byline: article.byline || fallback.byline,
      })
    }
    logRecord.parseSuccess = true
  } catch (error) {
    console.log('Error parsing content', error)
    Object.assign(logRecord, {
      parseSuccess: false,
      parseError: error,
    })
  }

  const { title, canonicalUrl } = pageInfo

  // Last resort for the title: the one reported in the page info.
  if (article) {
    Object.assign(article, {
      title: article.title || title,
    })
  }

  // Emit the collected record so parse failures are visible in the logs.
  logger.info('parse-article completed', logRecord)

  return {
    domContent: preparedDocument.document,
    parsedContent: article,
    canonicalUrl,
    pageType: parseOriginalContent(dom),
    highlightData,
  }
}

/**
 * Looks for a `<link type="application/json+oembed">` element in the page,
 * fetches the URL it points at and extracts article metadata from the JSON
 * response.
 *
 * Example article: https://thoughtsofstone.com/the-great-feminization/
 *
 * Example link target: https://thoughtsofstone.com/wp-json/oembed/1.0/embed?url=https%3A%2F%2Fthoughtsofstone.com%2Fthe-great-feminization%2F
 *
 * @param document - parsed document of the page to read the link from
 * @returns byline, preview image, site name and title decoded from the
 * response; an empty object when no link is present, and whatever was
 * collected so far when the request or decoding fails (the failure is
 * logged as a warning)
 */
const getJSONLdLinkMetadata = async (
  document: Document
): Promise<Partial<Readability.ParseResult>> => {
  const result: Partial<Readability.ParseResult> = {}

  try {
    const oembedLink = document.querySelector<HTMLLinkElement>(
      "link[type='application/json+oembed']"
    )
    const oembedUrl = oembedLink?.href
    if (!oembedUrl) {
      return result
    }

    const response = await axios.get(oembedUrl, { timeout: 5000 })
    const payload = response.data || {}

    // Values are HTML-entity decoded before being stored.
    result.byline = decode(payload['author_name'])
    result.previewImage = decode(payload['thumbnail_url'])
    result.siteName = decode(payload['provider_name'])
    result.title = decode(payload['title'])
  } catch (error) {
    logger.warning(`Unable to get JSONLD link of the article`, error)
  }

  return result
}

/**
 * Page metadata returned by `parsePageMetadata`, read from Open Graph and
 * author meta tags.
 */
type Metadata = {
  // og:title content; undefined when missing or empty.
  title?: string
  // `meta[name='author']` content; undefined when missing or empty.
  author?: string
  // og:description content; empty string when missing.
  description: string
  // og:image content; empty string when missing.
  previewImage: string
}

/**
 * Extracts title, author, description and preview image from the meta tags
 * in the `<head>` of an HTML page.
 *
 * @param html - raw HTML of the page
 * @returns the metadata (description and preview image default to an empty
 * string, title and author to undefined), or undefined when parsing throws
 */
export const parsePageMetadata = (html: string): Metadata | undefined => {
  try {
    const { document } = parseHTML(html)
    const metaContent = (selector: string) =>
      document.querySelector(selector)?.getAttribute('content')

    // TODO: we should be able to apply the JSONLD metadata
    // here too

    // get open graph metadata
    return {
      title: metaContent("head meta[property='og:title']") || undefined,
      author: metaContent("head meta[name='author']") || undefined,
      description: metaContent("head meta[property='og:description']") || '',
      previewImage: metaContent("head meta[property='og:image']") || '',
    }
  } catch (e) {
    console.log('failed to parse page:', html, e)
    return undefined
  }
}

/**
 * Downloads a page and extracts its metadata with `parsePageMetadata`.
 *
 * @param url - address of the page to fetch
 * @returns the page metadata, or undefined when the request fails, times
 * out, or does not return a text body
 */
export const parseUrlMetadata = async (
  url: string
): Promise<Metadata | undefined> => {
  try {
    // Bound the request like the other axios calls in this module; without
    // a timeout a stalled server would keep the caller waiting indefinitely.
    const res = await axios.get(url, { timeout: 5000 })

    // axios may hand back an already-parsed object (e.g. a JSON body);
    // only text can be parsed as HTML.
    if (typeof res.data !== 'string') {
      console.log('failed to get:', url, 'response body is not text')
      return undefined
    }

    return parsePageMetadata(res.data)
  } catch (e) {
    console.log('failed to get:', url, e)
    return undefined
  }
}

/**
 * Decides whether an incoming email should be treated as an article.
 *
 * @param email - sender address; matched case-insensitively against the
 * users' email addresses
 * @param subject - email subject line
 * @returns true when a user with that email exists or the subject contains
 * the article prefix
 */
export const isProbablyArticle = async (
  email: string,
  subject: string
): Promise<boolean> => {
  // The address comes from an incoming email and is used as an ILIKE
  // pattern, so `%` and `_` would otherwise act as wildcards and could match
  // other users. Escape them; this relies on backslash being the LIKE escape
  // character (the PostgreSQL default).
  const emailPattern = email.replace(/[\\%_]/g, '\\$&')

  const user = await getRepository(User).findOneBy({
    email: ILike(emailPattern),
  })
  return !!user || subject.includes(ARTICLE_PREFIX)
}

/** Builds a placeholder URL by appending a fresh v4 UUID to the fake-URL prefix. */
export const generateUniqueUrl = (): string => `${FAKE_URL_PREFIX}${uuid()}`

/**
 * Derives an article title from an email subject by removing the first
 * occurrence of the article prefix and trimming surrounding whitespace.
 *
 * @param subject - email subject line
 * @returns the cleaned-up title
 */
export const getTitleFromEmailSubject = (subject: string): string =>
  subject.replace(ARTICLE_PREFIX, '').trim()

/**
 * Splits a `From` header value into display name and address, e.g.
 * 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
 * or 'Mike Allen <mike@axios.com>'.
 *
 * @param from - raw `From` header value
 * @returns the first parsed address; when nothing could be parsed, an entry
 * with an empty name and the raw input as address
 */
export const parseEmailAddress = (from: string): addressparser.EmailAddress => {
  const addresses = addressparser(from)
  return addresses.length === 0 ? { name: '', address: from } : addresses[0]
}

/**
 * Builds a faviconkit URL for the site a URL ultimately resolves to.
 *
 * @param url - page URL; it is requested with HEAD so redirects are followed
 * @returns the favicon URL for the final hostname, or undefined when the
 * request fails or the final URL cannot be determined
 */
export const fetchFavicon = async (
  url: string
): Promise<string | undefined> => {
  try {
    // get the correct url if it's a redirect
    const { request } = await axios.head(url, { timeout: 5000 })
    const { hostname } = new URL(request.res.responseUrl)
    return `https://api.faviconkit.com/${hostname}/128`
  } catch (e) {
    console.log('Error fetching favicon', e)
    return undefined
  }
}

// Shared HTML-to-Markdown converter. It is created once at module load and
// reused by every `htmlToMarkdown` call, since building an instance per call
// would waste time. The three arguments are, in order: options, custom
// transformers and custom code-block translators (all left at defaults).
const nhm = new NodeHtmlMarkdown({}, undefined, undefined)

/**
 * Converts an HTML string to Markdown using the shared converter instance.
 *
 * @param html - HTML markup to convert
 * @returns the Markdown produced by the converter
 */
export const htmlToMarkdown = (html: string) => nhm.translate(html)