// omnivore/packages/api/src/utils/highlightGenerator.ts

import { diff_match_patch as DiffMatchPatch } from 'diff-match-patch'
import { parseHTML } from 'linkedom'
import { nanoid } from 'nanoid'
import { v4 as uuidv4 } from 'uuid'
import { interpolationSearch } from './interpolationSearch'
import { logger } from './logger'

// Pseudo-tag that generateDiffPatch wraps around the highlighted text.
const highlightTag = 'omnivore_highlight'
export const maxHighlightLength = 2000
// Attribute set on every <span> that wraps highlighted text.
export const highlightIdAttribute = 'omnivore-highlight-id'

// Inline-level tags. Any element whose tag is NOT in this list marks the next
// collected text node as the start of a new paragraph.
const inlineTagNames = [
  'a',
  'b',
  'basefont',
  'bdo',
  'big',
  'em',
  'font',
  'i',
  's',
  'small',
  'span',
  'strike',
  'strong',
  'sub',
  'sup',
  'tt',
  'u',
  'code',
  'mark',
]
const nonParagraphTagsRegEx = new RegExp(`^(${inlineTagNames.join('|')})$`, 'i')

// Captures everything between the opening and the closing highlight tag.
const highlightContentRegex = new RegExp(
  '<' + highlightTag + '>([\\s\\S]*)<\\/' + highlightTag + '>',
  'i'
)

// Match_Distance / Match_Threshold used for the second, more lenient attempt
// at applying a highlight patch.
const maxDeepPatchDistance = 4000
const maxDeepPatchThreshhold = 0.5
// Upper bound on the length of a highlight's prefix / suffix text.
const maxSurroundingTextLength = 2000

// A text node collected from the DOM, together with its position in the
// concatenated article text.
type TextNode = {
  // Offset of this node's text inside the concatenated article text
  startIndex: number
  // The DOM text node itself
  node: Node
  // True when an element that is not an inline-level tag was visited since the
  // previously collected text node
  isParagraphStart?: boolean
}

// Result of collecting text nodes: the nodes plus the concatenation of their
// values.
type ArticleTextContent = {
  textNodes: TextNode[]
  articleText: string
}

// Data describing a highlight that was found embedded in a document
// (see findEmbeddedHighlight).
export type EmbeddedHighlightData = {
  // Text that precedes the highlight (see getSurroundingText)
  prefix: string
  // Text that follows the highlight (see getSurroundingText)
  suffix: string
  // Highlighted text, with '\n' inserted between paragraphs
  quote: string
  // UUID v4 generated for the highlight
  id: string
  // 8-character nanoid generated for the highlight
  shortId: string
  // diff-match-patch text that inserts the highlight tags into the article text
  patch: string
}

// Result of fillHighlight: how the text of a single node splits into
// highlighted and non-highlighted parts.
type FillNodeResponse = {
  // The text node that was split
  node: Node
  // Consecutive parts of the node's text, in document order
  textPartsToHighlight: {
    text: string
    highlight: boolean
  }[]
  // Copied from the TextNode entry the parts were computed for
  isParagraphStart?: boolean
}

/**
 * Walks the tree under `rootNode` depth-first and collects every text node
 * that contains non-whitespace characters, starting when `startNode` is
 * reached and stopping once `endNode` and all of its descendants have been
 * visited.
 *
 * Each collected node records its offset in the concatenated text and whether
 * an element that is not an inline-level tag was visited since the previous
 * collected node.
 *
 * @param rootNode - Node the traversal starts from
 * @param startNode - Collection begins when this node is visited
 * @param endNode - Traversal stops after this node's subtree was visited
 * @returns The collected text nodes and the concatenation of their values
 * @throws Error('getTextNodes Timeout') when the walk takes more than 60 seconds
 */
function getTextNodesBetween(rootNode: Node, startNode: Node, endNode: Node) {
  const TEXT_NODE_TYPE = 3
  const deadline = Date.now() + 1000 * 60 // 60 seconds from now

  const collected: TextNode[] = []
  let combinedText = ''
  let nextOffset = 0
  let paragraphPending = false
  let started = false
  let finished = false

  const record = (textNode: Node): void => {
    collected.push({
      node: textNode,
      startIndex: nextOffset,
      isParagraphStart: paragraphPending,
    })
    nextOffset += textNode.nodeValue?.length || 0
    combinedText += textNode.nodeValue
    // The paragraph flag only applies to the first text node after a boundary
    paragraphPending = false
  }

  const visit = (current: Node): void => {
    // Abort when the traversal exceeds its time budget
    if (Date.now() > deadline) {
      const error = new Error('getTextNodes Timeout')
      logger.error(error)
      throw error
    }

    if (!current) return

    if (current === startNode) started = true

    if (current.nodeType === TEXT_NODE_TYPE) {
      const isBlank = /^\s*$/.test(current.nodeValue || '')
      if (started && !finished && !isBlank) record(current)
    } else if (!nonParagraphTagsRegEx.test((current as Element).tagName)) {
      paragraphPending = true
    }

    const children = current.childNodes
    const childCount = children.length
    let index = 0
    while (!finished && index < childCount) {
      visit(children[index])
      index += 1
    }

    // Set only after the subtree was visited, so text inside endNode is included
    if (current === endNode) finished = true
  }

  visit(rootNode)

  return {
    textNodes: collected,
    articleText: combinedText,
  }
}

/**
 * Looks for a highlight embedded in `dom`, delimited by
 * `span[data-omnivore-highlight-start="true"]` and
 * `span[data-omnivore-highlight-end="true"]` marker elements, and derives the
 * highlight data (ids, quote, patch, prefix and suffix) from it.
 *
 * @param dom - Element that contains the article content and the markers
 * @returns The highlight data, or `undefined` when a marker is missing or
 * any step fails (the failure is logged)
 */
export const findEmbeddedHighlight = (
  dom: Element
): EmbeddedHighlightData | undefined => {
  const startMarker = dom.querySelector(
    'span[data-omnivore-highlight-start="true"]'
  )
  const endMarker = dom.querySelector(
    'span[data-omnivore-highlight-end="true"]'
  )

  if (!startMarker || !endMarker) {
    return undefined
  }

  try {
    // Text before the highlight, inside it, after it, and of the whole article
    const textBefore = getTextNodesBetween(dom, dom, startMarker)
    const textInside = getTextNodesBetween(dom, startMarker, endMarker)
    const textAfter = getTextNodesBetween(dom, endMarker, dom)
    const wholeArticle = getTextNodesBetween(dom, dom, dom)

    const patch = generateDiffPatch(
      wholeArticle,
      textBefore,
      textInside,
      textAfter
    )

    const id = uuidv4()
    const shortId = nanoid(8)
    const { prefix, suffix } = getPrefixAndSuffix(wholeArticle, patch)
    const quote = getQuoteText(textInside)

    return { id, shortId, quote, patch, prefix, suffix }
  } catch (error) {
    logger.error(error)
    return undefined
  }
}

/**
 * Builds the quote of a highlight by concatenating the text of its nodes,
 * inserting a newline before every node that starts a paragraph (except the
 * very first node).
 *
 * @param highlight - Text nodes that make up the highlight
 * @returns The quote text
 */
const getQuoteText = (highlight: ArticleTextContent): string => {
  const pieces = highlight.textNodes.map(
    ({ node, isParagraphStart }, position) => {
      const separator = isParagraphStart && position > 0 ? '\n' : ''
      return separator + node.textContent
    }
  )

  return pieces.join('')
}

/**
 * Creates a diff-match-patch patch that turns the plain article text into the
 * same text with the highlighted part wrapped in highlight tags.
 *
 * @param allArticleNodes - Text of the whole article (patch source)
 * @param beforeNodes - Text preceding the highlight
 * @param highlightNodes - Text of the highlight
 * @param afterNodes - Text following the highlight
 * @returns The patch in its textual form
 * @throws Error('Invalid patch') when the generated patch text is empty
 */
function generateDiffPatch(
  allArticleNodes: ArticleTextContent,
  beforeNodes: ArticleTextContent,
  highlightNodes: ArticleTextContent,
  afterNodes: ArticleTextContent
): string {
  const openingTag = '<' + highlightTag + '>'
  const closingTag = '</' + highlightTag + '>'
  const taggedText =
    beforeNodes.articleText +
    openingTag +
    highlightNodes.articleText +
    closingTag +
    afterNodes.articleText

  const dmp = new DiffMatchPatch()
  const patches = dmp.patch_make(allArticleNodes.articleText, taggedText)
  const patchText = dmp.patch_toText(patches)

  if (!patchText) {
    throw new Error('Invalid patch')
  }
  return patchText
}

/**
 * Resolves the highlight described by `patch` against the article text and
 * returns the text surrounding it together with the resolved offsets.
 *
 * @param articleTextNodes - Text nodes and concatenated text of the article
 * @param patch - Patch text produced for the highlight
 * @returns Prefix and suffix of the highlight, the highlight's start / end
 * offsets in the article text, the article's text nodes, and the index of the
 * text node located for the highlight start
 * @throws Error('Invalid patch') when `patch` is empty
 */
function getPrefixAndSuffix(
  articleTextNodes: ArticleTextContent,
  patch: string
): {
  prefix: string
  suffix: string
  highlightTextStart: number
  highlightTextEnd: number
  textNodes: TextNode[]
  textNodeIndex: number
} {
  if (!patch) {
    throw new Error('Invalid patch')
  }

  const { textNodes, articleText } = articleTextNodes
  const offsets = selectionOffsetsFromPatch(articleText, patch)
  const { highlightTextStart, highlightTextEnd } = offsets

  // Locates a text node for an offset using the interpolation search helper
  const locateTextNode = (offset: number) =>
    interpolationSearch(
      textNodes.map((entry) => entry.startIndex),
      offset
    )

  const textNodeIndex = locateTextNode(highlightTextStart)
  const endTextNodeIndex = locateTextNode(highlightTextEnd)

  const prefix = getSurroundingText({
    side: 'prefix',
    textNodes,
    startingTextNodeIndex: textNodeIndex,
    startingOffset: highlightTextStart - textNodes[textNodeIndex].startIndex,
  })
  const suffix = getSurroundingText({
    side: 'suffix',
    textNodes,
    startingTextNodeIndex: endTextNodeIndex,
    startingOffset: highlightTextEnd - textNodes[endTextNodeIndex].startIndex,
  })

  return {
    prefix,
    suffix,
    highlightTextStart,
    highlightTextEnd,
    textNodes,
    textNodeIndex,
  }
}

/**
 * Gets the part of text from the starting point to the paragraph boundary on
 * the specified side, limited to `maxSurroundingTextLength` characters.
 *
 * For `prefix` the text runs backwards from `startingOffset` to the start of
 * the paragraph (the nearest node flagged `isParagraphStart`, or the first
 * text node when no node is flagged). For `suffix` it runs forwards from
 * `startingOffset` until the next node that starts a paragraph, or the last
 * text node. When the paragraph text is longer than the limit, the characters
 * closest to the starting point are kept.
 *
 * @param param0 - Object that includes textNodes array, starting point and the
 * way of movement (prefix, suffix)
 * @returns String of text to fulfill the paragraph that surrounds the
 * highlight from either starting or ending point; an empty string when
 * `startingTextNodeIndex` does not refer to an existing text node
 */
const getSurroundingText = ({
  textNodes,
  startingTextNodeIndex,
  startingOffset,
  side,
}: {
  textNodes: TextNode[]
  startingTextNodeIndex: number
  startingOffset: number
  side: 'prefix' | 'suffix'
}): string => {
  const boundary = textNodes[startingTextNodeIndex]
  // Nothing surrounds a position that is not inside any collected text node
  if (!boundary) return ''

  const boundaryText = boundary.node.nodeValue || ''

  if (side === 'prefix') {
    let prefix = boundaryText.substring(0, startingOffset)

    if (!boundary.isParagraphStart) {
      // Walk backwards; stop early once enough text was gathered, because only
      // the last `maxSurroundingTextLength` characters are kept anyway.
      for (
        let i = startingTextNodeIndex - 1;
        prefix.length < maxSurroundingTextLength;
        i--
      ) {
        const previous = textNodes[i]
        // Ran past the first text node: treat it as the paragraph boundary
        if (!previous) break
        prefix = (previous.node.nodeValue || '') + prefix
        if (previous.isParagraphStart) break
      }
    }

    return prefix.length > maxSurroundingTextLength
      ? prefix.slice(prefix.length - maxSurroundingTextLength)
      : prefix
  }

  let suffix = boundaryText.substring(startingOffset)

  // Walk forwards until the next paragraph begins, the nodes run out, or
  // enough text was gathered.
  for (
    let i = startingTextNodeIndex + 1;
    suffix.length < maxSurroundingTextLength;
    i++
  ) {
    const following = textNodes[i]
    if (!following || following.isParagraphStart) break
    suffix += following.node.nodeValue || ''
  }

  return suffix.substring(0, maxSurroundingTextLength)
}

/**
 * Applies a highlight patch to the article text and derives the offsets of
 * the highlighted text from the position of the inserted highlight tags.
 *
 * The patch is first applied with the default diff-match-patch settings. When
 * any part of it fails to apply, it is applied again with the Match_Threshold
 * and Match_Distance given by `maxDeepPatchThreshhold` and
 * `maxDeepPatchDistance`. Every part of the patch has to apply, because the
 * opening and the closing tag may be carried by different parts and both are
 * needed to locate the highlight.
 *
 * @param articleText - Concatenated text of the article
 * @param patch - Patch text that inserts the highlight tags
 * @returns Start (inclusive) and end (exclusive) offsets of the highlight in
 * the article text, plus the regular expression match they were read from
 * @throws Error('Invalid patch') when `patch` is empty
 * @throws Error('Unable to find the highlight') when the patch cannot be
 * applied even with the lenient settings
 * @throws Error('Unable to find the highlight from patch') when the patched
 * text does not contain a pair of highlight tags
 */
const selectionOffsetsFromPatch = (
  articleText: string,
  patch: string
): {
  highlightTextStart: number
  highlightTextEnd: number
  matchingHighlightContent: RegExpExecArray
} => {
  if (!patch) throw new Error('Invalid patch')
  const dmp = new DiffMatchPatch()

  // Applies the patch to the whole article text with the current settings and
  // reports whether every part of it was applied.
  const applyPatch = (): { patchedText: string; applied: boolean } => {
    const [patchedText, results] = dmp.patch_apply(
      dmp.patch_fromText(patch),
      articleText
    )
    return {
      patchedText,
      applied: results.length > 0 && results.every(Boolean),
    }
  }

  let attempt = applyPatch()
  if (!attempt.applied) {
    // Retry with the lenient matching settings
    dmp.Match_Threshold = maxDeepPatchThreshhold
    dmp.Match_Distance = maxDeepPatchDistance
    attempt = applyPatch()
    if (!attempt.applied) {
      throw new Error('Unable to find the highlight')
    }
  }

  // Find the selection content in the patched text via regexp
  const matchingHighlightContent = highlightContentRegex.exec(
    attempt.patchedText
  )
  if (!matchingHighlightContent)
    throw new Error('Unable to find the highlight from patch')

  // The opening tag is inserted right where the highlight starts, so its
  // index in the patched text equals the offset in the original text.
  const highlightTextStart = matchingHighlightContent.index
  const highlightTextEnd =
    highlightTextStart + matchingHighlightContent[1].length
  return {
    highlightTextStart,
    highlightTextEnd,
    matchingHighlightContent,
  }
}

/**
 * Splits the text of one text node into the parts that lie before, inside and
 * after the highlight.
 *
 * The highlight offsets refer to the concatenated article text and are
 * converted to offsets within the node via the node's `startIndex`. The part
 * before and the part after the highlight are only emitted when they are not
 * empty; the highlighted part is always emitted.
 *
 * @param param0 - The article's text nodes, the index of the node to split,
 * and the highlight's start (inclusive) and end (exclusive) offsets
 * @returns The node, its text parts in document order, and the node's
 * paragraph-start flag
 */
const fillHighlight = ({
  textNodes,
  startingTextNodeIndex,
  highlightTextStart,
  highlightTextEnd,
}: {
  textNodes: TextNode[]
  startingTextNodeIndex: number
  highlightTextStart: number
  highlightTextEnd: number
}): FillNodeResponse => {
  const { node, startIndex, isParagraphStart } =
    textNodes[startingTextNodeIndex]
  const text = node.nodeValue || ''

  // Highlight bounds relative to this node. The start is negative when the
  // highlight began in an earlier node, and the end exceeds text.length when
  // it continues into a later one; substring() clamps both.
  const relativeStart = highlightTextStart - startIndex
  const relativeEnd = highlightTextEnd - startIndex

  const textPartsToHighlight: FillNodeResponse['textPartsToHighlight'] = []

  if (relativeStart > 0) {
    textPartsToHighlight.push({
      text: text.substring(0, relativeStart),
      highlight: false,
    })
  }

  textPartsToHighlight.push({
    text: text.substring(relativeStart, relativeEnd),
    highlight: true,
  })

  // Strictly less than: a highlight that ends exactly at the end of the node
  // leaves no trailing text, so no empty part is emitted.
  if (relativeEnd < text.length) {
    textPartsToHighlight.push({
      text: text.substring(relativeEnd),
      highlight: false,
    })
  }

  return {
    node,
    textPartsToHighlight,
    isParagraphStart,
  }
}

/**
 * Collects all non-whitespace text nodes of a document, starting from the
 * document's root node.
 *
 * @param document - Document to collect the text nodes from
 * @returns The text nodes and their concatenated text, or `null` when the
 * collection fails (the failure is logged)
 */
export function getArticleTextNodes(
  document: Document
): ArticleTextContent | null {
  let content: ArticleTextContent | null = null

  try {
    const root = document.getRootNode()
    // Using the root as both start and end covers the whole tree
    content = getTextNodesBetween(root, root, root)
  } catch (error) {
    logger.error(error)
  }

  return content
}

/**
 * Marks the highlight described by `patch` in the DOM that the given text
 * nodes belong to: every text node touched by the highlight is replaced by
 * its non-highlighted parts as plain text nodes and its highlighted part
 * wrapped in a `<span>` carrying the `highlightIdAttribute` attribute.
 *
 * The function mutates the tree of the passed text nodes and returns nothing.
 *
 * @param id - Value written to the highlight id attribute of every span
 * @param patch - Patch text that locates the highlight in the article text
 * @param articleTextNodes - Text nodes and concatenated text of the article
 * @throws Whatever `selectionOffsetsFromPatch` throws when the highlight
 * cannot be located
 */
export function makeHighlightNodeAttributes(
  id: string,
  patch: string,
  articleTextNodes: ArticleTextContent
): void {
  // New nodes are created by a fresh, empty document.
  // NOTE(review): they are then inserted into the article's tree; presumably
  // linkedom allows inserting nodes created by another document - confirm.
  const document = parseHTML('').document
  const textNodes = articleTextNodes.textNodes
  const { highlightTextStart, highlightTextEnd } = selectionOffsetsFromPatch(
    articleTextNodes.articleText,
    patch
  )

  // Searching for the starting text node using interpolation search algorithm
  let startingTextNodeIndex = interpolationSearch(
    textNodes.map(({ startIndex }) => startIndex),
    highlightTextStart
  )

  // Process every text node that starts before the end of the highlight
  while (
    startingTextNodeIndex < textNodes.length &&
    highlightTextEnd > textNodes[startingTextNodeIndex].startIndex
  ) {
    const { node, textPartsToHighlight } = fillHighlight({
      textNodes,
      startingTextNodeIndex,
      highlightTextStart,
      highlightTextEnd,
    })
    // Remember the insertion point before the original node is removed
    const { parentNode, nextSibling } = node

    parentNode?.removeChild(node)

    // Every part is inserted before the same sibling, which keeps the parts
    // in their original order.
    for (const { highlight, text } of textPartsToHighlight) {
      const newTextNode = document.createTextNode(text)

      if (highlight) {
        const newHighlightSpan = document.createElement('span')
        newHighlightSpan.setAttribute(highlightIdAttribute, id)
        newHighlightSpan.appendChild(newTextNode)
        parentNode?.insertBefore(newHighlightSpan, nextSibling)
      } else {
        parentNode?.insertBefore(newTextNode, nextSibling)
      }
    }

    startingTextNodeIndex++
  }
}