// Files: omnivore/packages/api/src/utils/highlightGenerator.ts
// 2024-04-23 21:28:02 +08:00
//
// 474 lines
// 12 KiB
// TypeScript

import { diff_match_patch as DiffMatchPatch } from 'diff-match-patch'
import { parseHTML } from 'linkedom'
import { nanoid } from 'nanoid'
import { v4 as uuidv4 } from 'uuid'
import { interpolationSearch } from './interpolationSearch'
import { logger } from './logger'
// Name of the pseudo tag wrapped around the highlighted text when the tagged
// article text is built (generateDiffPatch) and searched for
// (highlightContentRegex).
const highlightTag = 'omnivore_highlight'
// Exported length limit for a highlight; it is not referenced by the code
// visible in this file.
export const maxHighlightLength = 2000
// Attribute set on the <span> elements created around highlighted text in
// makeHighlightNodeAttributes.
export const highlightIdAttribute = 'omnivore-highlight-id'
// Tag names treated as inline. While walking the DOM, any non-text node whose
// tagName does not match this pattern flags the next collected text node as
// the start of a paragraph.
const nonParagraphTagsRegEx =
/^(a|b|basefont|bdo|big|em|font|i|s|small|span|strike|strong|su[bp]|tt|u|code|mark)$/i
// Captures everything between the opening and the closing highlight tag.
// The capture is greedy and the match is case-insensitive.
const highlightContentRegex = new RegExp(
`<${highlightTag}>([\\s\\S]*)<\\/${highlightTag}>`,
'i'
)
// Values assigned to diff-match-patch's Match_Distance and Match_Threshold
// for the retry performed in selectionOffsetsFromPatch.
const maxDeepPatchDistance = 4000
const maxDeepPatchThreshhold = 0.5
// Maximum number of characters kept for the prefix/suffix text produced by
// getSurroundingText.
const maxSurroundingTextLength = 2000
// A text node collected while walking the DOM.
type TextNode = {
// Offset of this node's text inside the concatenated article text
startIndex: number
// The DOM text node itself
node: Node
// True when a non-inline element was entered since the previously collected
// text node
isParagraphStart?: boolean
}
// Result of a DOM walk: the collected text nodes and their concatenated text.
type ArticleTextContent = {
textNodes: TextNode[]
articleText: string
}
// Data describing a highlight found through the start/end marker spans.
export type EmbeddedHighlightData = {
// Text preceding the highlight (see getSurroundingText)
prefix: string
// Text following the highlight (see getSurroundingText)
suffix: string
// Highlighted text, with '\n' inserted between paragraphs
quote: string
// UUID v4 generated for the highlight
id: string
// 8-character nanoid generated for the highlight
shortId: string
// diff-match-patch patch text that inserts the highlight tags into the
// article text
patch: string
}
// Result of splitting a single text node around the highlighted range.
type FillNodeResponse = {
// The text node that was split
node: Node
// Consecutive pieces of the node's text, each flagged as highlighted or not
textPartsToHighlight: {
text: string
highlight: boolean
}[]
// Copied from the TextNode entry of the node
isParagraphStart?: boolean
}
/**
 * Walks the DOM below `rootNode` depth-first and collects every text node
 * containing at least one non-whitespace character, from the moment
 * `startNode` is entered until `endNode` (including its descendants) has been
 * visited.
 *
 * @param rootNode - Node the traversal starts from
 * @param startNode - Text nodes are collected once this node has been reached
 * @param endNode - The traversal stops after this node has been fully visited
 * @returns The collected text nodes (each with its offset into the
 * concatenated text and a paragraph-start flag) and the concatenated text
 * @throws Error with message 'getTextNodes Timeout' when the walk takes
 * longer than 60 seconds
 */
function getTextNodesBetween(rootNode: Node, startNode: Node, endNode: Node) {
  const timeoutMs = 60 * 1000 // 60 seconds
  const startedAt = Date.now()
  const collected: TextNode[] = []
  let concatenatedText = ''
  let nextStartIndex = 0
  let paragraphPending = false
  let collecting = false
  let finished = false

  const visit = (current: Node): void => {
    // Abort instead of blocking the caller on pathological documents
    if (Date.now() - startedAt > timeoutMs) {
      const error = new Error('getTextNodes Timeout')
      logger.error(error)
      throw error
    }
    if (!current) {
      return
    }

    if (current === startNode) {
      collecting = true
    }

    if (current.nodeType === 3) {
      // nodeType 3 is a text node; whitespace-only nodes are skipped
      const isBlank = /^\s*$/.test(current.nodeValue || '')
      if (collecting && !finished && !isBlank) {
        collected.push({
          node: current,
          startIndex: nextStartIndex,
          isParagraphStart: paragraphPending,
        })
        nextStartIndex += current.nodeValue?.length || 0
        concatenatedText += current.nodeValue
        paragraphPending = false
      }
    } else if (!nonParagraphTagsRegEx.test((current as Element).tagName)) {
      // Entering a non-inline element: the next text node opens a paragraph
      paragraphPending = true
    }

    const childCount = current.childNodes.length
    let childIndex = 0
    while (!finished && childIndex < childCount) {
      visit(current.childNodes[childIndex])
      childIndex += 1
    }

    // Only flag the end after the end node's own subtree has been walked
    if (current === endNode) {
      finished = true
    }
  }

  visit(rootNode)

  return {
    textNodes: collected,
    articleText: concatenatedText,
  }
}
/**
 * Looks for a highlight delimited by the
 * `span[data-omnivore-highlight-start="true"]` and
 * `span[data-omnivore-highlight-end="true"]` marker elements inside `dom` and
 * builds the data describing it.
 *
 * @param dom - Element containing the article content and the marker spans
 * @returns The highlight data (fresh id and short id, quote, patch, prefix and
 * suffix), or `undefined` when a marker is missing or when any step throws
 * (the error is logged)
 */
export const findEmbeddedHighlight = (
  dom: Element
): EmbeddedHighlightData | undefined => {
  const startMarker = dom.querySelector(
    'span[data-omnivore-highlight-start="true"]'
  )
  const endMarker = dom.querySelector(
    'span[data-omnivore-highlight-end="true"]'
  )
  if (!dom || !startMarker || !endMarker) {
    return undefined
  }

  try {
    // Text before the highlight, inside it, after it, and of the whole article
    const before = getTextNodesBetween(dom, dom, startMarker)
    const highlighted = getTextNodesBetween(dom, startMarker, endMarker)
    const after = getTextNodesBetween(dom, endMarker, dom)
    const wholeArticle = getTextNodesBetween(dom, dom, dom)

    const patch = generateDiffPatch(wholeArticle, before, highlighted, after)
    const id = uuidv4()
    const shortId = nanoid(8)
    const { prefix, suffix } = getPrefixAndSuffix(wholeArticle, patch)
    const quote = getQuoteText(highlighted)

    return { id, shortId, quote, patch, prefix, suffix }
  } catch (error) {
    logger.error(error)
    return undefined
  }
}
/**
 * Concatenates the text content of the highlighted text nodes into the quote.
 * A line break is inserted in front of every node that starts a paragraph,
 * except for the very first node.
 *
 * @param highlight - Text nodes that make up the highlight
 * @returns The quote text
 */
const getQuoteText = (highlight: ArticleTextContent): string =>
  highlight.textNodes.reduce((quote, { node, isParagraphStart }, index) => {
    const separator = isParagraphStart && index > 0 ? '\n' : ''
    return quote + separator + node.textContent
  }, '')
/**
 * Builds the diff-match-patch patch text that turns the plain article text
 * into the same text with the highlighted part wrapped in highlight tags.
 *
 * @param allArticleNodes - Text of the whole article
 * @param beforeNodes - Text preceding the highlight
 * @param highlightNodes - Text of the highlight
 * @param afterNodes - Text following the highlight
 * @returns The patch serialized as text
 * @throws Error with message 'Invalid patch' when the patch text is empty
 */
function generateDiffPatch(
  allArticleNodes: ArticleTextContent,
  beforeNodes: ArticleTextContent,
  highlightNodes: ArticleTextContent,
  afterNodes: ArticleTextContent
): string {
  const openingTag = `<${highlightTag}>`
  const closingTag = `</${highlightTag}>`
  const taggedText =
    beforeNodes.articleText +
    openingTag +
    highlightNodes.articleText +
    closingTag +
    afterNodes.articleText

  const dmp = new DiffMatchPatch()
  const patches = dmp.patch_make(allArticleNodes.articleText, taggedText)
  const patchText = dmp.patch_toText(patches)
  if (!patchText) {
    throw new Error('Invalid patch')
  }
  return patchText
}
/**
 * Resolves the highlight described by `patch` inside the article text and
 * extracts the text surrounding it.
 *
 * @param articleTextNodes - Text nodes and concatenated text of the article
 * @param patch - Patch text that inserts the highlight tags into the article
 * @returns The prefix and suffix text, the highlight's start/end offsets in
 * the article text, the article's text nodes and the index of the text node
 * the highlight starts in
 * @throws Error when the patch is empty, when the highlight cannot be located
 * from the patch, or when no text node can be resolved for its boundaries
 */
function getPrefixAndSuffix(
  articleTextNodes: ArticleTextContent,
  patch: string
): {
  prefix: string
  suffix: string
  highlightTextStart: number
  highlightTextEnd: number
  textNodes: TextNode[]
  textNodeIndex: number
} {
  if (!patch) throw new Error('Invalid patch')

  const textNodes = articleTextNodes.textNodes
  const { highlightTextStart, highlightTextEnd } = selectionOffsetsFromPatch(
    articleTextNodes.articleText,
    patch
  )

  // Both lookups search the same list of offsets, so build it only once
  const startIndexes = textNodes.map(({ startIndex }) => startIndex)

  // Searching for the boundary text nodes using interpolation search algorithm
  const textNodeIndex = interpolationSearch(startIndexes, highlightTextStart)
  const endTextNodeIndex = interpolationSearch(startIndexes, highlightTextEnd)

  const startTextNode = textNodes[textNodeIndex]
  const endTextNode = textNodes[endTextNodeIndex]
  if (!startTextNode || !endTextNode) {
    // Fail with an explicit message rather than a property access on undefined
    throw new Error('Unable to find the text nodes of the highlight')
  }

  const prefix = getSurroundingText({
    textNodes,
    startingTextNodeIndex: textNodeIndex,
    startingOffset: highlightTextStart - startTextNode.startIndex,
    side: 'prefix',
  })
  const suffix = getSurroundingText({
    textNodes,
    startingTextNodeIndex: endTextNodeIndex,
    startingOffset: highlightTextEnd - endTextNode.startIndex,
    side: 'suffix',
  })

  return {
    prefix,
    suffix,
    highlightTextStart,
    highlightTextEnd,
    textNodes,
    textNodeIndex,
  }
}
/**
 * Gets the part of text from the starting point to the paragraph boundary on
 * the specified side.
 *
 * For `prefix` the text runs backwards from `startingOffset` to the nearest
 * text node flagged as a paragraph start (or to the first text node when none
 * is flagged). For `suffix` it runs forwards until the next text node that
 * starts a paragraph (or the last text node). Collection stops as soon as
 * `maxSurroundingTextLength` characters are available, and the result is cut
 * to that length, keeping the characters closest to the starting point.
 *
 * @param param0 - Object that includes the textNodes array, the index of the
 * text node to start from, the offset inside that node, and the direction of
 * movement (prefix, suffix)
 * @returns String of text that completes the paragraph surrounding the
 * highlight on the requested side
 * @throws Error when `startingTextNodeIndex` does not point at a text node
 */
const getSurroundingText = ({
  textNodes,
  startingTextNodeIndex,
  startingOffset,
  side,
}: {
  textNodes: TextNode[]
  startingTextNodeIndex: number
  startingOffset: number
  side: 'prefix' | 'suffix'
}): string => {
  const startingTextNode = textNodes[startingTextNodeIndex]
  if (!startingTextNode) {
    throw new Error('Invalid starting text node index')
  }
  const nodeText = startingTextNode.node.nodeValue || ''

  if (side === 'prefix') {
    let prefix = nodeText.substring(0, startingOffset)
    let reachedParagraphStart = Boolean(startingTextNode.isParagraphStart)
    let index = startingTextNodeIndex - 1
    // Prepend earlier nodes until the paragraph start is included, enough
    // text has been gathered, or there are no earlier nodes left
    while (
      !reachedParagraphStart &&
      prefix.length < maxSurroundingTextLength &&
      index >= 0
    ) {
      const { node, isParagraphStart } = textNodes[index]
      prefix = (node.nodeValue || '') + prefix
      reachedParagraphStart = Boolean(isParagraphStart)
      index -= 1
    }
    // Keep the characters adjacent to the highlight
    return prefix.length > maxSurroundingTextLength
      ? prefix.slice(prefix.length - maxSurroundingTextLength)
      : prefix
  }

  let suffix = nodeText.substring(startingOffset)
  let index = startingTextNodeIndex + 1
  // Append following nodes until the next paragraph begins, enough text has
  // been gathered, or there are no nodes left
  while (suffix.length < maxSurroundingTextLength) {
    const next = textNodes[index]
    if (!next || next.isParagraphStart) break
    suffix += next.node.nodeValue || ''
    index += 1
  }
  return suffix.substring(0, maxSurroundingTextLength)
}
/**
 * Locates the highlighted range inside the article text by applying the patch
 * (which inserts the highlight tags) and searching the patched text for the
 * tagged content.
 *
 * The patch is first applied with diff-match-patch's default matching
 * settings. When any of its hunks fails to apply, it is applied again with
 * the relaxed `maxDeepPatchThreshhold` / `maxDeepPatchDistance` settings.
 *
 * @param articleText - Concatenated text of the article
 * @param patch - Patch text that inserts the highlight tags
 * @returns The start and end offsets of the highlight within the article text
 * and the regular expression match they were derived from
 * @throws Error when the patch is empty, when the relaxed attempt cannot
 * apply the first hunk, or when no tagged content is found in the patched text
 */
const selectionOffsetsFromPatch = (
  articleText: string,
  patch: string
): {
  highlightTextStart: number
  highlightTextEnd: number
  matchingHighlightContent: RegExpExecArray
} => {
  if (!patch) throw new Error('Invalid patch')

  const dmp = new DiffMatchPatch()
  // patch_apply works on a deep copy, so the parsed patches can be reused
  const patches = dmp.patch_fromText(patch)

  // Applying a patch to the whole article text to find the selection content via regexp
  const [patchedText, hunkResults] = dmp.patch_apply(patches, articleText)

  let matchingHighlightContent: RegExpExecArray | null
  // Every hunk has to apply: the opening and the closing tag may live in
  // different hunks, and the regexp below needs both of them
  const appliedCleanly = hunkResults.length > 0 && hunkResults.every(Boolean)
  if (appliedCleanly) {
    matchingHighlightContent = highlightContentRegex.exec(patchedText)
  } else {
    dmp.Match_Threshold = maxDeepPatchThreshhold
    dmp.Match_Distance = maxDeepPatchDistance
    const [deeperPatchedText, deeperResults] = dmp.patch_apply(
      patches,
      articleText
    )
    if (!deeperResults[0]) {
      throw new Error('Unable to find the highlight')
    }
    matchingHighlightContent = highlightContentRegex.exec(deeperPatchedText)
  }

  if (!matchingHighlightContent)
    throw new Error('Unable to find the highlight from patch')

  // The text in front of the opening tag is untouched by the patch, so the
  // match index is also the offset inside the original article text
  const highlightTextStart = matchingHighlightContent.index
  const highlightTextEnd =
    highlightTextStart + matchingHighlightContent[1].length

  return {
    highlightTextStart,
    highlightTextEnd,
    matchingHighlightContent,
  }
}
/**
 * Splits the text of one text node into the pieces lying before, inside and
 * after the highlighted range.
 *
 * The highlight offsets are relative to the concatenated article text and are
 * converted to offsets inside the node using the node's `startIndex`. The
 * piece before the highlight is only emitted when the highlight starts inside
 * the node, and the piece after it only when text remains after the
 * highlight's end, so no empty non-highlighted pieces are produced.
 *
 * @param param0 - The text nodes, the index of the node to split, and the
 * highlight's start/end offsets within the article text
 * @returns The node, its text pieces flagged as highlighted or not, and the
 * node's paragraph-start flag
 */
const fillHighlight = ({
  textNodes,
  startingTextNodeIndex,
  highlightTextStart,
  highlightTextEnd,
}: {
  textNodes: TextNode[]
  startingTextNodeIndex: number
  highlightTextStart: number
  highlightTextEnd: number
}): FillNodeResponse => {
  const { node, startIndex, isParagraphStart } =
    textNodes[startingTextNodeIndex]
  const text = node.nodeValue || ''

  // Offsets of the highlight relative to the beginning of this node. The
  // start may be negative and the end may exceed the node's length when the
  // highlight spans several nodes; substring() clamps both.
  const highlightStartOffset = highlightTextStart - startIndex
  const highlightEndOffset = highlightTextEnd - startIndex

  const textPartsToHighlight: FillNodeResponse['textPartsToHighlight'] = []

  if (highlightStartOffset > 0) {
    textPartsToHighlight.push({
      text: text.substring(0, highlightStartOffset),
      highlight: false,
    })
  }

  textPartsToHighlight.push({
    text: text.substring(highlightStartOffset, highlightEndOffset),
    highlight: true,
  })

  // Strictly less than: a highlight ending exactly at the end of the node
  // leaves nothing behind, so no empty trailing piece is emitted
  if (highlightEndOffset < text.length) {
    textPartsToHighlight.push({
      text: text.substring(highlightEndOffset),
      highlight: false,
    })
  }

  return {
    node,
    textPartsToHighlight,
    isParagraphStart,
  }
}
/**
 * Collects all non-blank text nodes of a document together with their
 * concatenated text.
 *
 * @param document - Document to read the text nodes from
 * @returns The text nodes and the concatenated text, or `null` when the
 * traversal throws (the error is logged)
 */
export function getArticleTextNodes(
  document: Document
): ArticleTextContent | null {
  let content: ArticleTextContent | null = null
  try {
    const root = document.getRootNode()
    // Using the root as both start and end node covers the whole tree
    content = getTextNodesBetween(root, root, root)
  } catch (error) {
    logger.error(error)
  }
  return content
}
/**
 * Wraps the highlighted text of an article in `<span>` elements carrying the
 * highlight id attribute. The DOM that the given text nodes belong to is
 * modified in place: every text node touched by the highlight is removed and
 * replaced by plain text nodes (non-highlighted pieces) and spans
 * (highlighted pieces).
 *
 * @param id - Value written to the `highlightIdAttribute` of every span
 * @param patch - Patch text that locates the highlight in the article text
 * @param articleTextNodes - Text nodes and concatenated text of the article
 * @throws Error when the highlight cannot be located from the patch
 */
export function makeHighlightNodeAttributes(
  id: string,
  patch: string,
  articleTextNodes: ArticleTextContent
) {
  // Separate empty document used only as a factory for the new nodes
  const document = parseHTML('').document
  const textNodes = articleTextNodes.textNodes
  const { highlightTextStart, highlightTextEnd } = selectionOffsetsFromPatch(
    articleTextNodes.articleText,
    patch
  )

  // Searching for the starting text node using interpolation search algorithm
  const firstTextNodeIndex = interpolationSearch(
    textNodes.map((textNode) => textNode.startIndex),
    highlightTextStart
  )

  // NOTE: the quote is accumulated here but is not returned to the caller
  let quote = ''

  for (
    let textNodeIndex = firstTextNodeIndex;
    textNodeIndex < textNodes.length &&
    highlightTextEnd > textNodes[textNodeIndex].startIndex;
    textNodeIndex++
  ) {
    const { node, textPartsToHighlight, isParagraphStart } = fillHighlight({
      textNodes,
      startingTextNodeIndex: textNodeIndex,
      highlightTextStart,
      highlightTextEnd,
    })
    const { parentNode, nextSibling } = node
    // check if the node is a <pre> tag
    const isPre = node.parentElement?.tagName === 'PRE'

    // The original node is replaced by the pieces inserted below; inserting
    // each of them before the same sibling keeps their order
    parentNode?.removeChild(node)

    for (const [partIndex, part] of textPartsToHighlight.entries()) {
      const rawText = part.text
      const newTextNode = document.createTextNode(rawText)

      if (!part.highlight) {
        parentNode?.insertBefore(newTextNode, nextSibling)
        continue
      }

      // If we are not in preformatted text, prevent hardcoded \n,
      // new lines are created from the paragraph-start data instead
      const text = isPre ? rawText : rawText.replace(/\n/g, '')
      if (text) {
        if (isParagraphStart && partIndex === 0 && quote) {
          quote += '\n'
        }
        quote += text
      }

      const highlightSpan = document.createElement('span')
      highlightSpan.setAttribute(highlightIdAttribute, id)
      highlightSpan.appendChild(newTextNode)
      parentNode?.insertBefore(highlightSpan, nextSibling)
    }
  }
}