omnivore/packages/pdf-handler/src/pdf.ts

/* eslint-disable @typescript-eslint/no-unsafe-call */
/* eslint-disable @typescript-eslint/restrict-plus-operands */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-argument */
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
import {
  TextItem,
  PDFPageProxy,
  PDFDocumentProxy,
} from 'pdfjs-dist/types/display/api'

// Text reconstructed from a single PDF page (the value produced by parsePageItems).
interface Page {
  // Lines of the page in output order: top of the page first
  lines: string[]
}

// Maximum number of characters of first-page text that getDocumentTitle keeps
// when it has to derive a fallback title from the page content
const MAX_TITLE_LENGTH = 95

// Names of the metadata info entries that getMetadataItem can look up.
type MetadataInfoKey =
  | 'Title'
  | 'Author'
  | 'Subject'
  | 'CreationDate'
  | 'ModDate'

// Shape that the `info` object returned by PDFDocumentProxy.getMetadata() is cast to.
// Every entry is optional: a document may omit any of them.
interface MetadataInfo {
  Title?: string
  Author?: string
  CreationDate?: string
  ModDate?: string
  Subject?: string
}

// Result of parsePdf.
interface ParsedPdf {
  // Extracted text of the whole document, with NUL characters removed
  content: string
  // 'Title' metadata entry; only set when the document provides a non-empty value
  title?: string
  // 'Author' metadata entry; only set when the document provides a non-empty value
  author?: string
  // 'Subject' metadata entry; only set when the document provides a non-empty value
  description?: string
}

/**
 * Loads the PDF at `url` and extracts its text together with the Title,
 * Author and Subject metadata entries.
 *
 * The loading task is destroyed once parsing has finished (or failed) so the
 * resources pdf.js holds for the document are released instead of lingering
 * until garbage collection.
 *
 * @param url Location of the PDF document to load.
 * @returns The document text (NUL characters stripped) plus any non-empty
 *   title / author / description found in the metadata.
 */
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
  const documentLoadingTask = _getDocument(url)

  try {
    const document = await documentLoadingTask.promise

    const text = await getDocumentText(document)
    // eslint-disable-next-line no-control-regex
    const result: ParsedPdf = { content: text.replace(/\x00/g, '') }

    const title = await getMetadataItem(document, 'Title')
    if (title) result.title = title

    const author = await getMetadataItem(document, 'Author')
    if (author) result.author = author

    const description = await getMetadataItem(document, 'Subject')
    if (description) result.description = description

    return result
  } finally {
    // The document is not handed to the caller, so it is safe to tear it down here
    await documentLoadingTask.destroy()
  }
}

/**
 * Starts loading a PDF through pdf.js and resolves with the document proxy.
 *
 * @param source Value handed unchanged to the pdf.js `getDocument` call.
 * @returns Promise of the loaded document.
 */
export const getDocument = (source: string): Promise<PDFDocumentProxy> =>
  _getDocument(source).promise

/**
 * Reads a single entry from the document's metadata info dictionary.
 *
 * @param document Loaded PDF document.
 * @param key Name of the info entry to read.
 * @returns The entry's value, or undefined when the document does not define it.
 */
const getMetadataItem = async (
  document: PDFDocumentProxy,
  key: MetadataInfoKey
): Promise<string | undefined> => {
  const { info } = await document.getMetadata()
  const entries = info as MetadataInfo
  return entries[key]
}

/**
 * Determines a title for the document.
 *
 * The 'Title' metadata entry wins when it is present and non-empty. Otherwise
 * the title is derived from the beginning of the first page's text.
 *
 * @param document Loaded PDF document.
 * @returns The title, or undefined when neither the metadata nor the first
 *   page yields any non-blank text.
 */
export const getDocumentTitle = async (
  document: PDFDocumentProxy
): Promise<string | undefined> => {
  const title = await getMetadataItem(document, 'Title')
  if (title) {
    return title
  }

  // Attempt to grab the title from the first page. The extracted text is
  // returned as lines joined by line breaks, so each run of line breaks is
  // replaced with a single space to keep the words of adjacent lines apart.
  const pageText = await readPdfText(document, 1)
  const fallbackTitle = pageText
    .substring(0, MAX_TITLE_LENGTH)
    .replace(/\n+/g, ' ')
    .trim()

  // A page holding only blank text gives no usable title
  return fallbackTitle || undefined
}

/**
 * Extracts the text of every page of the document.
 *
 * @param document Loaded PDF document.
 * @returns The text produced by readPdfText when no page limit is given.
 */
export const getDocumentText = async (
  document: PDFDocumentProxy
): Promise<string> => readPdfText(document)

/**
 * Extracts the text of the document's pages, starting at the first page.
 *
 * Pages are read one after another. Each page contributes its lines joined
 * by '\n' and followed by a trailing '\n'.
 *
 * @param document Loaded PDF document.
 * @param maxPages Upper bound on the number of pages to read. When omitted
 *   (or 0) every page is read. Values larger than the document's page count
 *   are clamped so that no non-existent page is requested.
 * @returns The concatenated text of the pages that were read.
 */
export const readPdfText = async (
  document: PDFDocumentProxy,
  maxPages: number | undefined = undefined
): Promise<string> => {
  const totalPages = document.numPages
  const numPages = maxPages ? Math.min(maxPages, totalPages) : totalPages

  const pages: Page[] = []
  for (let i = 0; i < numPages; i++) {
    // getPage is called with 1-based page numbers
    pages.push(await parsePage(await document.getPage(i + 1)))
  }

  return pages.map((page) => page.lines.join('\n') + '\n').join('')
}

/**
 * Fetches the text content of a page and turns it into lines of text.
 *
 * Only content entries that carry a `str` property are forwarded to
 * parsePageItems; every other entry is skipped.
 *
 * @param pdfPage Page to extract the text from.
 * @returns The reconstructed lines of the page.
 */
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
  const { items } = await pdfPage.getTextContent()

  const textItems: TextItem[] = []
  for (const entry of items) {
    if ('str' in entry) {
      textItems.push(entry)
    }
  }

  return parsePageItems(textItems)
}

/**
 * Rebuilds a page's lines of text from the individual text items generated by pdf.js. Taking the
 * items directly allows lower level control of what actually gets parsed: a consumer of this
 * function may remove entire sections of the pdf text prior to passing items in here. See the
 * parsePage function above for example usage.
 *
 * Items that share the same vertical position (transform[5]) form one line. Lines are emitted
 * from the top of the page down, and the items of a line from left to right (transform[4]).
 * An empty line is inserted where two consecutive lines are at least two line-heights apart,
 * and padding spaces are inserted between items that are far apart horizontally.
 *
 * @param pdfItems An array of TextItem items.
 * @returns The reconstructed lines of the page.
 */
const parsePageItems = (pdfItems: TextItem[]): Page => {
  // Group the items by the vertical position of their line
  const lineData = new Map<number, TextItem[]>()
  for (const item of pdfItems) {
    const y = item.transform[5]
    const bucket = lineData.get(y)
    if (bucket) {
      bucket.push(item)
    } else {
      lineData.set(y, [item])
    }
  }

  // b - a here because the bottom is y = 0 so we want that to be last
  const sortedYs = Array.from(lineData.keys()).sort((a, b) => b - a)

  // Walk the lines top-down, adding an empty line wherever the gap to the next
  // line is at least twice the height of the current (upper) line
  const yCoords: number[] = []
  sortedYs.forEach((currentY, index) => {
    yCoords.push(currentY)

    const nextY = sortedYs[index + 1]
    if (nextY === undefined) {
      return
    }

    const currentLineHeight: number = (lineData.get(currentY) || []).reduce(
      (tallest, item) => (tallest > item.height ? tallest : item.height),
      -1
    )

    // A line made only of zero-height items has no usable height: dividing by it
    // would give Infinity and the "new" line would land on currentY itself,
    // wiping the text of the current line. Only insert a gap for positive heights.
    if (
      currentLineHeight > 0 &&
      // currentY - nextY because currentY will be higher than nextY
      Math.floor((currentY - nextY) / currentLineHeight) > 1
    ) {
      const newY = currentY - currentLineHeight
      lineData.set(newY, [])
      yCoords.push(newY)
    }
  })

  const lines = yCoords.map((y) => {
    // sort by x position (position in line) and drop items without text
    const lineItems = (lineData.get(y) || [])
      .sort((a, b) => a.transform[4] - b.transform[4])
      .filter((item) => !!item.str)

    let line = ''
    lineItems.forEach((item, j) => {
      if (j > 0) {
        const lastItem = lineItems[j - 1]
        const xDiff =
          item.transform[4] - (lastItem.transform[4] + lastItem.width)

        // insert spaces for items that are far apart horizontally
        if (
          item.height !== 0 &&
          (xDiff > item.height || xDiff > lastItem.height)
        ) {
          // The gap is measured in item heights; when the two items differ in
          // height the larger of the two counts is used
          let spaceCount = Math.ceil(xDiff / item.height)
          if (lastItem.height !== item.height) {
            const spaceCountB = Math.ceil(xDiff / lastItem.height)
            spaceCount = spaceCount > spaceCountB ? spaceCount : spaceCountB
          }

          // e.g. a zero-height previous item makes the count infinite
          if (!Number.isFinite(spaceCount)) {
            spaceCount = 1
          }

          // A gap of n heights is padded with n - 1 spaces (never fewer than none)
          line += ' '.repeat(Math.max(spaceCount - 1, 0))
        }
      }
      line += item.str
    })
    return line
  })

  return {
    lines,
  }
}