Files
omnivore/packages/pdf-handler/src/pdf.ts
2023-10-20 11:51:02 +05:30

213 lines
6.1 KiB
TypeScript

/* eslint-disable @typescript-eslint/no-unsafe-call */
/* eslint-disable @typescript-eslint/restrict-plus-operands */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-argument */
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
import {
TextItem,
PDFPageProxy,
PDFDocumentProxy,
} from 'pdfjs-dist/types/display/api'
/** Text extracted from a single PDF page. */
interface Page {
// One string per line of text, in the order parsePageItems emits them
// (highest y coordinate first).
lines: string[]
}
// Maximum number of characters of first-page text that getDocumentTitle takes
// when the document has no Title metadata and it falls back to page text.
const MAX_TITLE_LENGTH = 95
/** Names of the metadata info entries that getMetadataItem can look up. */
type MetadataInfoKey =
| 'Title'
| 'Author'
| 'Subject'
| 'CreationDate'
| 'ModDate'
/**
 * Shape that the `info` object returned by `document.getMetadata()` is cast to.
 * Every entry is optional; the property names mirror MetadataInfoKey.
 */
interface MetadataInfo {
Title?: string
Author?: string
CreationDate?: string
ModDate?: string
Subject?: string
}
/** Result of parsePdf: the document text plus optional metadata fields. */
interface ParsedPdf {
// Extracted document text with NUL characters removed.
content: string
// Set from the 'Title' metadata entry when it is non-empty.
title?: string
// Set from the 'Author' metadata entry when it is non-empty.
author?: string
// Set from the 'Subject' metadata entry when it is non-empty.
description?: string
}
/**
 * Loads the PDF at `url` and extracts its text together with basic metadata.
 *
 * NUL characters are stripped from the extracted text. `title`, `author` and
 * `description` are only set when the `Title`, `Author` and `Subject` metadata
 * entries are present and non-empty.
 *
 * The loading task is destroyed before returning (on success and on failure),
 * because the document is never handed to the caller and would otherwise keep
 * its resources alive.
 *
 * @param url Location of the PDF to load.
 * @returns The extracted text and whichever metadata fields were available.
 */
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
  const documentLoadingTask = _getDocument(url)
  try {
    const document = await documentLoadingTask.promise
    const text = await getDocumentText(document)
    // eslint-disable-next-line no-control-regex
    const result: ParsedPdf = { content: text.replace(/\x00/g, '') }
    const title = await getMetadataItem(document, 'Title')
    if (title) result.title = title
    const author = await getMetadataItem(document, 'Author')
    if (author) result.author = author
    const description = await getMetadataItem(document, 'Subject')
    if (description) result.description = description
    return result
  } finally {
    // Only plain strings leave this function, so the task (and the document
    // it loaded) can be released as soon as extraction is finished.
    await documentLoadingTask.destroy()
  }
}
/**
 * Starts loading a PDF and returns the promise for its document proxy.
 *
 * @param source Value handed unchanged to the pdf.js `getDocument` call.
 * @returns A promise that resolves with the loaded document.
 */
export const getDocument = (source: string): Promise<PDFDocumentProxy> =>
  _getDocument(source).promise
/**
 * Reads one entry from the document's metadata info object.
 *
 * @param document The loaded PDF document.
 * @param key Name of the info entry to read.
 * @returns The entry's value, or `undefined` when the entry is absent.
 */
const getMetadataItem = async (
  document: PDFDocumentProxy,
  key: MetadataInfoKey
): Promise<string | undefined> => {
  const metadata = await document.getMetadata()
  const info = metadata.info as MetadataInfo
  return info[key]
}
/**
 * Determines a title for the document.
 *
 * The `Title` metadata entry is used when it is non-empty. Otherwise the
 * title is derived from the first MAX_TITLE_LENGTH characters of the first
 * page's text, with line breaks turned into single spaces.
 *
 * @param document The loaded PDF document.
 * @returns The title, or `undefined` when neither the metadata nor the first
 *          page yields any text.
 */
export const getDocumentTitle = async (
  document: PDFDocumentProxy
): Promise<string | undefined> => {
  const title = await getMetadataItem(document, 'Title')
  if (title) {
    return title
  }
  // Attempt to grab the title from the first page.
  const pageText = await readPdfText(document, 1)
  // Extracted text is returned as lines joined with '\n'. Each run of line
  // breaks becomes one space so that words from adjacent lines are not glued
  // together; the trailing line break is trimmed away.
  const fallbackTitle = pageText
    .substring(0, MAX_TITLE_LENGTH)
    .replace(/\n+/g, ' ')
    .trim()
  return fallbackTitle.length ? fallbackTitle : undefined
}
/**
 * Extracts the text of every page of the document.
 *
 * @param document The loaded PDF document.
 * @returns The text of all pages as produced by readPdfText.
 */
export const getDocumentText = async (
  document: PDFDocumentProxy
): Promise<string> => readPdfText(document)
/**
 * Extracts text from the leading pages of the document.
 *
 * Pages are read one after another. Each page contributes its lines joined
 * with '\n', followed by one more '\n'.
 *
 * @param document The loaded PDF document.
 * @param maxPages Upper bound on the number of pages to read. When omitted
 *                 (or 0) every page is read. Values larger than the
 *                 document's page count are clamped to it.
 * @returns The concatenated text of the pages that were read.
 */
export const readPdfText = async (
  document: PDFDocumentProxy,
  maxPages: number | undefined = undefined
): Promise<string> => {
  // `||` is kept deliberately: a falsy limit (undefined or 0) means "all pages".
  const requestedPages = maxPages || document.numPages
  // Never ask for a page past the end of the document.
  const numPages = Math.min(requestedPages, document.numPages)
  const pageTexts: string[] = []
  for (let i = 0; i < numPages; i++) {
    // pdf.js page numbers start at 1.
    const page = await parsePage(await document.getPage(i + 1))
    pageTexts.push(page.lines.join('\n') + '\n')
  }
  return pageTexts.join('')
}
/**
 * Extracts the lines of text of a single page.
 *
 * Only content items that carry a `str` property are passed on to
 * parsePageItems; all other items are dropped.
 *
 * @param pdfPage The page to read.
 * @returns The page's text split into lines.
 */
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
  const { items } = await pdfPage.getTextContent()
  const textItems = items.filter((item): item is TextItem => 'str' in item)
  return parsePageItems(textItems)
}
/**
 * Parses individual text items generated by pdf.js into lines of text.
 *
 * Working on raw items allows lower level control of what actually gets
 * parsed. For example, a consumer of this function may remove entire sections
 * of the pdf text prior to passing items in here. See the parsePage function
 * for example usage.
 *
 * Items are grouped into lines by their y coordinate (`transform[5]`). Lines
 * are ordered from the highest y to the lowest, and the items of a line are
 * ordered by their x coordinate (`transform[4]`). Items with an empty `str`
 * are ignored. An empty line is inserted below a line when the distance to
 * the next line is at least twice the height of the line's tallest item, and
 * spaces are inserted between items that are far apart horizontally.
 *
 * @param pdfItems An array of TextItem items.
 * @returns The page text, one entry per line.
 */
const parsePageItems = (pdfItems: TextItem[]): Page => {
  // Group the items by y coordinate: every distinct y is one line.
  const lineData = new Map<number, TextItem[]>()
  for (const item of pdfItems) {
    const y: number = item.transform[5]
    const itemsAtY = lineData.get(y)
    if (itemsAtY) {
      itemsAtY.push(item)
    } else {
      lineData.set(y, [item])
    }
  }

  // b - a here because the bottom is y = 0 so we want that to be last
  const sortedYs = Array.from(lineData.keys()).sort((a, b) => b - a)

  // Walk the lines top to bottom and add a synthetic empty line wherever the
  // vertical gap to the next line is large compared to the current line.
  const yCoords: number[] = []
  sortedYs.forEach((currentY, index) => {
    yCoords.push(currentY)
    const nextY = sortedYs[index + 1]
    if (nextY === undefined) {
      return
    }
    const currentLineHeight: number = (lineData.get(currentY) || []).reduce(
      (tallest, current) =>
        tallest > current.height ? tallest : current.height,
      -1
    )
    // A line whose items all have height 0 gives no scale to measure the gap
    // against. Dividing by it would yield Infinity and the "empty line" would
    // land on currentY itself, wiping that line's items, so it is skipped.
    if (
      currentLineHeight > 0 &&
      // currentY - nextY because currentY will be higher than nextY
      Math.floor((currentY - nextY) / currentLineHeight) > 1
    ) {
      const emptyLineY = currentY - currentLineHeight
      lineData.set(emptyLineY, [])
      yCoords.push(emptyLineY)
    }
  })

  const lines = yCoords.map((y) => {
    // sort by x position (position in line)
    const lineItems = (lineData.get(y) || [])
      .sort((a, b) => a.transform[4] - b.transform[4])
      .filter((item) => !!item.str)
    let line = lineItems.length ? lineItems[0].str : ''
    for (let j = 1; j < lineItems.length; j++) {
      const item = lineItems[j]
      const lastItem = lineItems[j - 1]
      // horizontal distance between the end of the previous item and this one
      const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)
      // insert spaces for items that are far apart horizontally
      if (
        item.height !== 0 &&
        (xDiff > item.height || xDiff > lastItem.height)
      ) {
        const spaceCountA = Math.ceil(xDiff / item.height)
        let spaceCount = spaceCountA
        if (lastItem.height !== item.height) {
          const spaceCountB = Math.ceil(xDiff / lastItem.height)
          spaceCount = spaceCountA > spaceCountB ? spaceCountA : spaceCountB
        }
        // e.g. a previous item of height 0 makes the ratio infinite
        if (!Number.isFinite(spaceCount)) {
          spaceCount = 1
        }
        // A gap measuring n item heights is rendered as n - 1 spaces.
        line += ' '.repeat(Math.max(0, spaceCount - 1))
      }
      line += item.str
    }
    return line
  })

  return {
    lines,
  }
}