/* eslint-disable @typescript-eslint/no-unsafe-call */
/* eslint-disable @typescript-eslint/restrict-plus-operands */
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
/* eslint-disable @typescript-eslint/no-unsafe-argument */
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
import {
  TextItem,
  PDFPageProxy,
  PDFDocumentProxy,
} from 'pdfjs-dist/types/display/api'

/** Text of a single PDF page, one entry per visual line, ordered top to bottom. */
interface Page {
  lines: string[]
}

// Maximum number of characters taken from the first page's text when a title
// has to be derived because the document metadata does not provide one.
const MAX_TITLE_LENGTH = 95

/** The subset of the PDF "info" dictionary this module reads. */
interface MetadataInfo {
  Title?: string
  Author?: string
  CreationDate?: string
  ModDate?: string
  Subject?: string
}

type MetadataInfoKey = keyof MetadataInfo

/** Result of {@link parsePdf}: the extracted text plus whatever metadata was present. */
interface ParsedPdf {
  content: string
  title?: string
  author?: string
  description?: string
}

/**
 * Loads the PDF at `url` and extracts its full text together with the
 * Title / Author / Subject metadata entries.
 *
 * Metadata fields that are missing or empty are left off the result.
 *
 * @param url Location of the PDF, handed to pdf.js `getDocument` unchanged.
 * @returns The document text (NUL characters removed) and optional metadata.
 * @throws Whatever pdf.js rejects with when the document cannot be loaded or read.
 */
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
  const documentLoadingTask = _getDocument(url)
  try {
    const document = await documentLoadingTask.promise
    const text = await getDocumentText(document)

    // eslint-disable-next-line no-control-regex
    const result: ParsedPdf = { content: text.replace(/\x00/g, '') }

    // Read the metadata once instead of once per field.
    const metadata = await document.getMetadata()
    const { Title, Author, Subject } = metadata.info as MetadataInfo
    if (Title) result.title = Title
    if (Author) result.author = Author
    if (Subject) result.description = Subject

    return result
  } finally {
    // The document proxy never leaves this function, so release it here.
    // Per the pdf.js API, destroy() aborts pending requests and tears down
    // the worker backing this loading task.
    await documentLoadingTask.destroy()
  }
}

/**
 * Thin wrapper around pdf.js `getDocument` that resolves to the document proxy.
 * The caller owns the returned proxy.
 *
 * @param source Passed straight to pdf.js `getDocument`.
 */
export const getDocument = (source: string): Promise<PDFDocumentProxy> => {
  const documentLoadingTask = _getDocument(source)
  return documentLoadingTask.promise
}

/**
 * Reads one entry from the document's metadata "info" dictionary.
 *
 * @returns The entry's value, or `undefined` when the entry is absent.
 */
const getMetadataItem = async (
  document: PDFDocumentProxy,
  key: MetadataInfoKey
): Promise<string | undefined> => {
  const metadata = await document.getMetadata()
  return (metadata.info as MetadataInfo)[key]
}

/**
 * Returns the document title: the metadata Title when it is non-empty,
 * otherwise the first {@link MAX_TITLE_LENGTH} characters of the first page's
 * text flattened onto a single line.
 *
 * @returns The title, or `undefined` when neither source yields any text.
 */
export const getDocumentTitle = async (
  document: PDFDocumentProxy
): Promise<string | undefined> => {
  const title = await getMetadataItem(document, 'Title')
  if (title) {
    return title
  }

  // Attempt to grab the title from the first page. Extracted text comes back
  // as newline-joined lines, so the line breaks are replaced with spaces to
  // keep words on adjacent lines from being glued together.
  const pageText = await readPdfText(document, 1)
  const derivedTitle = pageText
    .substring(0, MAX_TITLE_LENGTH)
    .split('\n')
    .join(' ')
    .trim()

  return derivedTitle.length ? derivedTitle : undefined
}

/** Extracts the text of every page of `document`. See {@link readPdfText}. */
export const getDocumentText = async (
  document: PDFDocumentProxy
): Promise<string> => {
  return readPdfText(document)
}

/**
 * Extracts text from the leading pages of `document`.
 *
 * Each page contributes its lines joined with `\n`, followed by one trailing `\n`.
 *
 * @param maxPages Upper bound on the number of pages to read. When omitted
 *   (or 0) every page is read. Values larger than the document's page count
 *   are clamped so that no non-existent page is requested.
 */
export const readPdfText = async (
  document: PDFDocumentProxy,
  maxPages: number | undefined = undefined
): Promise<string> => {
  const pageCount = maxPages
    ? Math.min(maxPages, document.numPages)
    : document.numPages

  let text = ''
  // Pages are read one at a time, in order; pdf.js page numbers are 1-based.
  for (let i = 0; i < pageCount; i++) {
    const page = await parsePage(await document.getPage(i + 1))
    text += page.lines.join('\n') + '\n'
  }
  return text
}

/** Extracts the lines of one page, ignoring content items that carry no `str`. */
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
  const rawContent = await pdfPage.getTextContent()
  return parsePageItems(
    rawContent.items.filter((item): item is TextItem => 'str' in item)
  )
}

/**
 * Number of spaces to put between two horizontally adjacent items of a line.
 *
 * A gap is only padded when it is wider than the height of either item; the
 * padding then grows with the gap measured in item heights. When the ratio is
 * not a finite number (for example the previous item has height 0) no padding
 * is added.
 */
const countGapSpaces = (lastItem: TextItem, item: TextItem): number => {
  const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)
  const isFarApart =
    item.height !== 0 && (xDiff > item.height || xDiff > lastItem.height)
  if (!isFarApart) {
    return 0
  }

  let spaceCount = Math.ceil(xDiff / item.height)
  if (lastItem.height !== item.height) {
    spaceCount = Math.max(spaceCount, Math.ceil(xDiff / lastItem.height))
  }
  if (!Number.isFinite(spaceCount)) {
    spaceCount = 1
  }

  // `spaceCount` counts the slots around the gap, so it yields one space fewer.
  return Math.max(spaceCount - 1, 0)
}

/** Joins the items sharing one y coordinate into a single line, left to right. */
const joinLineItems = (items: TextItem[]): string => {
  const lineItems = [...items]
    // sort by x position (position in line)
    .sort((a, b) => a.transform[4] - b.transform[4])
    .filter((item) => !!item.str)

  let line = ''
  lineItems.forEach((item, index) => {
    if (index > 0) {
      line += ' '.repeat(countGapSpaces(lineItems[index - 1], item))
    }
    line += item.str
  })
  return line
}

/**
 * Parses individual text items generated by pdf.js. This allows lower level control of what
 * actually gets parsed. For example, a consumer of this function may remove entire sections of
 * the pdf text prior to passing items in here. See the parsePage function above for example usage.
 *
 * Items are grouped into lines by their y coordinate (`transform[5]`), lines are ordered from the
 * top of the page down, and one empty line is inserted wherever the distance to the next line is
 * at least twice the height of the tallest item on the upper line.
 *
 * @param pdfItems An array of TextItem items.
 * @returns The page's lines, top to bottom.
 */
const parsePageItems = (pdfItems: TextItem[]): Page => {
  const itemsByY = new Map<number, TextItem[]>()
  for (const item of pdfItems) {
    const y: number = item.transform[5]
    const lineItems = itemsByY.get(y)
    if (lineItems) {
      lineItems.push(item)
    } else {
      itemsByY.set(y, [item])
    }
  }

  // b - a here because the bottom is y = 0 so we want that to be last
  const yCoords = [...itemsByY.keys()].sort((a, b) => b - a)

  const lines: string[] = []
  yCoords.forEach((currentY, index) => {
    const lineItems = itemsByY.get(currentY) ?? []
    lines.push(joinLineItems(lineItems))

    const nextY = yCoords[index + 1]
    if (nextY === undefined) {
      return
    }

    const currentLineHeight = lineItems.reduce(
      (tallest, item) => (tallest > item.height ? tallest : item.height),
      0
    )
    // A line made only of zero-height items has no usable height; skipping it
    // avoids dividing by zero, which used to erase the line's own items.
    // currentY - nextY because currentY will be higher than nextY
    if (
      currentLineHeight > 0 &&
      Math.floor((currentY - nextY) / currentLineHeight) > 1
    ) {
      lines.push('')
    }
  })

  return {
    lines,
  }
}