213 lines
6.1 KiB
TypeScript
213 lines
6.1 KiB
TypeScript
/* eslint-disable @typescript-eslint/no-unsafe-call */
|
|
/* eslint-disable @typescript-eslint/restrict-plus-operands */
|
|
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
|
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
|
/* eslint-disable @typescript-eslint/no-unsafe-argument */
|
|
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
|
|
import {
|
|
TextItem,
|
|
PDFPageProxy,
|
|
PDFDocumentProxy,
|
|
} from 'pdfjs-dist/types/display/api'
|
|
|
|
/**
 * Text extracted from a single PDF page.
 */
interface Page {
  /** The page's text lines, in the order produced by `parsePageItems`. */
  lines: string[]
}
|
|
|
|
// Maximum number of characters taken from the start of the first page's text
// when a document has no `Title` metadata and `getDocumentTitle` falls back to
// page content.
const MAX_TITLE_LENGTH = 95
|
|
|
|
/**
 * Names of the metadata `info` entries that can be looked up through
 * `getMetadataItem`.
 */
type MetadataInfoKey =
  | 'Title'
  | 'Author'
  | 'Subject'
  | 'CreationDate'
  | 'ModDate'
|
|
|
|
/**
 * Shape this module assumes for the `info` object returned by
 * `PDFDocumentProxy.getMetadata()`. Every entry is optional because a
 * document may omit any of them.
 */
interface MetadataInfo {
  Title?: string
  Author?: string
  CreationDate?: string
  ModDate?: string
  Subject?: string
}
|
|
|
|
/**
 * Result returned by `parsePdf`.
 */
interface ParsedPdf {
  /** Extracted document text with NUL characters removed. */
  content: string
  /** Value of the `Title` metadata entry, when present and non-empty. */
  title?: string
  /** Value of the `Author` metadata entry, when present and non-empty. */
  author?: string
  /** Value of the `Subject` metadata entry, when present and non-empty. */
  description?: string
}
|
|
|
|
/**
 * Loads the PDF at `url` and extracts its text plus basic metadata.
 *
 * The returned `content` is the full document text with NUL characters
 * stripped. `title`, `author` and `description` are only set when the
 * corresponding `Title`, `Author` and `Subject` metadata entries are present
 * and non-empty.
 *
 * The document is opened and owned by this function, so the loading task is
 * always destroyed before returning (or throwing) to release pdf.js resources.
 *
 * @param url Location of the PDF to load.
 * @returns The extracted text and metadata.
 */
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
  const documentLoadingTask = _getDocument(url)

  try {
    const document = await documentLoadingTask.promise

    const text = await getDocumentText(document)
    // eslint-disable-next-line no-control-regex
    const result: ParsedPdf = { content: text.replace(/\x00/g, '') }

    // Fetch the metadata once and read every field from the same object
    // rather than issuing one getMetadata() call per field.
    const metadata = await document.getMetadata()
    const info = metadata.info as MetadataInfo

    if (info.Title) result.title = info.Title
    if (info.Author) result.author = info.Author
    if (info.Subject) result.description = info.Subject

    return result
  } finally {
    // The document never leaves this function, so it is safe to tear it down.
    await documentLoadingTask.destroy()
  }
}
|
|
|
|
/**
 * Starts loading a PDF through pdf.js and returns the promise for the loaded
 * document.
 *
 * @param source Passed unchanged to pdf.js `getDocument`.
 * @returns A promise that resolves with the document proxy.
 */
export const getDocument = (source: string): Promise<PDFDocumentProxy> =>
  _getDocument(source).promise
|
|
|
|
/**
 * Reads a single entry from the document's metadata `info` object.
 *
 * @param document The loaded PDF document.
 * @param key Name of the metadata entry to read.
 * @returns The entry's value, or `undefined` when the document does not have it.
 */
const getMetadataItem = async (
  document: PDFDocumentProxy,
  key: MetadataInfoKey
): Promise<string | undefined> => {
  const metadata = await document.getMetadata()
  const info = metadata.info as MetadataInfo
  return info[key]
}
|
|
|
|
/**
 * Determines a title for the document.
 *
 * The `Title` metadata entry is used when it is present and non-empty.
 * Otherwise the title is derived from the first `MAX_TITLE_LENGTH` characters
 * of the first page's text.
 *
 * @param document The loaded PDF document.
 * @returns The title, or `undefined` when neither source yields any text.
 */
export const getDocumentTitle = async (
  document: PDFDocumentProxy
): Promise<string | undefined> => {
  const title = await getMetadataItem(document, 'Title')
  if (title) {
    return title
  }

  // Fall back to the beginning of the first page. The extracted text comes
  // back as newline-joined lines, so each line break (and any whitespace
  // around it) is collapsed to a single space; joining with an empty string
  // would glue the last word of one line to the first word of the next.
  const pageText = await readPdfText(document, 1)
  const fallbackTitle = pageText
    .substring(0, MAX_TITLE_LENGTH)
    .replace(/\s*\n\s*/g, ' ')
    .trim()

  // An empty first page yields no usable title.
  return fallbackTitle.length ? fallbackTitle : undefined
}
|
|
|
|
/**
 * Extracts the text of every page in the document.
 *
 * @param document The loaded PDF document.
 * @returns The document text as produced by `readPdfText` with no page limit.
 */
export const getDocumentText = async (
  document: PDFDocumentProxy
): Promise<string> => readPdfText(document)
|
|
|
|
/**
 * Extracts text from the leading pages of a document.
 *
 * Pages are read one at a time, in order. Each page's lines are joined with
 * `\n` and every page is terminated with a trailing `\n`.
 *
 * @param document The loaded PDF document.
 * @param maxPages Upper bound on the number of pages to read. `undefined` or
 *   `0` reads the whole document. Values larger than the document's page
 *   count are clamped so that no non-existent page is requested.
 * @returns The concatenated text of the pages read.
 */
export const readPdfText = async (
  document: PDFDocumentProxy,
  maxPages: number | undefined = undefined
): Promise<string> => {
  // A falsy limit keeps the historical "read everything" meaning; otherwise
  // never ask pdf.js for a page beyond the end of the document.
  const numPages = maxPages
    ? Math.min(maxPages, document.numPages)
    : document.numPages

  const pages: Page[] = []
  for (let i = 0; i < numPages; i++) {
    // pdf.js page numbers are 1-based.
    pages.push(await parsePage(await document.getPage(i + 1)))
  }

  return pages.map((page) => page.lines.join('\n') + '\n').join('')
}
|
|
|
|
/**
 * Extracts the text lines of a single page.
 *
 * Only content items that carry a `str` property are handed to
 * `parsePageItems`; all other content items are skipped.
 *
 * @param pdfPage The page to read.
 * @returns The page's text lines.
 */
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
  const { items } = await pdfPage.getTextContent()
  const textItems = items.filter((item): item is TextItem => 'str' in item)
  return parsePageItems(textItems)
}
|
|
|
|
/**
 * Returns the largest `height` among the items of one line, or -1 when the
 * line has no items.
 */
const maxItemHeight = (items: TextItem[]): number =>
  items.reduce(
    (tallest, item) => (tallest > item.height ? tallest : item.height),
    -1
  )

/**
 * Builds the padding inserted between two horizontally neighbouring items.
 *
 * Padding is only produced when the gap between the end of `lastItem` and the
 * start of `item` exceeds the height of either item, and `item` has a
 * non-zero height. The gap is converted to a count by dividing it by the item
 * height (using the larger count when the two heights differ); a count of `n`
 * yields `n - 1` space characters.
 */
const paddingBetween = (lastItem: TextItem, item: TextItem): string => {
  const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)
  const isFarApart = xDiff > item.height || xDiff > lastItem.height
  if (item.height === 0 || !isFarApart) {
    return ''
  }

  const spaceCountA = Math.ceil(xDiff / item.height)
  let spaceCount = spaceCountA
  if (lastItem.height !== item.height) {
    const spaceCountB = Math.ceil(xDiff / lastItem.height)
    spaceCount = spaceCountA > spaceCountB ? spaceCountA : spaceCountB
  }

  // A zero-height neighbour makes the division above non-finite.
  if (!Number.isFinite(spaceCount)) {
    spaceCount = 1
  }

  return ' '.repeat(Math.max(0, spaceCount - 1))
}

/**
 * Renders the items of one line as a string: items are ordered by x position,
 * items with an empty `str` are dropped, and padding is inserted between items
 * that are far apart horizontally.
 */
const renderLine = (items: TextItem[]): string => {
  const lineItems = [...items]
    .sort((a, b) => a.transform[4] - b.transform[4])
    .filter((item) => !!item.str)

  return lineItems.reduce(
    (line, item, index) =>
      index === 0
        ? item.str
        : line + paddingBetween(lineItems[index - 1], item) + item.str,
    ''
  )
}

/**
 * Parses individual text items generated by pdf.js into lines of text. This
 * allows lower level control of what actually gets parsed. For example, a
 * consumer of this function may remove entire sections of the pdf text prior
 * to passing items in here. See the parsePage function for example usage.
 *
 * Items that share the same y coordinate (`transform[5]`) form one line.
 * Lines are emitted from the largest y to the smallest, because y = 0 is the
 * bottom of the page. One empty line is inserted between two consecutive
 * lines whose vertical distance is at least twice the upper line's height.
 *
 * @param pdfItems An array of TextItem items.
 * @returns The page's lines, top to bottom.
 */
const parsePageItems = (pdfItems: TextItem[]): Page => {
  // Group the items by their y coordinate.
  const lineData = new Map<number, TextItem[]>()
  for (const item of pdfItems) {
    const y = item.transform[5]
    const bucket = lineData.get(y)
    if (bucket) {
      bucket.push(item)
    } else {
      lineData.set(y, [item])
    }
  }

  // b - a here because the bottom is y = 0 so we want that to be last
  const yCoords = [...lineData.keys()].sort((a, b) => b - a)

  const lines: string[] = []
  yCoords.forEach((currentY, index) => {
    const lineItems = lineData.get(currentY) ?? []
    lines.push(renderLine(lineItems))

    const nextY = yCoords[index + 1]
    if (nextY === undefined) {
      return
    }

    // The gap can only be measured against a positive line height. With a
    // zero height the division is non-finite; treating that as "far apart"
    // used to overwrite the current line's own items with an empty line.
    const currentLineHeight = maxItemHeight(lineItems)
    if (
      currentLineHeight > 0 &&
      // currentY - nextY because currentY will be higher than nextY
      Math.floor((currentY - nextY) / currentLineHeight) > 1
    ) {
      lines.push('')
    }
  })

  return {
    lines,
  }
}
|