Open source omnivore
This commit is contained in:
4
packages/pdf-handler/.eslintignore
Normal file
4
packages/pdf-handler/.eslintignore
Normal file
@ -0,0 +1,4 @@
|
||||
node_modules/
|
||||
dist/
|
||||
readabilityjs/
|
||||
src/generated/
|
||||
6
packages/pdf-handler/.eslintrc
Normal file
6
packages/pdf-handler/.eslintrc
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"extends": "../../.eslintrc",
|
||||
"parserOptions": {
|
||||
"project": "tsconfig.json"
|
||||
}
|
||||
}
|
||||
16
packages/pdf-handler/.gcloudignore
Normal file
16
packages/pdf-handler/.gcloudignore
Normal file
@ -0,0 +1,16 @@
|
||||
# This file specifies files that are *not* uploaded to Google Cloud Platform
|
||||
# using gcloud. It follows the same syntax as .gitignore, with the addition of
|
||||
# "#!include" directives (which insert the entries of the given .gitignore-style
|
||||
# file at that point).
|
||||
#
|
||||
# For more information, run:
|
||||
# $ gcloud topic gcloudignore
|
||||
#
|
||||
.gcloudignore
|
||||
# If you would like to upload your .git directory, .gitignore file or files
|
||||
# from your .gitignore file, remove the corresponding line
|
||||
# below:
|
||||
.git
|
||||
.gitignore
|
||||
|
||||
node_modules
|
||||
5
packages/pdf-handler/mocha-config.json
Normal file
5
packages/pdf-handler/mocha-config.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"extension": ["ts"],
|
||||
"spec": "test/**/*.test.ts",
|
||||
"require": "test/babel-register.js"
|
||||
}
|
||||
33
packages/pdf-handler/package.json
Normal file
33
packages/pdf-handler/package.json
Normal file
@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "@omnivore/pdf-handler",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "build/src/index.js",
|
||||
"types": "build/src/index.d.ts",
|
||||
"files": [
|
||||
"build/src"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"keywords": [],
|
||||
"scripts": {
|
||||
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
|
||||
"lint": "eslint src --ext ts,js,tsx,jsx",
|
||||
"compile": "tsc",
|
||||
"build": "tsc",
|
||||
"start": "functions-framework --source=build/src/ --target=pdfHandler",
|
||||
"dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"",
|
||||
"gcloud-deploy": "gcloud functions deploy pdfHandler --region=$npm_config_region --runtime nodejs14 --trigger-bucket=$npm_config_bucket --env-vars-file=../gcf-shared/env-$npm_config_env.yaml",
|
||||
"deploy": "yarn build && yarn gcloud-deploy"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^14.11.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"@google-cloud/functions-framework": "1.9.0",
|
||||
"@google-cloud/pubsub": "^2.16.3",
|
||||
"@google-cloud/storage": "^5.13.0",
|
||||
"axios": "^0.21.1",
|
||||
"concurrently": "^6.2.1",
|
||||
"pdfjs-dist": "^2.9.359"
|
||||
}
|
||||
}
|
||||
67
packages/pdf-handler/src/backfill.ts
Normal file
67
packages/pdf-handler/src/backfill.ts
Normal file
@ -0,0 +1,67 @@
|
||||
/* eslint-disable prefer-const */
|
||||
/* eslint-disable @typescript-eslint/restrict-template-expressions */
|
||||
import { Storage } from '@google-cloud/storage'
|
||||
import { parsePdf } from './pdf'
|
||||
import axios from 'axios'
|
||||
|
||||
const storage = new Storage()
|
||||
|
||||
const postUpdate = async (
|
||||
fileId: string,
|
||||
content: string,
|
||||
title?: string,
|
||||
author?: string,
|
||||
description?: string
|
||||
) => {
|
||||
const url =
|
||||
'https://backend-dot-omnivore-production.wl.r.appspot.com/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'
|
||||
|
||||
// const localUrl =
|
||||
// 'http://localhost:4000/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'
|
||||
|
||||
const data = JSON.stringify({
|
||||
fileId,
|
||||
content,
|
||||
title,
|
||||
author,
|
||||
description,
|
||||
})
|
||||
|
||||
const body = {
|
||||
message: {
|
||||
data: Buffer.from(data).toString('base64'),
|
||||
},
|
||||
}
|
||||
|
||||
const res = await axios.post(url, body)
|
||||
console.log('res', res.status)
|
||||
}
|
||||
|
||||
const listFiles = async () => {
|
||||
const res = await storage
|
||||
.bucket('omnivore')
|
||||
.getFiles({ prefix: 'u/', maxResults: 50 })
|
||||
console.log('result', res)
|
||||
|
||||
const [files] = res
|
||||
console.log('Files:')
|
||||
for (const file of files) {
|
||||
const url = file.publicUrl()
|
||||
const [isPublic] = await file.isPublic()
|
||||
console.log(file.publicUrl(), 'is public:', isPublic)
|
||||
if (isPublic) {
|
||||
const parsed = await parsePdf(new URL(url))
|
||||
// console.log(text)
|
||||
// console.log('\n\n')
|
||||
await postUpdate(
|
||||
file.name,
|
||||
parsed.content,
|
||||
parsed.title,
|
||||
parsed.author,
|
||||
parsed.description
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
listFiles().catch(console.error)
|
||||
88
packages/pdf-handler/src/index.ts
Normal file
88
packages/pdf-handler/src/index.ts
Normal file
@ -0,0 +1,88 @@
|
||||
import {
|
||||
EventFunction,
|
||||
CloudFunctionsContext,
|
||||
} from '@google-cloud/functions-framework/build/src/functions'
|
||||
import { Storage } from '@google-cloud/storage'
|
||||
import { PubSub } from '@google-cloud/pubsub'
|
||||
import { parsePdf } from './pdf'
|
||||
|
||||
const pubsub = new PubSub()
|
||||
const storage = new Storage()
|
||||
const CONTENT_UPDATE_TOPIC = 'updatePageContent'
|
||||
|
||||
interface StorageEventData {
|
||||
bucket: string
|
||||
name: string
|
||||
contentType: string
|
||||
}
|
||||
|
||||
// Ensure this is a finalize event and that it is stored in the `u/` directory and is a PDF
|
||||
const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
|
||||
if (ctx.eventType !== 'google.storage.object.finalize') {
|
||||
return false
|
||||
}
|
||||
if (
|
||||
!data.name.startsWith('u/') ||
|
||||
data.contentType.toLowerCase() != 'application/pdf'
|
||||
) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
const getDocumentUrl = (data: StorageEventData): URL | undefined => {
|
||||
try {
|
||||
const bucket = storage.bucket(data.bucket)
|
||||
const file = bucket.file(data.name)
|
||||
return new URL(file.publicUrl())
|
||||
} catch (e) {
|
||||
return undefined
|
||||
}
|
||||
}
|
||||
|
||||
export const updatePageContent = (
|
||||
fileId: string,
|
||||
content: string,
|
||||
title?: string,
|
||||
author?: string,
|
||||
description?: string
|
||||
): Promise<string | undefined> => {
|
||||
return pubsub
|
||||
.topic(CONTENT_UPDATE_TOPIC)
|
||||
.publish(
|
||||
Buffer.from(
|
||||
JSON.stringify({ fileId, content, title, author, description })
|
||||
)
|
||||
)
|
||||
.catch((err) => {
|
||||
console.error('error publishing conentUpdate:', err)
|
||||
return undefined
|
||||
})
|
||||
}
|
||||
|
||||
export const pdfHandler: EventFunction = async (event, context) => {
|
||||
const data = event as StorageEventData
|
||||
const ctx = context as CloudFunctionsContext
|
||||
|
||||
if (shouldHandle(data, ctx)) {
|
||||
console.log('handling pdf data', data)
|
||||
|
||||
const url = getDocumentUrl(data)
|
||||
if (!url) {
|
||||
console.log('Could not fetch PDF', data.bucket, data.name)
|
||||
return
|
||||
}
|
||||
|
||||
const parsed = await parsePdf(url)
|
||||
const res = await updatePageContent(
|
||||
data.name,
|
||||
parsed.content,
|
||||
parsed.title,
|
||||
parsed.author,
|
||||
parsed.description
|
||||
)
|
||||
console.log('publish result', res)
|
||||
} else {
|
||||
console.log('not handling pdf data', data)
|
||||
}
|
||||
}
|
||||
212
packages/pdf-handler/src/pdf.ts
Normal file
212
packages/pdf-handler/src/pdf.ts
Normal file
@ -0,0 +1,212 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-call */
|
||||
/* eslint-disable @typescript-eslint/restrict-plus-operands */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-argument */
|
||||
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
|
||||
import {
|
||||
TextItem,
|
||||
PDFPageProxy,
|
||||
PDFDocumentProxy,
|
||||
} from 'pdfjs-dist/types/display/api'
|
||||
|
||||
// A single parsed PDF page, represented as its text lines in reading order.
interface Page {
  lines: string[]
}
|
||||
|
||||
// Maximum number of characters taken from page text when synthesizing a
// title from content (see getDocumentTitle below).
const MAX_TITLE_LENGTH = 95
|
||||
|
||||
// Keys read from the PDF document-information dictionary via getMetadataItem.
type MetadataInfoKey =
  | 'Title'
  | 'Author'
  | 'Subject'
  | 'CreationDate'
  | 'ModDate'
|
||||
|
||||
// Subset of the pdf.js metadata `info` object this module consumes;
// all fields are optional because PDFs may omit any of them.
interface MetadataInfo {
  Title?: string
  Author?: string
  CreationDate?: string
  ModDate?: string
  Subject?: string
}
|
||||
|
||||
// Result of parsePdf: full extracted text plus optional metadata fields,
// which are only present when the PDF declares them.
interface ParsedPdf {
  content: string
  title?: string
  author?: string
  description?: string
}
|
||||
|
||||
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
|
||||
const documentLoadingTask = _getDocument(url)
|
||||
const document = await documentLoadingTask.promise
|
||||
|
||||
const text = await getDocumentText(document)
|
||||
// eslint-disable-next-line no-control-regex
|
||||
const result: ParsedPdf = { content: text.replace(/\x00/g, '') }
|
||||
|
||||
const title = await getMetadataItem(document, 'Title')
|
||||
if (title) result.title = title
|
||||
|
||||
const author = await getMetadataItem(document, 'Author')
|
||||
if (author) result.author = author
|
||||
|
||||
const description = await getMetadataItem(document, 'Subject')
|
||||
if (description) result.description = description
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
export const getDocument = (source: string): Promise<PDFDocumentProxy> => {
|
||||
const documentLoadingTask = _getDocument(source)
|
||||
return documentLoadingTask.promise
|
||||
}
|
||||
|
||||
const getMetadataItem = async (
|
||||
document: PDFDocumentProxy,
|
||||
key: MetadataInfoKey
|
||||
): Promise<string | undefined> => {
|
||||
return await document
|
||||
.getMetadata()
|
||||
.then((metadata) => metadata.info as MetadataInfo)
|
||||
.then((info) => {
|
||||
return info[key]
|
||||
})
|
||||
}
|
||||
|
||||
export const getDocumentTitle = async (
|
||||
document: PDFDocumentProxy
|
||||
): Promise<string | undefined> => {
|
||||
const title = await getMetadataItem(document, 'Title')
|
||||
if (title) {
|
||||
return title
|
||||
}
|
||||
|
||||
// Attempt to grab the title from the first page
|
||||
// because extracted text is returned as joined
|
||||
// lines, we replace the line breaks with spaces
|
||||
const pageText = await readPdfText(document, 1)
|
||||
if (pageText.length) {
|
||||
const result = pageText.substring(0, MAX_TITLE_LENGTH)
|
||||
return result.split('\n').join('')
|
||||
}
|
||||
|
||||
return undefined
|
||||
}
|
||||
|
||||
export const getDocumentText = async (
|
||||
document: PDFDocumentProxy
|
||||
): Promise<string> => {
|
||||
const pages = await readPdfText(document)
|
||||
return pages
|
||||
}
|
||||
|
||||
export const readPdfText = async (
|
||||
document: PDFDocumentProxy,
|
||||
maxPages: number | undefined = undefined
|
||||
): Promise<string> => {
|
||||
const pages: Page[] = []
|
||||
const numPages = maxPages || document.numPages
|
||||
|
||||
for (let i = 0; i < numPages; i++) {
|
||||
pages.push(await parsePage(await document.getPage(i + 1)))
|
||||
}
|
||||
|
||||
return pages.reduce((accum, page) => {
|
||||
return accum.concat(page.lines.join('\n') + '\n')
|
||||
}, '')
|
||||
}
|
||||
|
||||
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
|
||||
const rawContent = await pdfPage.getTextContent()
|
||||
return parsePageItems(
|
||||
rawContent.items.filter((item): item is TextItem => 'str' in item)
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Parses individual text items generated by pdf.js. This allows lower level control of what actually
 * gets parsed. For example, a consumer of this function may remove entire sections of the pdf text
 * prior to passing items in here. See parsePage function above for example usage.
 *
 * Items are grouped into lines by their y coordinate, ordered top-to-bottom;
 * within a line items are ordered by x, with wide horizontal gaps rendered as
 * runs of spaces and large vertical gaps rendered as empty lines.
 *
 * @param pdfItems An array of TextItem items.
 */
const parsePageItems = (pdfItems: TextItem[]): Page => {
  // Bucket items by their y translation (transform[5]) so that all items
  // sharing a baseline form one visual line.
  const lineData: { [y: number]: TextItem[] } = {}

  for (let i = 0; i < pdfItems.length; i++) {
    const item = pdfItems[i]
    const y = item.transform[5]
    /* eslint-disable no-prototype-builtins */
    if (!lineData.hasOwnProperty(y)) {
      lineData[y] = []
    }
    lineData[y].push(item)
  }

  const yCoords = Object.keys(lineData)
    .map((key) => Number(key))
    // b - a here because the bottom is y = 0 so we want that to be last
    .sort((a, b) => b - a)
    // insert an empty line between any 2 lines where their distance is greater than the upper line's height
    .reduce((accum: number[], currentY, index, array) => {
      const nextY = array[index + 1]
      if (nextY != undefined) {
        // Tallest item on the current line (-1 when the bucket is empty).
        const currentLineHeight: number = lineData[currentY].reduce(
          (finalValue, current) =>
            finalValue > current.height ? finalValue : current.height,
          -1
        )

        // currentY - nextY because currentY will be higher than nextY
        if (Math.floor((currentY - nextY) / currentLineHeight) > 1) {
          // Synthesize an empty line between the two real lines; it is
          // registered in lineData so the render loop below emits ''.
          const newY = currentY - currentLineHeight
          lineData[newY] = []
          return accum.concat(currentY, newY)
        }
      }
      return accum.concat(currentY)
    }, [])

  const lines: string[] = []
  for (let i = 0; i < yCoords.length; i++) {
    const y = yCoords[i]
    // sort by x position (position in line)
    const lineItems = lineData[y]
      .sort((a, b) => a.transform[4] - b.transform[4])
      .filter((item) => !!item.str)
    let line = lineItems.length ? lineItems[0].str : ''
    for (let j = 1; j < lineItems.length; j++) {
      const item = lineItems[j]
      const lastItem = lineItems[j - 1]
      // Horizontal gap between the end of the previous item and this one.
      const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)

      // insert spaces for items that are far apart horizontally
      if (
        item.height !== 0 &&
        (xDiff > item.height || xDiff > lastItem.height)
      ) {
        // Approximate the gap in "spaces", using item heights as a
        // stand-in for character width; take the larger estimate.
        const spaceCountA = Math.ceil(xDiff / item.height)
        let spaceCount = spaceCountA
        if (lastItem.height !== item.height) {
          const spaceCountB = Math.ceil(xDiff / lastItem.height)
          spaceCount = spaceCountA > spaceCountB ? spaceCountA : spaceCountB
        }

        // Guard against NaN/Infinity from zero or bogus heights.
        if (isNaN(spaceCount) || isFinite(spaceCount) === false) {
          spaceCount = 1
        }

        // NOTE(review): fill('').join(' ') yields spaceCount - 1 spaces
        // (and '' when spaceCount is 1) — confirm this off-by-one is intended.
        line += Array(spaceCount).fill('').join(' ')
      }
      line += item.str
    }
    lines.push(line)
  }

  return {
    lines,
  }
}
|
||||
3
packages/pdf-handler/test/babel-register.js
Normal file
3
packages/pdf-handler/test/babel-register.js
Normal file
@ -0,0 +1,3 @@
|
||||
// Hook Babel into require() so mocha can load TypeScript test files directly.
const register = require('@babel/register').default;

register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] });
|
||||
6016
packages/pdf-handler/test/pdf/data/pdf-complex-test.pdf
Normal file
6016
packages/pdf-handler/test/pdf/data/pdf-complex-test.pdf
Normal file
File diff suppressed because it is too large
Load Diff
BIN
packages/pdf-handler/test/pdf/data/pdf-simple-test.pdf
Normal file
BIN
packages/pdf-handler/test/pdf/data/pdf-simple-test.pdf
Normal file
Binary file not shown.
BIN
packages/pdf-handler/test/pdf/data/welcome_to_your_library.pdf
Normal file
BIN
packages/pdf-handler/test/pdf/data/welcome_to_your_library.pdf
Normal file
Binary file not shown.
55
packages/pdf-handler/test/pdf/pdf.test.ts
Normal file
55
packages/pdf-handler/test/pdf/pdf.test.ts
Normal file
@ -0,0 +1,55 @@
|
||||
import 'mocha'
|
||||
import * as chai from 'chai'
|
||||
import { expect } from 'chai'
|
||||
import 'chai/register-should'
|
||||
import chaiString from 'chai-string'
|
||||
import {
|
||||
getDocument,
|
||||
getDocumentText,
|
||||
getDocumentTitle,
|
||||
parsePdf,
|
||||
} from '../../src/pdf'
|
||||
|
||||
chai.use(chaiString)
|
||||
|
||||
describe('open a simple PDF with a set title', () => {
|
||||
it('should return the title', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
|
||||
const result = await getDocumentTitle(doc)
|
||||
expect('Document1').to.equal(result)
|
||||
})
|
||||
it('should return the document text', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
|
||||
const result = await getDocumentText(doc)
|
||||
expect(result).to.equal(
|
||||
'This is the page title \n \nThis is some more text \n'
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('open a complex PDF with no title', () => {
|
||||
it('should return some initial content as the title', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
|
||||
const result = await getDocumentTitle(doc)
|
||||
expect(result).to.startWith(
|
||||
'Improving communications around vaccine breakthrough and vaccine effectiveness'
|
||||
)
|
||||
})
|
||||
|
||||
it('should be less than the max title length', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
|
||||
const result = await getDocumentTitle(doc)
|
||||
expect(result?.length).to.lessThanOrEqual(95)
|
||||
})
|
||||
})
|
||||
|
||||
describe('open a PDF with metadata set', () => {
|
||||
it('should return metadata', async () => {
|
||||
const parsed = await parsePdf(
|
||||
new URL('file://' + __dirname + '/data/welcome_to_your_library.pdf')
|
||||
)
|
||||
expect(parsed.title).to.eq('Welcome to your Omnivore Library')
|
||||
expect(parsed.author).to.eq('Jackson Harper')
|
||||
expect(parsed.description).to.eq('This is the description of my PDF')
|
||||
})
|
||||
})
|
||||
9
packages/pdf-handler/tsconfig.json
Normal file
9
packages/pdf-handler/tsconfig.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"extends": "@tsconfig/node14/tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "build",
|
||||
"rootDir": ".",
|
||||
"lib": ["dom"]
|
||||
},
|
||||
"include": ["src", "test"]
|
||||
}
|
||||
Reference in New Issue
Block a user