Open source omnivore
This commit is contained in:
4
packages/pdf-handler/.eslintignore
Normal file
4
packages/pdf-handler/.eslintignore
Normal file
@ -0,0 +1,4 @@
|
||||
node_modules/
|
||||
dist/
|
||||
readabilityjs/
|
||||
src/generated/
|
||||
6
packages/pdf-handler/.eslintrc
Normal file
6
packages/pdf-handler/.eslintrc
Normal file
@ -0,0 +1,6 @@
|
||||
{
|
||||
"extends": "../../.eslintrc",
|
||||
"parserOptions": {
|
||||
"project": "tsconfig.json"
|
||||
}
|
||||
}
|
||||
16
packages/pdf-handler/.gcloudignore
Normal file
16
packages/pdf-handler/.gcloudignore
Normal file
@ -0,0 +1,16 @@
|
||||
# This file specifies files that are *not* uploaded to Google Cloud Platform
|
||||
# using gcloud. It follows the same syntax as .gitignore, with the addition of
|
||||
# "#!include" directives (which insert the entries of the given .gitignore-style
|
||||
# file at that point).
|
||||
#
|
||||
# For more information, run:
|
||||
# $ gcloud topic gcloudignore
|
||||
#
|
||||
.gcloudignore
|
||||
# If you would like to upload your .git directory, .gitignore file or files
|
||||
# from your .gitignore file, remove the corresponding line
|
||||
# below:
|
||||
.git
|
||||
.gitignore
|
||||
|
||||
node_modules
|
||||
5
packages/pdf-handler/mocha-config.json
Normal file
5
packages/pdf-handler/mocha-config.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"extension": ["ts"],
|
||||
"spec": "test/**/*.test.ts",
|
||||
"require": "test/babel-register.js"
|
||||
}
|
||||
33
packages/pdf-handler/package.json
Normal file
33
packages/pdf-handler/package.json
Normal file
@ -0,0 +1,33 @@
|
||||
{
|
||||
"name": "@omnivore/pdf-handler",
|
||||
"version": "1.0.0",
|
||||
"description": "",
|
||||
"main": "build/src/index.js",
|
||||
"types": "build/src/index.d.ts",
|
||||
"files": [
|
||||
"build/src"
|
||||
],
|
||||
"license": "Apache-2.0",
|
||||
"keywords": [],
|
||||
"scripts": {
|
||||
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
|
||||
"lint": "eslint src --ext ts,js,tsx,jsx",
|
||||
"compile": "tsc",
|
||||
"build": "tsc",
|
||||
"start": "functions-framework --source=build/src/ --target=pdfHandler",
|
||||
"dev": "concurrently \"tsc -w\" \"nodemon --watch ./build/ --exec npm run start\"",
|
||||
"gcloud-deploy": "gcloud functions deploy pdfHandler --region=$npm_config_region --runtime nodejs14 --trigger-bucket=$npm_config_bucket --env-vars-file=../gcf-shared/env-$npm_config_env.yaml",
|
||||
"deploy": "yarn build && yarn gcloud-deploy"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^14.11.2"
|
||||
},
|
||||
"dependencies": {
|
||||
"@google-cloud/functions-framework": "1.9.0",
|
||||
"@google-cloud/pubsub": "^2.16.3",
|
||||
"@google-cloud/storage": "^5.13.0",
|
||||
"axios": "^0.21.1",
|
||||
"concurrently": "^6.2.1",
|
||||
"pdfjs-dist": "^2.9.359"
|
||||
}
|
||||
}
|
||||
67
packages/pdf-handler/src/backfill.ts
Normal file
67
packages/pdf-handler/src/backfill.ts
Normal file
@ -0,0 +1,67 @@
|
||||
/* eslint-disable prefer-const */
|
||||
/* eslint-disable @typescript-eslint/restrict-template-expressions */
|
||||
import { Storage } from '@google-cloud/storage'
|
||||
import { parsePdf } from './pdf'
|
||||
import axios from 'axios'
|
||||
|
||||
const storage = new Storage()
|
||||
|
||||
const postUpdate = async (
|
||||
fileId: string,
|
||||
content: string,
|
||||
title?: string,
|
||||
author?: string,
|
||||
description?: string
|
||||
) => {
|
||||
const url =
|
||||
'https://backend-dot-omnivore-production.wl.r.appspot.com/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'
|
||||
|
||||
// const localUrl =
|
||||
// 'http://localhost:4000/svc/pubsub/content/search?token=aYYLeK0kYlwnQg0wBMHO6EoAjf0LkoQ4Dyx0NGtpdjbh7F52EzHda8'
|
||||
|
||||
const data = JSON.stringify({
|
||||
fileId,
|
||||
content,
|
||||
title,
|
||||
author,
|
||||
description,
|
||||
})
|
||||
|
||||
const body = {
|
||||
message: {
|
||||
data: Buffer.from(data).toString('base64'),
|
||||
},
|
||||
}
|
||||
|
||||
const res = await axios.post(url, body)
|
||||
console.log('res', res.status)
|
||||
}
|
||||
|
||||
const listFiles = async () => {
|
||||
const res = await storage
|
||||
.bucket('omnivore')
|
||||
.getFiles({ prefix: 'u/', maxResults: 50 })
|
||||
console.log('result', res)
|
||||
|
||||
const [files] = res
|
||||
console.log('Files:')
|
||||
for (const file of files) {
|
||||
const url = file.publicUrl()
|
||||
const [isPublic] = await file.isPublic()
|
||||
console.log(file.publicUrl(), 'is public:', isPublic)
|
||||
if (isPublic) {
|
||||
const parsed = await parsePdf(new URL(url))
|
||||
// console.log(text)
|
||||
// console.log('\n\n')
|
||||
await postUpdate(
|
||||
file.name,
|
||||
parsed.content,
|
||||
parsed.title,
|
||||
parsed.author,
|
||||
parsed.description
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
listFiles().catch(console.error)
|
||||
88
packages/pdf-handler/src/index.ts
Normal file
88
packages/pdf-handler/src/index.ts
Normal file
@ -0,0 +1,88 @@
|
||||
import {
|
||||
EventFunction,
|
||||
CloudFunctionsContext,
|
||||
} from '@google-cloud/functions-framework/build/src/functions'
|
||||
import { Storage } from '@google-cloud/storage'
|
||||
import { PubSub } from '@google-cloud/pubsub'
|
||||
import { parsePdf } from './pdf'
|
||||
|
||||
const pubsub = new PubSub()
|
||||
const storage = new Storage()
|
||||
const CONTENT_UPDATE_TOPIC = 'updatePageContent'
|
||||
|
||||
interface StorageEventData {
|
||||
bucket: string
|
||||
name: string
|
||||
contentType: string
|
||||
}
|
||||
|
||||
// Ensure this is a finalize event and that it is stored in the `u/` directory and is a PDF
|
||||
const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
|
||||
if (ctx.eventType !== 'google.storage.object.finalize') {
|
||||
return false
|
||||
}
|
||||
if (
|
||||
!data.name.startsWith('u/') ||
|
||||
data.contentType.toLowerCase() != 'application/pdf'
|
||||
) {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
const getDocumentUrl = (data: StorageEventData): URL | undefined => {
|
||||
try {
|
||||
const bucket = storage.bucket(data.bucket)
|
||||
const file = bucket.file(data.name)
|
||||
return new URL(file.publicUrl())
|
||||
} catch (e) {
|
||||
return undefined
|
||||
}
|
||||
}
|
||||
|
||||
export const updatePageContent = (
|
||||
fileId: string,
|
||||
content: string,
|
||||
title?: string,
|
||||
author?: string,
|
||||
description?: string
|
||||
): Promise<string | undefined> => {
|
||||
return pubsub
|
||||
.topic(CONTENT_UPDATE_TOPIC)
|
||||
.publish(
|
||||
Buffer.from(
|
||||
JSON.stringify({ fileId, content, title, author, description })
|
||||
)
|
||||
)
|
||||
.catch((err) => {
|
||||
console.error('error publishing conentUpdate:', err)
|
||||
return undefined
|
||||
})
|
||||
}
|
||||
|
||||
export const pdfHandler: EventFunction = async (event, context) => {
|
||||
const data = event as StorageEventData
|
||||
const ctx = context as CloudFunctionsContext
|
||||
|
||||
if (shouldHandle(data, ctx)) {
|
||||
console.log('handling pdf data', data)
|
||||
|
||||
const url = getDocumentUrl(data)
|
||||
if (!url) {
|
||||
console.log('Could not fetch PDF', data.bucket, data.name)
|
||||
return
|
||||
}
|
||||
|
||||
const parsed = await parsePdf(url)
|
||||
const res = await updatePageContent(
|
||||
data.name,
|
||||
parsed.content,
|
||||
parsed.title,
|
||||
parsed.author,
|
||||
parsed.description
|
||||
)
|
||||
console.log('publish result', res)
|
||||
} else {
|
||||
console.log('not handling pdf data', data)
|
||||
}
|
||||
}
|
||||
212
packages/pdf-handler/src/pdf.ts
Normal file
212
packages/pdf-handler/src/pdf.ts
Normal file
@ -0,0 +1,212 @@
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-call */
|
||||
/* eslint-disable @typescript-eslint/restrict-plus-operands */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-member-access */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-assignment */
|
||||
/* eslint-disable @typescript-eslint/no-unsafe-argument */
|
||||
import { getDocument as _getDocument } from 'pdfjs-dist/legacy/build/pdf'
|
||||
import {
|
||||
TextItem,
|
||||
PDFPageProxy,
|
||||
PDFDocumentProxy,
|
||||
} from 'pdfjs-dist/types/display/api'
|
||||
|
||||
// A single parsed PDF page, represented as its text lines in reading order.
interface Page {
  lines: string[]
}
|
||||
|
||||
// Maximum number of characters taken from page text when synthesizing a
// title from content (see getDocumentTitle below).
const MAX_TITLE_LENGTH = 95
|
||||
|
||||
// Keys read from the PDF document-information dictionary via getMetadataItem.
type MetadataInfoKey =
  | 'Title'
  | 'Author'
  | 'Subject'
  | 'CreationDate'
  | 'ModDate'
|
||||
|
||||
// Subset of the pdf.js metadata `info` object this module consumes;
// all fields are optional because PDFs may omit any of them.
interface MetadataInfo {
  Title?: string
  Author?: string
  CreationDate?: string
  ModDate?: string
  Subject?: string
}
|
||||
|
||||
// Result of parsePdf: full extracted text plus optional metadata fields,
// which are only present when the PDF declares them.
interface ParsedPdf {
  content: string
  title?: string
  author?: string
  description?: string
}
|
||||
|
||||
export const parsePdf = async (url: URL): Promise<ParsedPdf> => {
|
||||
const documentLoadingTask = _getDocument(url)
|
||||
const document = await documentLoadingTask.promise
|
||||
|
||||
const text = await getDocumentText(document)
|
||||
// eslint-disable-next-line no-control-regex
|
||||
const result: ParsedPdf = { content: text.replace(/\x00/g, '') }
|
||||
|
||||
const title = await getMetadataItem(document, 'Title')
|
||||
if (title) result.title = title
|
||||
|
||||
const author = await getMetadataItem(document, 'Author')
|
||||
if (author) result.author = author
|
||||
|
||||
const description = await getMetadataItem(document, 'Subject')
|
||||
if (description) result.description = description
|
||||
|
||||
return result
|
||||
}
|
||||
|
||||
export const getDocument = (source: string): Promise<PDFDocumentProxy> => {
|
||||
const documentLoadingTask = _getDocument(source)
|
||||
return documentLoadingTask.promise
|
||||
}
|
||||
|
||||
const getMetadataItem = async (
|
||||
document: PDFDocumentProxy,
|
||||
key: MetadataInfoKey
|
||||
): Promise<string | undefined> => {
|
||||
return await document
|
||||
.getMetadata()
|
||||
.then((metadata) => metadata.info as MetadataInfo)
|
||||
.then((info) => {
|
||||
return info[key]
|
||||
})
|
||||
}
|
||||
|
||||
export const getDocumentTitle = async (
|
||||
document: PDFDocumentProxy
|
||||
): Promise<string | undefined> => {
|
||||
const title = await getMetadataItem(document, 'Title')
|
||||
if (title) {
|
||||
return title
|
||||
}
|
||||
|
||||
// Attempt to grab the title from the first page
|
||||
// because extracted text is returned as joined
|
||||
// lines, we replace the line breaks with spaces
|
||||
const pageText = await readPdfText(document, 1)
|
||||
if (pageText.length) {
|
||||
const result = pageText.substring(0, MAX_TITLE_LENGTH)
|
||||
return result.split('\n').join('')
|
||||
}
|
||||
|
||||
return undefined
|
||||
}
|
||||
|
||||
export const getDocumentText = async (
|
||||
document: PDFDocumentProxy
|
||||
): Promise<string> => {
|
||||
const pages = await readPdfText(document)
|
||||
return pages
|
||||
}
|
||||
|
||||
export const readPdfText = async (
|
||||
document: PDFDocumentProxy,
|
||||
maxPages: number | undefined = undefined
|
||||
): Promise<string> => {
|
||||
const pages: Page[] = []
|
||||
const numPages = maxPages || document.numPages
|
||||
|
||||
for (let i = 0; i < numPages; i++) {
|
||||
pages.push(await parsePage(await document.getPage(i + 1)))
|
||||
}
|
||||
|
||||
return pages.reduce((accum, page) => {
|
||||
return accum.concat(page.lines.join('\n') + '\n')
|
||||
}, '')
|
||||
}
|
||||
|
||||
const parsePage = async (pdfPage: PDFPageProxy): Promise<Page> => {
|
||||
const rawContent = await pdfPage.getTextContent()
|
||||
return parsePageItems(
|
||||
rawContent.items.filter((item): item is TextItem => 'str' in item)
|
||||
)
|
||||
}
|
||||
|
||||
/**
 * Parses individual text items generated by pdf.js. This allows lower level control of what actually
 * gets parsed. For example, a consumer of this function may remove entire sections of the pdf text
 * prior to passing items in here. See parsePage function above for example usage.
 *
 * Items are grouped into lines by their y coordinate, ordered top-to-bottom;
 * within a line items are ordered by x, with wide horizontal gaps rendered as
 * runs of spaces and large vertical gaps rendered as empty lines.
 *
 * @param pdfItems An array of TextItem items.
 */
const parsePageItems = (pdfItems: TextItem[]): Page => {
  // Bucket items by their y translation (transform[5]) so that all items
  // sharing a baseline form one visual line.
  const lineData: { [y: number]: TextItem[] } = {}

  for (let i = 0; i < pdfItems.length; i++) {
    const item = pdfItems[i]
    const y = item.transform[5]
    /* eslint-disable no-prototype-builtins */
    if (!lineData.hasOwnProperty(y)) {
      lineData[y] = []
    }
    lineData[y].push(item)
  }

  const yCoords = Object.keys(lineData)
    .map((key) => Number(key))
    // b - a here because the bottom is y = 0 so we want that to be last
    .sort((a, b) => b - a)
    // insert an empty line between any 2 lines where their distance is greater than the upper line's height
    .reduce((accum: number[], currentY, index, array) => {
      const nextY = array[index + 1]
      if (nextY != undefined) {
        // Tallest item on the current line (-1 when the bucket is empty).
        const currentLineHeight: number = lineData[currentY].reduce(
          (finalValue, current) =>
            finalValue > current.height ? finalValue : current.height,
          -1
        )

        // currentY - nextY because currentY will be higher than nextY
        if (Math.floor((currentY - nextY) / currentLineHeight) > 1) {
          // Synthesize an empty line between the two real lines; it is
          // registered in lineData so the render loop below emits ''.
          const newY = currentY - currentLineHeight
          lineData[newY] = []
          return accum.concat(currentY, newY)
        }
      }
      return accum.concat(currentY)
    }, [])

  const lines: string[] = []
  for (let i = 0; i < yCoords.length; i++) {
    const y = yCoords[i]
    // sort by x position (position in line)
    const lineItems = lineData[y]
      .sort((a, b) => a.transform[4] - b.transform[4])
      .filter((item) => !!item.str)
    let line = lineItems.length ? lineItems[0].str : ''
    for (let j = 1; j < lineItems.length; j++) {
      const item = lineItems[j]
      const lastItem = lineItems[j - 1]
      // Horizontal gap between the end of the previous item and this one.
      const xDiff = item.transform[4] - (lastItem.transform[4] + lastItem.width)

      // insert spaces for items that are far apart horizontally
      if (
        item.height !== 0 &&
        (xDiff > item.height || xDiff > lastItem.height)
      ) {
        // Approximate the gap in "spaces", using item heights as a
        // stand-in for character width; take the larger estimate.
        const spaceCountA = Math.ceil(xDiff / item.height)
        let spaceCount = spaceCountA
        if (lastItem.height !== item.height) {
          const spaceCountB = Math.ceil(xDiff / lastItem.height)
          spaceCount = spaceCountA > spaceCountB ? spaceCountA : spaceCountB
        }

        // Guard against NaN/Infinity from zero or bogus heights.
        if (isNaN(spaceCount) || isFinite(spaceCount) === false) {
          spaceCount = 1
        }

        // NOTE(review): fill('').join(' ') yields spaceCount - 1 spaces
        // (and '' when spaceCount is 1) — confirm this off-by-one is intended.
        line += Array(spaceCount).fill('').join(' ')
      }
      line += item.str
    }
    lines.push(line)
  }

  return {
    lines,
  }
}
|
||||
3
packages/pdf-handler/test/babel-register.js
Normal file
3
packages/pdf-handler/test/babel-register.js
Normal file
@ -0,0 +1,3 @@
|
||||
// Hook Babel into require() so mocha can load TypeScript test files directly.
const register = require('@babel/register').default;

register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] });
|
||||
6016
packages/pdf-handler/test/pdf/data/pdf-complex-test.pdf
Normal file
6016
packages/pdf-handler/test/pdf/data/pdf-complex-test.pdf
Normal file
File diff suppressed because it is too large
Load Diff
BIN
packages/pdf-handler/test/pdf/data/pdf-simple-test.pdf
Normal file
BIN
packages/pdf-handler/test/pdf/data/pdf-simple-test.pdf
Normal file
Binary file not shown.
BIN
packages/pdf-handler/test/pdf/data/welcome_to_your_library.pdf
Normal file
BIN
packages/pdf-handler/test/pdf/data/welcome_to_your_library.pdf
Normal file
Binary file not shown.
55
packages/pdf-handler/test/pdf/pdf.test.ts
Normal file
55
packages/pdf-handler/test/pdf/pdf.test.ts
Normal file
@ -0,0 +1,55 @@
|
||||
import 'mocha'
|
||||
import * as chai from 'chai'
|
||||
import { expect } from 'chai'
|
||||
import 'chai/register-should'
|
||||
import chaiString from 'chai-string'
|
||||
import {
|
||||
getDocument,
|
||||
getDocumentText,
|
||||
getDocumentTitle,
|
||||
parsePdf,
|
||||
} from '../../src/pdf'
|
||||
|
||||
chai.use(chaiString)
|
||||
|
||||
describe('open a simple PDF with a set title', () => {
|
||||
it('should return the title', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
|
||||
const result = await getDocumentTitle(doc)
|
||||
expect('Document1').to.equal(result)
|
||||
})
|
||||
it('should return the document text', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-simple-test.pdf')
|
||||
const result = await getDocumentText(doc)
|
||||
expect(result).to.equal(
|
||||
'This is the page title \n \nThis is some more text \n'
|
||||
)
|
||||
})
|
||||
})
|
||||
|
||||
describe('open a complex PDF with no title', () => {
|
||||
it('should return some initial content as the title', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
|
||||
const result = await getDocumentTitle(doc)
|
||||
expect(result).to.startWith(
|
||||
'Improving communications around vaccine breakthrough and vaccine effectiveness'
|
||||
)
|
||||
})
|
||||
|
||||
it('should be less than the max title length', async () => {
|
||||
const doc = await getDocument('./test/pdf/data/pdf-complex-test.pdf')
|
||||
const result = await getDocumentTitle(doc)
|
||||
expect(result?.length).to.lessThanOrEqual(95)
|
||||
})
|
||||
})
|
||||
|
||||
describe('open a PDF with metadata set', () => {
|
||||
it('should return metadata', async () => {
|
||||
const parsed = await parsePdf(
|
||||
new URL('file://' + __dirname + '/data/welcome_to_your_library.pdf')
|
||||
)
|
||||
expect(parsed.title).to.eq('Welcome to your Omnivore Library')
|
||||
expect(parsed.author).to.eq('Jackson Harper')
|
||||
expect(parsed.description).to.eq('This is the description of my PDF')
|
||||
})
|
||||
})
|
||||
9
packages/pdf-handler/tsconfig.json
Normal file
9
packages/pdf-handler/tsconfig.json
Normal file
@ -0,0 +1,9 @@
|
||||
{
|
||||
"extends": "@tsconfig/node14/tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "build",
|
||||
"rootDir": ".",
|
||||
"lib": ["dom"]
|
||||
},
|
||||
"include": ["src", "test"]
|
||||
}
|
||||
Reference in New Issue
Block a user