Update matter import handler to use archives instead of just history files

This commit is contained in:
Jackson Harper
2023-01-11 15:35:05 +08:00
parent 126373a10d
commit e392af4800
10 changed files with 473 additions and 48 deletions

View File

@ -20,8 +20,10 @@
"deploy": "yarn build && yarn gcloud-deploy"
},
"devDependencies": {
"@types/node": "^14.11.2",
"@types/fs-extra": "^11.0.1",
"@types/jsonwebtoken": "^8.5.0",
"@types/node": "^14.11.2",
"@types/unzip-stream": "^0.3.1",
"eslint-plugin-prettier": "^4.0.0"
},
"dependencies": {
@ -29,9 +31,14 @@
"@google-cloud/functions-framework": "3.1.2",
"@google-cloud/storage": "^5.18.1",
"@google-cloud/tasks": "^3.0.5",
"@omnivore/content-handler": "1.0.0",
"@omnivore/readability": "1.0.0",
"@types/express": "^4.17.13",
"csv-parser": "^3.0.0",
"dompurify": "^2.4.3",
"fs-extra": "^11.1.0",
"jsonwebtoken": "^8.5.1",
"nodemon": "^2.0.15"
"nodemon": "^2.0.15",
"unzip-stream": "^0.3.1"
}
}

View File

@ -5,24 +5,19 @@
import { parse } from '@fast-csv/parse'
import { Stream } from 'stream'
import { ImportContext } from '.'
export type UrlHandler = (url: URL) => Promise<void>
export const importCsv = async (
stream: Stream,
handler: UrlHandler
): Promise<number> => {
export const importCsv = async (stream: Stream, ctx: ImportContext) => {
const parser = parse()
stream.pipe(parser)
let count = 0
for await (const row of parser) {
try {
const url = new URL(row[0])
await handler(url)
await ctx.urlHandler(ctx, url)
ctx.countImported += 1
} catch (error) {
console.log('invalid url', row, error)
ctx.countFailed += 1
}
count++
}
return count
}

View File

@ -3,30 +3,50 @@ import {
CloudFunctionsContext,
} from '@google-cloud/functions-framework/build/src/functions'
import { Storage } from '@google-cloud/storage'
import { importCsv, UrlHandler } from './csv'
import { importCsv } from './csv'
import * as path from 'path'
import { importMatterHistory } from './matterHistory'
import { importMatterArchive, importMatterHistoryCsv } from './matterHistory'
import { Stream } from 'node:stream'
import { v4 as uuid } from 'uuid'
import { CONTENT_FETCH_URL, createCloudTask, EMAIL_USER_URL } from './task'
import { promisify } from 'util'
import * as jwt from 'jsonwebtoken'
import { Readability } from '@omnivore/readability'
// Promisified jwt.sign so tokens can be awaited instead of using callbacks.
const signToken = promisify(jwt.sign)
const storage = new Storage()
// Upload content types this import function accepts: plain CSV lists
// and zipped Matter archives.
const CONTENT_TYPES = ['text/csv', 'application/zip']
// Shape of the Cloud Storage object event payload this handler consumes.
interface StorageEventData {
  bucket: string
  name: string
  contentType: string
}
// Invoked once per imported URL; implementations should record
// success/failure on the shared ImportContext.
export type UrlHandler = (ctx: ImportContext, url: URL) => Promise<void>
// Invoked when an importer already has the page content and a
// Readability parse result (e.g. from a Matter archive), so the URL
// does not need to be fetched again.
export type ContentHandler = (
  ctx: ImportContext,
  url: URL,
  title: string,
  originalContent: string,
  parseResult: Readability.ParseResult
) => Promise<void>
// Mutable state threaded through one import run: the importing user,
// running counters, and the handlers applied to each row.
export type ImportContext = {
  userId: string
  countImported: number
  countFailed: number
  urlHandler: UrlHandler
  contentHandler: ContentHandler
}
type importHandlerFunc = (
stream: Stream,
handler: UrlHandler
) => Promise<number>
handler: ImportContext
) => Promise<void>
const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
console.log('deciding to handle', ctx, data)
@ -35,7 +55,7 @@ const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
}
if (
!data.name.startsWith('imports/') ||
data.contentType.toLowerCase() != 'text/csv'
CONTENT_TYPES.indexOf(data.contentType.toLocaleLowerCase()) == -1
) {
return false
}
@ -93,7 +113,7 @@ const sendImportCompletedEmail = async (
const handlerForFile = (name: string): importHandlerFunc | undefined => {
const fileName = path.parse(name).name
if (fileName.startsWith('MATTER')) {
return importMatterHistory
return importMatterArchive
} else if (fileName.startsWith('URL_LIST')) {
return importCsv
}
@ -101,6 +121,35 @@ const handlerForFile = (name: string): importHandlerFunc | undefined => {
return undefined
}
/**
 * Default UrlHandler: saves a URL via the content-fetch importer.
 *
 * Counting is left to the callers' per-row loops (importCsv /
 * importMatterArchive both bump countImported after a successful handler
 * call and countFailed in their catch). Previously this handler ALSO
 * bumped countImported, double-counting every successful row, and
 * swallowed errors so failed imports were counted as successes. It now
 * rethrows on failure so the row is counted as failed by the caller.
 */
const urlHandler = async (ctx: ImportContext, url: URL): Promise<void> => {
  try {
    // Imports are stored in the format imports/<user id>/<type>-<uuid>.csv
    const result = await importURL(ctx.userId, url, 'csv-importer')
    if (!result) {
      throw new Error(`importURL returned no result for ${url.href}`)
    }
  } catch (err) {
    console.log('error importing url', err)
    // Propagate so the row loop records this row as failed.
    throw err
  }
}
// Placeholder ContentHandler: intended to save already-fetched page
// content (e.g. extracted from a Matter archive) via the savePage
// mutation instead of re-fetching the URL. Currently a no-op — the
// mutation call below is still commented out, so rows routed here are
// counted but their content is not persisted yet.
const contentHandler = async (
  ctx: ImportContext,
  url: URL,
  title: string,
  originalContent: string,
  parseResult: Readability.ParseResult
): Promise<void> => {
  // const apiResponse = await sendSavePageMutation(userId, {
  //   url: finalUrl,
  //   clientRequestId: articleSavingRequestId,
  //   title,
  //   originalContent: content,
  //   parseResult: readabilityResult,
  // })
  return Promise.resolve()
}
export const importHandler: EventFunction = async (event, context) => {
const data = event as StorageEventData
const ctx = context as CloudFunctionsContext
@ -131,18 +180,14 @@ export const importHandler: EventFunction = async (event, context) => {
return
}
let countFailed = 0
let countImported = 0
await handler(stream, async (url): Promise<void> => {
try {
// Imports are stored in the format imports/<user id>/<type>-<uuid>.csv
const result = await importURL(userId, url, 'csv-importer')
console.log('import url result', result)
countImported = countImported + 1
} catch (err) {
console.log('error importing url', err)
countFailed = countFailed + 1
}
const countFailed = 0
const countImported = 0
await handler(stream, {
userId,
countImported: 0,
countFailed: 0,
urlHandler,
contentHandler,
})
if (countImported <= 1) {

View File

@ -5,28 +5,234 @@
import { parse } from '@fast-csv/parse'
import { Stream } from 'stream'
import unzip from 'unzip-stream'
import fs from 'fs'
import path from 'path'
import * as fsExtra from 'fs-extra'
import { parseHTML } from 'linkedom'
import { Readability } from '@omnivore/readability'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { encode } from 'urlsafe-base64'
import crypto from 'crypto'
import { ImportContext } from '.'
export type UrlHandler = (url: URL) => Promise<void>
export const importMatterHistory = async (
export const importMatterHistoryCsv = async (
stream: Stream,
handler: UrlHandler
): Promise<number> => {
ctx: ImportContext
): Promise<void> => {
const parser = parse({
headers: true,
strictColumnHandling: false,
})
stream.pipe(parser)
let count = 0
for await (const row of parser) {
try {
const url = new URL(row['URL'])
await handler(url)
await ctx.urlHandler(ctx, url)
ctx.countImported += 1
} catch (error) {
console.log('invalid url', row, error)
ctx.countFailed += 1
}
}
}
// DOMPurify options for imported article HTML: keep <iframe> elements
// (later restricted to YouTube embeds by domPurifySanitizeHook) and
// strip newsletter/tracking data-* attributes.
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
// DOMPurify `uponSanitizeElement` hook that restricts <iframe> elements
// to YouTube embeds. A lazy-load `data-src` pointing at YouTube is
// promoted to `src`; any other iframe is removed from the document.
function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
  if (data.tagName !== 'iframe') {
    return
  }
  const youtubeEmbed = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i

  const src = node.getAttribute('src') || ''
  if (src && youtubeEmbed.test(src)) {
    return
  }

  const dataSrc = node.getAttribute('data-src') || ''
  if (dataSrc && youtubeEmbed.test(dataSrc)) {
    node.setAttribute('src', dataSrc)
    return
  }

  node.parentNode?.removeChild(node)
}
// Sanitize raw article HTML with DOMPurify (using the iframe hook above)
// and return the cleaned markup re-parsed into a linkedom document.
function getPurifiedContent(html: string) {
  const dom = parseHTML('')
  const purifier = createDOMPurify(dom)
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitized).document
}
// Rewrite an image URL through the image proxy when proxying is
// configured (IMAGE_PROXY_URL + IMAGE_PROXY_SECRET); otherwise return
// the URL untouched. The requested dimensions are embedded in both the
// signed payload and the proxy path.
function createImageProxyUrl(url: string, width = 0, height = 0) {
  const proxyBase = process.env.IMAGE_PROXY_URL
  const proxySecret = process.env.IMAGE_PROXY_SECRET
  if (!proxyBase || !proxySecret) {
    return url
  }
  const signature = signImageProxyUrl(`${url}#${width}x${height}`)
  return `${proxyBase}/${width}x${height},s${signature}/${url}`
}
// HMAC-SHA256 sign a proxy URL with IMAGE_PROXY_SECRET, returned
// URL-safe base64 encoded. Falls back to returning the input unchanged
// when no secret is configured.
function signImageProxyUrl(url: string) {
  const secret = process.env.IMAGE_PROXY_SECRET
  if (!secret) {
    return url
  }
  const digest = crypto.createHmac('sha256', secret).update(url).digest()
  return encode(digest)
}
// Run Readability over sanitized article HTML. Returns the parse result
// when extraction succeeds, or null when parsing fails or produces
// nothing.
async function getReadabilityResult(url: string, originalContent: string) {
  const doc = getPurifiedContent(originalContent)
  try {
    const parsed = await new Readability(doc, {
      createImageProxyUrl,
      url,
    }).parse()
    if (parsed) {
      return parsed
    }
  } catch (error) {
    console.log('parsing error for url', url, error)
  }
  return null
}
/**
 * Extract a zip stream into a fresh working directory and resolve with
 * the directory path. Callers are responsible for deleting the
 * directory afterwards (see importMatterArchive's finally block).
 *
 * Fix: an 'error' emitted by the SOURCE stream previously had no
 * listener, so the returned promise would hang forever on a failed
 * download; it now rejects.
 */
const unarchive = async (stream: Stream): Promise<string> => {
  // NOTE(review): directory is created in the process cwd and named by a
  // hex timestamp — assumes no two unarchive calls in the same millisecond.
  const archiveDir = `./archive-${Date.now().toString(16)}`
  await fsExtra.emptyDir(archiveDir)

  return new Promise((resolve, reject) => {
    // Reject on read errors too, not just extraction errors.
    stream.on('error', reject)
    stream
      .pipe(unzip.Extract({ path: archiveDir }))
      .on('close', () => {
        resolve(archiveDir)
      })
      .on('error', reject)
  })
}
// Read the archived HTML for one history row. The row's `File Id`
// column is a path relative to the extracted archive directory.
// Returns undefined (after logging) when the file is missing/unreadable.
const getMatterHistoryContent = (
  archiveDir: string,
  row: Record<string, string>
) => {
  try {
    const fileId = row['File Id']
    return fs.readFileSync(path.join(archiveDir, fileId)).toString()
  } catch (err) {
    console.log('error getting matter history content: ', { row, err })
    return undefined
  }
}
// Parse a string into a URL, returning undefined (after logging) for
// empty/missing input or anything the URL constructor rejects.
const getURL = (str: string | undefined) => {
  if (!str) {
    return undefined
  }
  try {
    return new URL(str)
  } catch (err) {
    console.log('error parsing url', { str, err })
    return undefined
  }
}
const handleMatterHistoryRow = async (
ctx: ImportContext,
archiveDir: string,
row: Record<string, string>
) => {
const title = row['Title']
const urlStr = row['URL']
const url = getURL(urlStr)
if (!url) {
ctx.countFailed += 1
return
}
const originalContent = getMatterHistoryContent(archiveDir, row)
const readabilityResult = originalContent
? await getReadabilityResult(urlStr, originalContent)
: null
if (originalContent && readabilityResult) {
await ctx.contentHandler(
ctx,
url,
title,
originalContent,
readabilityResult
)
} else {
await ctx.urlHandler(ctx, url)
}
}
/**
 * Import a zipped Matter archive: extract it, stream-parse the bundled
 * `_matter_history.csv`, and hand each row to handleMatterHistoryRow.
 * The extracted directory is always removed afterwards.
 *
 * Fix: removed the stray diff-residue lines `count++` / `return count`
 * left over from the replaced pre-commit implementation — `count` was
 * undeclared and `return count` contradicted the Promise<void>
 * signature, so the block did not compile.
 */
export const importMatterArchive = async (
  stream: Stream,
  ctx: ImportContext
): Promise<void> => {
  const archiveDir = await unarchive(stream)
  try {
    const historyFile = path.join(archiveDir, '_matter_history.csv')
    const parser = parse({
      headers: true,
      strictColumnHandling: false,
    })
    fs.createReadStream(historyFile).pipe(parser)

    for await (const row of parser) {
      try {
        await handleMatterHistoryRow(ctx, archiveDir, row)
        ctx.countImported += 1
      } catch (error) {
        console.log('invalid url', row, error)
        ctx.countFailed += 1
      }
    }
  } catch (err) {
    console.log('error handling archive: ', { err })
  } finally {
    // Best-effort cleanup of the extraction directory.
    try {
      await fsExtra.rm(archiveDir, { recursive: true, force: true })
    } catch (err) {
      console.log('Error removing archive directory', { err })
    }
  }
}

View File

@ -0,0 +1,108 @@
// Type definitions for non-npm package mozilla-readability 0.2
// Project: https://github.com/mozilla/readability
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
// TypeScript Version: 2.2
declare module '@omnivore/readability' {
  class Readability {
    constructor(doc: Document, options?: Readability.Options)
    // Fix: the 'async' modifier is not permitted in an ambient
    // declaration (TS1040); the Promise return type alone describes the
    // asynchronous contract.
    parse(): Promise<Readability.ParseResult | null>
  }

  namespace Readability {
    interface Options {
      /**
       * Control whether log messages are sent to the console
       */
      debug?: boolean

      /**
       * Set a maximum size on the documents that will be processed. This size is
       * checked before any parsing operations occur. If the number of elements in
       * the document exceeds this threshold then an Error will be thrown.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
       */
      maxElemsToParse?: number

      // Number of top candidate nodes Readability considers when scoring
      // article content.
      nbTopCandidates?: number

      /**
       * Minimum number of characters in the extracted textContent in order to
       * consider the article correctly identified. If the threshold is not met then
       * the extraction process will automatically run again with different flags.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
       *
       * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
       */
      charThreshold?: number

      /**
       * parse() removes the class="" attribute from every element in the given
       * subtree, except those that match CLASSES_TO_PRESERVE and
       * the classesToPreserve array from the options object.
       */
      classesToPreserve?: string[]

      /**
       * By default Readability will strip all classes from the HTML elements in the
       * processed article. By setting this to `true` the classes will be retained.
       *
       * This is a blanket alternative to `classesToPreserve`.
       *
       * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
       */
      keepClasses?: boolean

      // Base URL used to resolve relative links in the parsed document.
      url?: string

      /**
       * Function that converts a regular image url into imageproxy url
       * @param url string
       */
      createImageProxyUrl?: (
        url: string,
        width?: number,
        height?: number
      ) => string

      /**
       * By default, Readability will clean all tables from the HTML elements in the
       * processed article. But newsletters in emails use tables to display their content.
       * By setting this to `true`, these tables will be retained.
       */
      keepTables?: boolean
    }

    interface ParseResult {
      /** Article title */
      title: string
      /** Author metadata */
      byline?: string | null
      /** Content direction */
      dir?: string | null
      /** HTML string of processed article content */
      content: string
      /** non-HTML version of `content` */
      textContent: string
      /** Length of an article, in characters */
      length: number
      /** Article description, or short excerpt from the content */
      excerpt: string
      /** Article site name */
      siteName?: string | null
      /** Article site icon */
      siteIcon?: string | null
      /** Article preview image */
      previewImage?: string | null
      /** Article published date */
      publishedDate?: Date | null
      /** Detected content language, when available */
      language?: string | null
    }
  }

  export { Readability }
}

View File

@ -4,6 +4,8 @@ import { expect } from 'chai'
import chaiString from 'chai-string'
import * as fs from 'fs'
import { importCsv } from '../../src/csv'
import { ImportContext } from '../../src'
import { stubImportCtx } from '../util'
chai.use(chaiString)
@ -11,11 +13,15 @@ describe('Load a simple CSV file', () => {
it('should call the handler for each URL', async () => {
const urls: URL[] = []
const stream = fs.createReadStream('./test/csv/data/simple.csv')
const count = await importCsv(stream, (url): Promise<void> => {
const stub = stubImportCtx()
stub.urlHandler = (ctx: ImportContext, url): Promise<void> => {
urls.push(url)
return Promise.resolve()
})
expect(count).to.equal(2)
}
await importCsv(stream, stub)
expect(stub.countFailed).to.equal(0)
expect(stub.countImported).to.equal(2)
expect(urls).to.eql([
new URL('https://omnivore.app'),
new URL('https://google.com'),

Binary file not shown.

View File

@ -3,7 +3,13 @@ import * as chai from 'chai'
import { expect } from 'chai'
import chaiString from 'chai-string'
import * as fs from 'fs'
import { importMatterHistory } from '../../src/matterHistory'
import {
importMatterArchive,
importMatterHistoryCsv,
} from '../../src/matterHistory'
import { stubImportCtx } from '../util'
import { ImportContext } from '../../src'
import { Readability } from '@omnivore/readability'
chai.use(chaiString)
@ -11,11 +17,40 @@ describe('Load a simple _matter_history file', () => {
it('should find the URL of each row', async () => {
const urls: URL[] = []
const stream = fs.createReadStream('./test/matter/data/_matter_history.csv')
const count = await importMatterHistory(stream, (url): Promise<void> => {
const stub = stubImportCtx()
stub.urlHandler = (ctx: ImportContext, url): Promise<void> => {
urls.push(url)
return Promise.resolve()
})
expect(count).to.equal(1)
}
await importMatterHistoryCsv(stream, stub)
expect(stub.countFailed).to.equal(0)
expect(stub.countImported).to.equal(1)
expect(urls).to.eql([
new URL('https://www.bloomberg.com/features/2022-the-crypto-story/'),
])
})
})
describe('Load archive file', () => {
it('should find the URL of each row', async () => {
const urls: URL[] = []
const stream = fs.createReadStream('./test/matter/data/Archive.zip')
const stub = stubImportCtx()
stub.contentHandler = (
ctx: ImportContext,
url: URL,
title: string,
originalContent: string,
parseResult: Readability.ParseResult
): Promise<void> => {
urls.push(url)
return Promise.resolve()
}
await importMatterArchive(stream, stub)
expect(stub.countFailed).to.equal(0)
expect(stub.countImported).to.equal(1)
expect(urls).to.eql([
new URL('https://www.bloomberg.com/features/2022-the-crypto-story/'),
])

View File

@ -0,0 +1,22 @@
import { Readability } from '@omnivore/readability'
import { ImportContext } from '../src'
/**
 * Build a no-op ImportContext for tests: zeroed counters and handlers
 * that resolve immediately. Tests override urlHandler/contentHandler as
 * needed. The explicit return type keeps the stub checked against the
 * ImportContext contract at compile time (previously it was inferred).
 */
export const stubImportCtx = (): ImportContext => {
  return {
    userId: '',
    countImported: 0,
    countFailed: 0,
    urlHandler: (ctx: ImportContext, url: URL): Promise<void> => {
      return Promise.resolve()
    },
    contentHandler: (
      ctx: ImportContext,
      url: URL,
      title: string,
      originalContent: string,
      parseResult: Readability.ParseResult
    ): Promise<void> => {
      return Promise.resolve()
    },
  }
}

View File

@ -1,9 +1,10 @@
{
  "extends": "./../../tsconfig.json",
  "ts-node": {
    "files": true
  },
  "compilerOptions": {
    "outDir": "dist"
  },
  "include": ["src", "test"]
}