Update matter import handler to use archives instead of just history files

This commit is contained in:
Jackson Harper
2023-01-11 15:35:05 +08:00
parent 126373a10d
commit e392af4800
10 changed files with 473 additions and 48 deletions

View File

@ -20,8 +20,10 @@
"deploy": "yarn build && yarn gcloud-deploy"
},
"devDependencies": {
"@types/node": "^14.11.2",
"@types/fs-extra": "^11.0.1",
"@types/jsonwebtoken": "^8.5.0",
"@types/node": "^14.11.2",
"@types/unzip-stream": "^0.3.1",
"eslint-plugin-prettier": "^4.0.0"
},
"dependencies": {
@ -29,9 +31,14 @@
"@google-cloud/functions-framework": "3.1.2",
"@google-cloud/storage": "^5.18.1",
"@google-cloud/tasks": "^3.0.5",
"@omnivore/content-handler": "1.0.0",
"@omnivore/readability": "1.0.0",
"@types/express": "^4.17.13",
"csv-parser": "^3.0.0",
"dompurify": "^2.4.3",
"fs-extra": "^11.1.0",
"jsonwebtoken": "^8.5.1",
"nodemon": "^2.0.15"
"nodemon": "^2.0.15",
"unzip-stream": "^0.3.1"
}
}

View File

@ -5,24 +5,19 @@
import { parse } from '@fast-csv/parse'
import { Stream } from 'stream'
import { ImportContext } from '.'
export type UrlHandler = (url: URL) => Promise<void>
export const importCsv = async (
stream: Stream,
handler: UrlHandler
): Promise<number> => {
export const importCsv = async (stream: Stream, ctx: ImportContext) => {
const parser = parse()
stream.pipe(parser)
let count = 0
for await (const row of parser) {
try {
const url = new URL(row[0])
await handler(url)
await ctx.urlHandler(ctx, url)
ctx.countImported += 1
} catch (error) {
console.log('invalid url', row, error)
ctx.countFailed += 1
}
count++
}
return count
}

View File

@ -3,30 +3,50 @@ import {
CloudFunctionsContext,
} from '@google-cloud/functions-framework/build/src/functions'
import { Storage } from '@google-cloud/storage'
import { importCsv, UrlHandler } from './csv'
import { importCsv } from './csv'
import * as path from 'path'
import { importMatterHistory } from './matterHistory'
import { importMatterArchive, importMatterHistoryCsv } from './matterHistory'
import { Stream } from 'node:stream'
import { v4 as uuid } from 'uuid'
import { CONTENT_FETCH_URL, createCloudTask, EMAIL_USER_URL } from './task'
import { promisify } from 'util'
import * as jwt from 'jsonwebtoken'
import { Readability } from '@omnivore/readability'
// Promisified jwt.sign so tokens can be awaited instead of using callbacks.
const signToken = promisify(jwt.sign)
const storage = new Storage()
// Upload content types this import function accepts: plain CSV lists
// and zipped Matter archives.
const CONTENT_TYPES = ['text/csv', 'application/zip']
// Shape of the Cloud Storage object event payload this handler consumes.
interface StorageEventData {
  bucket: string
  name: string
  contentType: string
}
// Invoked once per imported URL; implementations should record
// success/failure on the shared ImportContext.
export type UrlHandler = (ctx: ImportContext, url: URL) => Promise<void>
// Invoked when an importer already has the page content and a
// Readability parse result (e.g. from a Matter archive), so the URL
// does not need to be fetched again.
export type ContentHandler = (
  ctx: ImportContext,
  url: URL,
  title: string,
  originalContent: string,
  parseResult: Readability.ParseResult
) => Promise<void>
// Mutable state threaded through one import run: the importing user,
// running counters, and the handlers applied to each row.
export type ImportContext = {
  userId: string
  countImported: number
  countFailed: number
  urlHandler: UrlHandler
  contentHandler: ContentHandler
}
type importHandlerFunc = (
stream: Stream,
handler: UrlHandler
) => Promise<number>
handler: ImportContext
) => Promise<void>
const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
console.log('deciding to handle', ctx, data)
@ -35,7 +55,7 @@ const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
}
if (
!data.name.startsWith('imports/') ||
data.contentType.toLowerCase() != 'text/csv'
CONTENT_TYPES.indexOf(data.contentType.toLocaleLowerCase()) == -1
) {
return false
}
@ -93,7 +113,7 @@ const sendImportCompletedEmail = async (
const handlerForFile = (name: string): importHandlerFunc | undefined => {
const fileName = path.parse(name).name
if (fileName.startsWith('MATTER')) {
return importMatterHistory
return importMatterArchive
} else if (fileName.startsWith('URL_LIST')) {
return importCsv
}
@ -101,6 +121,35 @@ const handlerForFile = (name: string): importHandlerFunc | undefined => {
return undefined
}
/**
 * Default UrlHandler: saves a URL via the content-fetch importer.
 *
 * Counting is left to the callers' per-row loops (importCsv /
 * importMatterArchive both bump countImported after a successful handler
 * call and countFailed in their catch). Previously this handler ALSO
 * bumped countImported, double-counting every successful row, and
 * swallowed errors so failed imports were counted as successes. It now
 * rethrows on failure so the row is counted as failed by the caller.
 */
const urlHandler = async (ctx: ImportContext, url: URL): Promise<void> => {
  try {
    // Imports are stored in the format imports/<user id>/<type>-<uuid>.csv
    const result = await importURL(ctx.userId, url, 'csv-importer')
    if (!result) {
      throw new Error(`importURL returned no result for ${url.href}`)
    }
  } catch (err) {
    console.log('error importing url', err)
    // Propagate so the row loop records this row as failed.
    throw err
  }
}
// Placeholder ContentHandler: intended to save already-fetched page
// content (e.g. extracted from a Matter archive) via the savePage
// mutation instead of re-fetching the URL. Currently a no-op — the
// mutation call below is still commented out, so rows routed here are
// counted but their content is not persisted yet.
const contentHandler = async (
  ctx: ImportContext,
  url: URL,
  title: string,
  originalContent: string,
  parseResult: Readability.ParseResult
): Promise<void> => {
  // const apiResponse = await sendSavePageMutation(userId, {
  //   url: finalUrl,
  //   clientRequestId: articleSavingRequestId,
  //   title,
  //   originalContent: content,
  //   parseResult: readabilityResult,
  // })
  return Promise.resolve()
}
export const importHandler: EventFunction = async (event, context) => {
const data = event as StorageEventData
const ctx = context as CloudFunctionsContext
@ -131,18 +180,14 @@ export const importHandler: EventFunction = async (event, context) => {
return
}
let countFailed = 0
let countImported = 0
await handler(stream, async (url): Promise<void> => {
try {
// Imports are stored in the format imports/<user id>/<type>-<uuid>.csv
const result = await importURL(userId, url, 'csv-importer')
console.log('import url result', result)
countImported = countImported + 1
} catch (err) {
console.log('error importing url', err)
countFailed = countFailed + 1
}
const countFailed = 0
const countImported = 0
await handler(stream, {
userId,
countImported: 0,
countFailed: 0,
urlHandler,
contentHandler,
})
if (countImported <= 1) {

View File

@ -5,28 +5,234 @@
import { parse } from '@fast-csv/parse'
import { Stream } from 'stream'
import unzip from 'unzip-stream'
import fs from 'fs'
import path from 'path'
import * as fsExtra from 'fs-extra'
import { parseHTML } from 'linkedom'
import { Readability } from '@omnivore/readability'
import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
import { encode } from 'urlsafe-base64'
import crypto from 'crypto'
import { ImportContext } from '.'
export type UrlHandler = (url: URL) => Promise<void>
export const importMatterHistory = async (
export const importMatterHistoryCsv = async (
stream: Stream,
handler: UrlHandler
): Promise<number> => {
ctx: ImportContext
): Promise<void> => {
const parser = parse({
headers: true,
strictColumnHandling: false,
})
stream.pipe(parser)
let count = 0
for await (const row of parser) {
try {
const url = new URL(row['URL'])
await handler(url)
await ctx.urlHandler(ctx, url)
ctx.countImported += 1
} catch (error) {
console.log('invalid url', row, error)
ctx.countFailed += 1
}
}
}
// DOMPurify options for imported article HTML: keep <iframe> elements
// (later restricted to YouTube embeds by domPurifySanitizeHook) and
// strip newsletter/tracking data-* attributes.
const DOM_PURIFY_CONFIG = {
  ADD_TAGS: ['iframe'],
  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
  FORBID_ATTR: [
    'data-ml-dynamic',
    'data-ml-dynamic-type',
    'data-orig-url',
    'data-ml-id',
    'data-ml',
    'data-xid',
    'data-feature',
  ],
}
// DOMPurify `uponSanitizeElement` hook that restricts <iframe> elements
// to YouTube embeds. A lazy-load `data-src` pointing at YouTube is
// promoted to `src`; any other iframe is removed from the document.
function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
  if (data.tagName !== 'iframe') {
    return
  }
  const youtubeEmbed = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i

  const src = node.getAttribute('src') || ''
  if (src && youtubeEmbed.test(src)) {
    return
  }

  const dataSrc = node.getAttribute('data-src') || ''
  if (dataSrc && youtubeEmbed.test(dataSrc)) {
    node.setAttribute('src', dataSrc)
    return
  }

  node.parentNode?.removeChild(node)
}
// Sanitize raw article HTML with DOMPurify (using the iframe hook above)
// and return the cleaned markup re-parsed into a linkedom document.
function getPurifiedContent(html: string) {
  const dom = parseHTML('')
  const purifier = createDOMPurify(dom)
  purifier.addHook('uponSanitizeElement', domPurifySanitizeHook)
  const sanitized = purifier.sanitize(html, DOM_PURIFY_CONFIG)
  return parseHTML(sanitized).document
}
// Rewrite an image URL through the image proxy when proxying is
// configured (IMAGE_PROXY_URL + IMAGE_PROXY_SECRET); otherwise return
// the URL untouched. The requested dimensions are embedded in both the
// signed payload and the proxy path.
function createImageProxyUrl(url: string, width = 0, height = 0) {
  const proxyBase = process.env.IMAGE_PROXY_URL
  const proxySecret = process.env.IMAGE_PROXY_SECRET
  if (!proxyBase || !proxySecret) {
    return url
  }
  const signature = signImageProxyUrl(`${url}#${width}x${height}`)
  return `${proxyBase}/${width}x${height},s${signature}/${url}`
}
// HMAC-SHA256 sign a proxy URL with IMAGE_PROXY_SECRET, returned
// URL-safe base64 encoded. Falls back to returning the input unchanged
// when no secret is configured.
function signImageProxyUrl(url: string) {
  const secret = process.env.IMAGE_PROXY_SECRET
  if (!secret) {
    return url
  }
  const digest = crypto.createHmac('sha256', secret).update(url).digest()
  return encode(digest)
}
// Run Readability over sanitized article HTML. Returns the parse result
// when extraction succeeds, or null when parsing fails or produces
// nothing.
async function getReadabilityResult(url: string, originalContent: string) {
  const doc = getPurifiedContent(originalContent)
  try {
    const parsed = await new Readability(doc, {
      createImageProxyUrl,
      url,
    }).parse()
    if (parsed) {
      return parsed
    }
  } catch (error) {
    console.log('parsing error for url', url, error)
  }
  return null
}
/**
 * Extract a zip stream into a fresh working directory and resolve with
 * the directory path. Callers are responsible for deleting the
 * directory afterwards (see importMatterArchive's finally block).
 *
 * Fix: an 'error' emitted by the SOURCE stream previously had no
 * listener, so the returned promise would hang forever on a failed
 * download; it now rejects.
 */
const unarchive = async (stream: Stream): Promise<string> => {
  // NOTE(review): directory is created in the process cwd and named by a
  // hex timestamp — assumes no two unarchive calls in the same millisecond.
  const archiveDir = `./archive-${Date.now().toString(16)}`
  await fsExtra.emptyDir(archiveDir)

  return new Promise((resolve, reject) => {
    // Reject on read errors too, not just extraction errors.
    stream.on('error', reject)
    stream
      .pipe(unzip.Extract({ path: archiveDir }))
      .on('close', () => {
        resolve(archiveDir)
      })
      .on('error', reject)
  })
}
// Read the archived HTML for one history row. The row's `File Id`
// column is a path relative to the extracted archive directory.
// Returns undefined (after logging) when the file is missing/unreadable.
const getMatterHistoryContent = (
  archiveDir: string,
  row: Record<string, string>
) => {
  try {
    const fileId = row['File Id']
    return fs.readFileSync(path.join(archiveDir, fileId)).toString()
  } catch (err) {
    console.log('error getting matter history content: ', { row, err })
    return undefined
  }
}
// Parse a string into a URL, returning undefined (after logging) for
// empty/missing input or anything the URL constructor rejects.
const getURL = (str: string | undefined) => {
  if (!str) {
    return undefined
  }
  try {
    return new URL(str)
  } catch (err) {
    console.log('error parsing url', { str, err })
    return undefined
  }
}
const handleMatterHistoryRow = async (
ctx: ImportContext,
archiveDir: string,
row: Record<string, string>
) => {
const title = row['Title']
const urlStr = row['URL']
const url = getURL(urlStr)
if (!url) {
ctx.countFailed += 1
return
}
const originalContent = getMatterHistoryContent(archiveDir, row)
const readabilityResult = originalContent
? await getReadabilityResult(urlStr, originalContent)
: null
if (originalContent && readabilityResult) {
await ctx.contentHandler(
ctx,
url,
title,
originalContent,
readabilityResult
)
} else {
await ctx.urlHandler(ctx, url)
}
}
/**
 * Import a zipped Matter archive: extract it, stream-parse the bundled
 * `_matter_history.csv`, and hand each row to handleMatterHistoryRow.
 * The extracted directory is always removed afterwards.
 *
 * Fix: removed the stray diff-residue lines `count++` / `return count`
 * left over from the replaced pre-commit implementation — `count` was
 * undeclared and `return count` contradicted the Promise<void>
 * signature, so the block did not compile.
 */
export const importMatterArchive = async (
  stream: Stream,
  ctx: ImportContext
): Promise<void> => {
  const archiveDir = await unarchive(stream)
  try {
    const historyFile = path.join(archiveDir, '_matter_history.csv')
    const parser = parse({
      headers: true,
      strictColumnHandling: false,
    })
    fs.createReadStream(historyFile).pipe(parser)

    for await (const row of parser) {
      try {
        await handleMatterHistoryRow(ctx, archiveDir, row)
        ctx.countImported += 1
      } catch (error) {
        console.log('invalid url', row, error)
        ctx.countFailed += 1
      }
    }
  } catch (err) {
    console.log('error handling archive: ', { err })
  } finally {
    // Best-effort cleanup of the extraction directory.
    try {
      await fsExtra.rm(archiveDir, { recursive: true, force: true })
    } catch (err) {
      console.log('Error removing archive directory', { err })
    }
  }
}

View File

@ -0,0 +1,108 @@
// Type definitions for non-npm package mozilla-readability 0.2
// Project: https://github.com/mozilla/readability
// Definitions by: Charles Vandevoorde <https://github.com/charlesvdv>, Alex Wendland <https://github.com/awendland>
// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
// TypeScript Version: 2.2
declare module '@omnivore/readability' {
  class Readability {
    constructor(doc: Document, options?: Readability.Options)
    // Fix: the 'async' modifier is not permitted in an ambient
    // declaration (TS1040); the Promise return type alone describes the
    // asynchronous contract.
    parse(): Promise<Readability.ParseResult | null>
  }

  namespace Readability {
    interface Options {
      /**
       * Control whether log messages are sent to the console
       */
      debug?: boolean

      /**
       * Set a maximum size on the documents that will be processed. This size is
       * checked before any parsing operations occur. If the number of elements in
       * the document exceeds this threshold then an Error will be thrown.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
       */
      maxElemsToParse?: number

      // Number of top candidate nodes Readability considers when scoring
      // article content.
      nbTopCandidates?: number

      /**
       * Minimum number of characters in the extracted textContent in order to
       * consider the article correctly identified. If the threshold is not met then
       * the extraction process will automatically run again with different flags.
       *
       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
       *
       * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
       */
      charThreshold?: number

      /**
       * parse() removes the class="" attribute from every element in the given
       * subtree, except those that match CLASSES_TO_PRESERVE and
       * the classesToPreserve array from the options object.
       */
      classesToPreserve?: string[]

      /**
       * By default Readability will strip all classes from the HTML elements in the
       * processed article. By setting this to `true` the classes will be retained.
       *
       * This is a blanket alternative to `classesToPreserve`.
       *
       * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
       */
      keepClasses?: boolean

      // Base URL used to resolve relative links in the parsed document.
      url?: string

      /**
       * Function that converts a regular image url into imageproxy url
       * @param url string
       */
      createImageProxyUrl?: (
        url: string,
        width?: number,
        height?: number
      ) => string

      /**
       * By default, Readability will clean all tables from the HTML elements in the
       * processed article. But newsletters in emails use tables to display their content.
       * By setting this to `true`, these tables will be retained.
       */
      keepTables?: boolean
    }

    interface ParseResult {
      /** Article title */
      title: string
      /** Author metadata */
      byline?: string | null
      /** Content direction */
      dir?: string | null
      /** HTML string of processed article content */
      content: string
      /** non-HTML version of `content` */
      textContent: string
      /** Length of an article, in characters */
      length: number
      /** Article description, or short excerpt from the content */
      excerpt: string
      /** Article site name */
      siteName?: string | null
      /** Article site icon */
      siteIcon?: string | null
      /** Article preview image */
      previewImage?: string | null
      /** Article published date */
      publishedDate?: Date | null
      /** Detected content language, when available */
      language?: string | null
    }
  }

  export { Readability }
}

View File

@ -4,6 +4,8 @@ import { expect } from 'chai'
import chaiString from 'chai-string'
import * as fs from 'fs'
import { importCsv } from '../../src/csv'
import { ImportContext } from '../../src'
import { stubImportCtx } from '../util'
chai.use(chaiString)
@ -11,11 +13,15 @@ describe('Load a simple CSV file', () => {
it('should call the handler for each URL', async () => {
const urls: URL[] = []
const stream = fs.createReadStream('./test/csv/data/simple.csv')
const count = await importCsv(stream, (url): Promise<void> => {
const stub = stubImportCtx()
stub.urlHandler = (ctx: ImportContext, url): Promise<void> => {
urls.push(url)
return Promise.resolve()
})
expect(count).to.equal(2)
}
await importCsv(stream, stub)
expect(stub.countFailed).to.equal(0)
expect(stub.countImported).to.equal(2)
expect(urls).to.eql([
new URL('https://omnivore.app'),
new URL('https://google.com'),

Binary file not shown.

View File

@ -3,7 +3,13 @@ import * as chai from 'chai'
import { expect } from 'chai'
import chaiString from 'chai-string'
import * as fs from 'fs'
import { importMatterHistory } from '../../src/matterHistory'
import {
importMatterArchive,
importMatterHistoryCsv,
} from '../../src/matterHistory'
import { stubImportCtx } from '../util'
import { ImportContext } from '../../src'
import { Readability } from '@omnivore/readability'
chai.use(chaiString)
@ -11,11 +17,40 @@ describe('Load a simple _matter_history file', () => {
it('should find the URL of each row', async () => {
const urls: URL[] = []
const stream = fs.createReadStream('./test/matter/data/_matter_history.csv')
const count = await importMatterHistory(stream, (url): Promise<void> => {
const stub = stubImportCtx()
stub.urlHandler = (ctx: ImportContext, url): Promise<void> => {
urls.push(url)
return Promise.resolve()
})
expect(count).to.equal(1)
}
await importMatterHistoryCsv(stream, stub)
expect(stub.countFailed).to.equal(0)
expect(stub.countImported).to.equal(1)
expect(urls).to.eql([
new URL('https://www.bloomberg.com/features/2022-the-crypto-story/'),
])
})
})
describe('Load archive file', () => {
it('should find the URL of each row', async () => {
const urls: URL[] = []
const stream = fs.createReadStream('./test/matter/data/Archive.zip')
const stub = stubImportCtx()
stub.contentHandler = (
ctx: ImportContext,
url: URL,
title: string,
originalContent: string,
parseResult: Readability.ParseResult
): Promise<void> => {
urls.push(url)
return Promise.resolve()
}
await importMatterArchive(stream, stub)
expect(stub.countFailed).to.equal(0)
expect(stub.countImported).to.equal(1)
expect(urls).to.eql([
new URL('https://www.bloomberg.com/features/2022-the-crypto-story/'),
])

View File

@ -0,0 +1,22 @@
import { Readability } from '@omnivore/readability'
import { ImportContext } from '../src'
/**
 * Build a no-op ImportContext for tests: zeroed counters and handlers
 * that resolve immediately. Tests override urlHandler/contentHandler as
 * needed. The explicit return type keeps the stub checked against the
 * ImportContext contract at compile time (previously it was inferred).
 */
export const stubImportCtx = (): ImportContext => {
  return {
    userId: '',
    countImported: 0,
    countFailed: 0,
    urlHandler: (ctx: ImportContext, url: URL): Promise<void> => {
      return Promise.resolve()
    },
    contentHandler: (
      ctx: ImportContext,
      url: URL,
      title: string,
      originalContent: string,
      parseResult: Readability.ParseResult
    ): Promise<void> => {
      return Promise.resolve()
    },
  }
}

View File

@ -1,9 +1,10 @@
{
  "extends": "./../../tsconfig.json",
  "ts-node": {
    "files": true
  },
  "compilerOptions": {
    "outDir": "dist"
  },
  "include": ["src", "test"]
}