diff --git a/packages/import-handler/package.json b/packages/import-handler/package.json
index fec6d9d45..609cd51b9 100644
--- a/packages/import-handler/package.json
+++ b/packages/import-handler/package.json
@@ -20,8 +20,10 @@
     "deploy": "yarn build && yarn gcloud-deploy"
   },
   "devDependencies": {
-    "@types/node": "^14.11.2",
+    "@types/fs-extra": "^11.0.1",
     "@types/jsonwebtoken": "^8.5.0",
+    "@types/node": "^14.11.2",
+    "@types/unzip-stream": "^0.3.1",
     "eslint-plugin-prettier": "^4.0.0"
   },
   "dependencies": {
@@ -29,9 +31,14 @@
     "@google-cloud/functions-framework": "3.1.2",
     "@google-cloud/storage": "^5.18.1",
     "@google-cloud/tasks": "^3.0.5",
+    "@omnivore/content-handler": "1.0.0",
+    "@omnivore/readability": "1.0.0",
     "@types/express": "^4.17.13",
     "csv-parser": "^3.0.0",
+    "dompurify": "^2.4.3",
+    "fs-extra": "^11.1.0",
     "jsonwebtoken": "^8.5.1",
-    "nodemon": "^2.0.15"
+    "nodemon": "^2.0.15",
+    "unzip-stream": "^0.3.1"
   }
 }
diff --git a/packages/import-handler/src/csv.ts b/packages/import-handler/src/csv.ts
index 7f60abadd..80f15f72f 100644
--- a/packages/import-handler/src/csv.ts
+++ b/packages/import-handler/src/csv.ts
@@ -5,24 +5,19 @@
 
 import { parse } from '@fast-csv/parse'
 import { Stream } from 'stream'
+import { ImportContext } from '.'
 
-export type UrlHandler = (url: URL) => Promise<void>
-
-export const importCsv = async (
-  stream: Stream,
-  handler: UrlHandler
-): Promise<number> => {
+export const importCsv = async (stream: Stream, ctx: ImportContext) => {
   const parser = parse()
   stream.pipe(parser)
-  let count = 0
   for await (const row of parser) {
     try {
       const url = new URL(row[0])
-      await handler(url)
+      await ctx.urlHandler(ctx, url)
+      ctx.countImported += 1
     } catch (error) {
       console.log('invalid url', row, error)
+      ctx.countFailed += 1
     }
-    count++
   }
-  return count
 }
diff --git a/packages/import-handler/src/index.ts b/packages/import-handler/src/index.ts
index 0b37cab30..20898c45e 100644
--- a/packages/import-handler/src/index.ts
+++ b/packages/import-handler/src/index.ts
@@ -3,30 +3,57 @@ import {
   CloudFunctionsContext,
 } from '@google-cloud/functions-framework/build/src/functions'
 import { Storage } from '@google-cloud/storage'
-import { importCsv, UrlHandler } from './csv'
+import { importCsv } from './csv'
 import * as path from 'path'
-import { importMatterHistory } from './matterHistory'
+import { importMatterArchive, importMatterHistoryCsv } from './matterHistory'
 import { Stream } from 'node:stream'
 import { v4 as uuid } from 'uuid'
 import { CONTENT_FETCH_URL, createCloudTask, EMAIL_USER_URL } from './task'
 
 import { promisify } from 'util'
 import * as jwt from 'jsonwebtoken'
+import { Readability } from '@omnivore/readability'
 
 const signToken = promisify(jwt.sign)
 
 const storage = new Storage()
 
+const CONTENT_TYPES = ['text/csv', 'application/zip']
+
 interface StorageEventData {
   bucket: string
   name: string
   contentType: string
 }
 
+export type UrlHandler = (ctx: ImportContext, url: URL) => Promise<void>
+export type ContentHandler = (
+  ctx: ImportContext,
+  url: URL,
+  title: string,
+  originalContent: string,
+  parseResult: Readability.ParseResult
+) => Promise<void>
+
+/**
+ * State shared between an importer and its handlers for a single import.
+ *
+ * The counters are maintained by the importers: a row counts as imported
+ * when its handler resolves and as failed when it rejects. Handlers must
+ * therefore reject on failure and leave the counters alone.
+ */
+export type ImportContext = {
+  userId: string
+  countImported: number
+  countFailed: number
+  urlHandler: UrlHandler
+  contentHandler: ContentHandler
+}
+
 type importHandlerFunc = (
   stream: Stream,
-  handler: UrlHandler
-) => Promise<number>
+  handler: ImportContext
+) => Promise<void>
 
 const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
   console.log('deciding to handle', ctx, data)
@@ -35,7 +62,7 @@ const shouldHandle = (data: StorageEventData, ctx: CloudFunctionsContext) => {
   }
   if (
     !data.name.startsWith('imports/') ||
-    data.contentType.toLowerCase() != 'text/csv'
+    !CONTENT_TYPES.includes(data.contentType.toLowerCase())
   ) {
     return false
   }
@@ -93,7 +120,7 @@ const sendImportCompletedEmail = async (
 const handlerForFile = (name: string): importHandlerFunc | undefined => {
   const fileName = path.parse(name).name
   if (fileName.startsWith('MATTER')) {
-    return importMatterHistory
+    return importMatterArchive
   } else if (fileName.startsWith('URL_LIST')) {
     return importCsv
   }
@@ -101,6 +128,47 @@ const handlerForFile = (name: string): importHandlerFunc | undefined => {
   return undefined
 }
 
+/**
+ * Imports a single URL on behalf of ctx.userId.
+ *
+ * Rejects when the import fails or returns no result, so that the calling
+ * importer records the row in ctx.countFailed. The counters are not touched
+ * here: importers already increment ctx.countImported once this resolves,
+ * so incrementing it here as well would count every URL twice.
+ */
+const urlHandler = async (ctx: ImportContext, url: URL): Promise<void> => {
+  // Imports are stored in the format imports/<user id>/<type>-<uuid>.csv
+  const result = await importURL(ctx.userId, url, 'csv-importer')
+  if (!result) {
+    throw new Error(`import was not accepted for url: ${url.toString()}`)
+  }
+}
+
+/**
+ * Handles a row whose page content was recovered from the archive.
+ *
+ * TODO: save the parsed content through the save-page mutation sketched
+ * below. Until that is wired up, fall back to importing by URL: resolving
+ * without doing anything would report the row as imported although nothing
+ * was saved.
+ */
+const contentHandler = async (
+  ctx: ImportContext,
+  url: URL,
+  title: string,
+  originalContent: string,
+  parseResult: Readability.ParseResult
+): Promise<void> => {
+  // const apiResponse = await sendSavePageMutation(userId, {
+  //   url: finalUrl,
+  //   clientRequestId: articleSavingRequestId,
+  //   title,
+  //   originalContent: content,
+  //   parseResult: readabilityResult,
+  // })
+  await urlHandler(ctx, url)
+}
+
 export const importHandler: EventFunction = async (event, context) => {
   const data = event as StorageEventData
   const ctx = context as CloudFunctionsContext
@@ -131,18 +199,17 @@ export const importHandler: EventFunction = async (event, context) => {
     return
   }
 
-  let countFailed = 0
-  let countImported = 0
-  await handler(stream, async (url): Promise<void> => {
-    try {
-      // Imports are stored in the format imports/<user id>/<type>-<uuid>.csv
-      const result = await importURL(userId, url, 'csv-importer')
-      console.log('import url result', result)
-      countImported = countImported + 1
-    } catch (err) {
-      console.log('error importing url', err)
-      countFailed = countFailed + 1
-    }
-  })
+  const importCtx: ImportContext = {
+    userId,
+    countImported: 0,
+    countFailed: 0,
+    urlHandler,
+    contentHandler,
+  }
+  await handler(stream, importCtx)
+
+  // The importer updates the counters on importCtx, so read the totals
+  // only after it has finished.
+  const { countFailed, countImported } = importCtx
 
   if (countImported <= 1) {
diff --git a/packages/import-handler/src/matterHistory.ts b/packages/import-handler/src/matterHistory.ts
index 8342735b5..f0b314183 100644
--- a/packages/import-handler/src/matterHistory.ts
+++ b/packages/import-handler/src/matterHistory.ts
@@ -5,28 +5,279 @@
 
 import { parse } from '@fast-csv/parse'
 import { Stream } from 'stream'
+import unzip from 'unzip-stream'
+import fs from 'fs'
+import os from 'os'
+import path from 'path'
+import * as fsExtra from 'fs-extra'
+
+import { parseHTML } from 'linkedom'
+import { Readability } from '@omnivore/readability'
+import createDOMPurify, { SanitizeElementHookEvent } from 'dompurify'
+
+import { encode } from 'urlsafe-base64'
+import crypto from 'crypto'
+import { ImportContext } from '.'
 
 export type UrlHandler = (url: URL) => Promise<void>
 
-export const importMatterHistory = async (
+export const importMatterHistoryCsv = async (
   stream: Stream,
-  handler: UrlHandler
-): Promise<number> => {
+  ctx: ImportContext
+): Promise<void> => {
   const parser = parse({
     headers: true,
     strictColumnHandling: false,
   })
   stream.pipe(parser)
 
-  let count = 0
   for await (const row of parser) {
     try {
       const url = new URL(row['URL'])
-      await handler(url)
+      await ctx.urlHandler(ctx, url)
+      ctx.countImported += 1
     } catch (error) {
       console.log('invalid url', row, error)
+      ctx.countFailed += 1
+    }
+  }
+}
+
+const DOM_PURIFY_CONFIG = {
+  ADD_TAGS: ['iframe'],
+  ADD_ATTR: ['allow', 'allowfullscreen', 'frameborder', 'scrolling'],
+  FORBID_ATTR: [
+    'data-ml-dynamic',
+    'data-ml-dynamic-type',
+    'data-orig-url',
+    'data-ml-id',
+    'data-ml',
+    'data-xid',
+    'data-feature',
+  ],
+}
+
+function domPurifySanitizeHook(node: Element, data: SanitizeElementHookEvent) {
+  if (data.tagName === 'iframe') {
+    const urlRegex = /^(https?:)?\/\/www\.youtube(-nocookie)?\.com\/embed\//i
+    const src = node.getAttribute('src') || ''
+    const dataSrc = node.getAttribute('data-src') || ''
+
+    if (src && urlRegex.test(src)) {
+      return
+    }
+
+    if (dataSrc && urlRegex.test(dataSrc)) {
+      node.setAttribute('src', dataSrc)
+      return
+    }
+
+    node.parentNode?.removeChild(node)
+  }
+}
+
+function getPurifiedContent(html: string) {
+  const newWindow = parseHTML('')
+  const DOMPurify = createDOMPurify(newWindow)
+  DOMPurify.addHook('uponSanitizeElement', domPurifySanitizeHook)
+  const clean = DOMPurify.sanitize(html, DOM_PURIFY_CONFIG)
+  return parseHTML(clean).document
+}
+
+function createImageProxyUrl(url: string, width = 0, height = 0) {
+  if (process.env.IMAGE_PROXY_URL && process.env.IMAGE_PROXY_SECRET) {
+    const urlWithOptions = `${url}#${width}x${height}`
+    const signature = signImageProxyUrl(urlWithOptions)
+
+    return `${process.env.IMAGE_PROXY_URL}/${width}x${height},s${signature}/${url}`
+  }
+  return url
+}
+
+function signImageProxyUrl(url: string) {
+  if (process.env.IMAGE_PROXY_SECRET) {
+    return encode(
+      crypto
+        .createHmac('sha256', process.env.IMAGE_PROXY_SECRET)
+        .update(url)
+        .digest()
+    )
+  }
+  return url
+}
+
+async function getReadabilityResult(url: string, originalContent: string) {
+  const document = getPurifiedContent(originalContent)
+
+  try {
+    const article = await new Readability(document, {
+      createImageProxyUrl,
+      url,
+    }).parse()
+
+    if (article) {
+      return article
+    }
+  } catch (error) {
+    console.log('parsing error for url', url, error)
+  }
+
+  return null
+}
+
+/**
+ * Extracts a zip stream into a fresh temporary directory.
+ *
+ * The directory is created under os.tmpdir() instead of the working
+ * directory, which may be read-only (as on Cloud Functions), and mkdtemp's
+ * random suffix keeps concurrent imports from sharing a directory.
+ *
+ * @returns the directory holding the extracted files; the caller removes it
+ */
+const unarchive = async (stream: Stream): Promise<string> => {
+  const archiveDir = await fs.promises.mkdtemp(
+    path.join(os.tmpdir(), 'archive-')
+  )
+
+  return new Promise<string>((resolve, reject) => {
+    stream
+      // pipe() does not forward errors from the source stream
+      .on('error', reject)
+      .pipe(unzip.Extract({ path: archiveDir }))
+      .on('close', () => {
+        resolve(archiveDir)
+      })
+      .on('error', reject)
+  })
+}
+
+/**
+ * Reads the saved page content referenced by a history row.
+ *
+ * @returns the file content, or undefined when the row has no readable
+ *   content file inside the archive directory
+ */
+const getMatterHistoryContent = (
+  archiveDir: string,
+  row: Record<string, string>
+) => {
+  try {
+    const contentKey = row['File Id']
+    const archiveRoot = path.resolve(archiveDir)
+    const contentPath = path.resolve(archiveRoot, contentKey)
+
+    // 'File Id' comes from the uploaded archive, so make sure it cannot
+    // point at a file outside of the extracted directory (e.g. '../..')
+    if (!contentPath.startsWith(archiveRoot + path.sep)) {
+      throw new Error(`content path escapes archive: ${contentKey}`)
+    }
+
+    return fs.readFileSync(contentPath).toString()
+  } catch (err) {
+    console.log('error getting matter history content: ', { row, err })
+  }
+  return undefined
+}
+
+const getURL = (str: string | undefined) => {
+  if (!str) {
+    return undefined
+  }
+
+  try {
+    const url = new URL(str)
+    return url
+  } catch (err) {
+    console.log('error parsing url', { str, err })
+  }
+
+  return undefined
+}
+
+/**
+ * Imports one row of the Matter history file.
+ *
+ * Uses the content handler when the archive holds parseable content for
+ * the row and falls back to the URL handler otherwise. Rejects when the
+ * row has no valid URL or a handler fails; counting is left to the caller.
+ */
+const handleMatterHistoryRow = async (
+  ctx: ImportContext,
+  archiveDir: string,
+  row: Record<string, string>
+) => {
+  const title = row['Title']
+  const urlStr = row['URL']
+  const url = getURL(urlStr)
+
+  if (!url) {
+    // the caller counts a rejected row as failed; returning normally here
+    // would make it count the row as imported
+    throw new Error(`invalid url: ${urlStr}`)
+  }
+
+  const originalContent = getMatterHistoryContent(archiveDir, row)
+  const readabilityResult = originalContent
+    ? await getReadabilityResult(urlStr, originalContent)
+    : null
+
+  if (originalContent && readabilityResult) {
+    await ctx.contentHandler(
+      ctx,
+      url,
+      title,
+      originalContent,
+      readabilityResult
+    )
+  } else {
+    await ctx.urlHandler(ctx, url)
+  }
+}
+
+/**
+ * Imports a Matter export archive: a zip holding _matter_history.csv plus
+ * the saved page files its rows refer to.
+ *
+ * Every history row increments either ctx.countImported or ctx.countFailed.
+ * The extracted files are removed again before this function returns.
+ */
+export const importMatterArchive = async (
+  stream: Stream,
+  ctx: ImportContext
+): Promise<void> => {
+  const archiveDir = await unarchive(stream)
+
+  try {
+    const historyFile = path.join(archiveDir, '_matter_history.csv')
+
+    const parser = parse({
+      headers: true,
+      strictColumnHandling: false,
+    })
+
+    const historyStream = fs.createReadStream(historyFile)
+    // pipe() does not forward source errors; hand them to the parser so a
+    // missing or unreadable history file ends the loop below with an error
+    // instead of raising an unhandled 'error' event
+    historyStream.on('error', (err) => parser.destroy(err))
+    historyStream.pipe(parser)
+
+    for await (const row of parser) {
+      try {
+        await handleMatterHistoryRow(ctx, archiveDir, row)
+        ctx.countImported += 1
+      } catch (error) {
+        console.log('invalid url', row, error)
+        ctx.countFailed += 1
+      }
+    }
+  } catch (err) {
+    console.log('error handling archive: ', { err })
+  } finally {
+    try {
+      await fsExtra.rm(archiveDir, { recursive: true, force: true })
+    } catch (err) {
+      console.log('Error removing archive directory', { err })
     }
-    count++
   }
-  return count
 }
diff --git a/packages/import-handler/src/readability.d.ts b/packages/import-handler/src/readability.d.ts
new file mode 100644
index 000000000..55bfc74d1
--- /dev/null
+++ b/packages/import-handler/src/readability.d.ts
@@ -0,0 +1,108 @@
+// Type definitions for non-npm package mozilla-readability 0.2
+// Project: https://github.com/mozilla/readability
+// Definitions by: Charles Vandevoorde , Alex Wendland
+// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped
+// TypeScript Version: 2.2
+
+declare module '@omnivore/readability' {
+  class Readability {
+    constructor(doc: Document, options?: Readability.Options)
+
+    parse(): Promise<Readability.ParseResult | null>
+  }
+
+  namespace Readability {
+    interface Options {
+      /**
+       * Control whether log messages are sent to the console
+       */
+      debug?: boolean
+
+      /**
+       * Set a maximum size on the documents that will be processed. This size is
+       * checked before any parsing operations occur. If the number of elements in
+       * the document exceeds this threshold then an Error will be thrown.
+       *
+       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L2019
+       */
+      maxElemsToParse?: number
+
+      nbTopCandidates?: number
+
+      /**
+       * Minimum number of characters in the extracted textContent in order to
+       * consider the article correctly identified. If the threshold is not met then
+       * the extraction process will automatically run again with different flags.
+       *
+       * See implementation details at https://github.com/mozilla/readability/blob/52ab9b5c8916c306a47b2119270dcdabebf9d203/Readability.js#L1208
+       *
+       * Changed from wordThreshold in https://github.com/mozilla/readability/commit/3ff9a166fb27928f222c4c0722e730eda412658a
+       */
+      charThreshold?: number
+
+      /**
+       * parse() removes the class="" attribute from every element in the given
+       * subtree, except those that match CLASSES_TO_PRESERVE and
+       * the classesToPreserve array from the options object.
+       */
+      classesToPreserve?: string[]
+
+      /**
+       * By default Readability will strip all classes from the HTML elements in the
+       * processed article. By setting this to `true` the classes will be retained.
+       *
+       * This is a blanket alternative to `classesToPreserve`.
+       *
+       * Added in https://github.com/mozilla/readability/commit/2982216913af2c66b0690e88606b03116553ad92
+       */
+
+      keepClasses?: boolean
+      url?: string
+
+      /**
+       * Function that converts a regular image url into imageproxy url
+       * @param url string
+       */
+      createImageProxyUrl?: (
+        url: string,
+        width?: number,
+        height?: number
+      ) => string
+
+      /**
+       * By default, Readability will clean all tables from the HTML elements in the
+       * processed article. But newsletters in emails use tables to display their content.
+       * By setting this to `true`, these tables will be retained.
+       */
+      keepTables?: boolean
+    }
+
+    interface ParseResult {
+      /** Article title */
+      title: string
+      /** Author metadata */
+      byline?: string | null
+      /** Content direction */
+      dir?: string | null
+      /** HTML string of processed article content */
+      content: string
+      /** non-HTML version of `content` */
+      textContent: string
+      /** Length of an article, in characters */
+      length: number
+      /** Article description, or short excerpt from the content */
+      excerpt: string
+      /** Article site name */
+      siteName?: string | null
+      /** Article site icon */
+      siteIcon?: string | null
+      /** Article preview image */
+      previewImage?: string | null
+      /** Article published date */
+      publishedDate?: Date | null
+      language?: string | null
+    }
+  }
+
+  export { Readability }
+}
diff --git a/packages/import-handler/test/csv/csv.test.ts b/packages/import-handler/test/csv/csv.test.ts
index 4ea3ef705..7dfff7ea8 100644
--- a/packages/import-handler/test/csv/csv.test.ts
+++ b/packages/import-handler/test/csv/csv.test.ts
@@ -4,6 +4,8 @@ import { expect } from 'chai'
 import chaiString from 'chai-string'
 import * as fs from 'fs'
 import { importCsv } from '../../src/csv'
+import { ImportContext } from '../../src'
+import { stubImportCtx } from '../util'
 
 chai.use(chaiString)
 
@@ -11,11 +13,15 @@ describe('Load a simple CSV file', () => {
   it('should call the handler for each URL', async () => {
     const urls: URL[] = []
     const stream = fs.createReadStream('./test/csv/data/simple.csv')
-    const count = await importCsv(stream, (url): Promise<void> => {
+    const stub = stubImportCtx()
+    stub.urlHandler = (ctx: ImportContext, url): Promise<void> => {
       urls.push(url)
       return Promise.resolve()
-    })
-    expect(count).to.equal(2)
+    }
+
+    await importCsv(stream, stub)
+    expect(stub.countFailed).to.equal(0)
+    expect(stub.countImported).to.equal(2)
     expect(urls).to.eql([
       new URL('https://omnivore.app'),
       new URL('https://google.com'),
diff --git a/packages/import-handler/test/matter/data/Archive.zip b/packages/import-handler/test/matter/data/Archive.zip
new file mode 100644
index 000000000..f63012484
Binary files /dev/null and b/packages/import-handler/test/matter/data/Archive.zip differ
diff --git a/packages/import-handler/test/matter/matter_importer.test.ts b/packages/import-handler/test/matter/matter_importer.test.ts
index 6a3b24e98..e160c7be3 100644
--- a/packages/import-handler/test/matter/matter_importer.test.ts
+++ b/packages/import-handler/test/matter/matter_importer.test.ts
@@ -3,7 +3,13 @@ import * as chai from 'chai'
 import { expect } from 'chai'
 import chaiString from 'chai-string'
 import * as fs from 'fs'
-import { importMatterHistory } from '../../src/matterHistory'
+import {
+  importMatterArchive,
+  importMatterHistoryCsv,
+} from '../../src/matterHistory'
+import { stubImportCtx } from '../util'
+import { ImportContext } from '../../src'
+import { Readability } from '@omnivore/readability'
 
 chai.use(chaiString)
 
@@ -11,11 +17,40 @@ describe('Load a simple _matter_history file', () => {
   it('should find the URL of each row', async () => {
     const urls: URL[] = []
     const stream = fs.createReadStream('./test/matter/data/_matter_history.csv')
-    const count = await importMatterHistory(stream, (url): Promise<void> => {
+    const stub = stubImportCtx()
+    stub.urlHandler = (ctx: ImportContext, url): Promise<void> => {
       urls.push(url)
       return Promise.resolve()
-    })
-    expect(count).to.equal(1)
+    }
+
+    await importMatterHistoryCsv(stream, stub)
+    expect(stub.countFailed).to.equal(0)
+    expect(stub.countImported).to.equal(1)
+    expect(urls).to.eql([
+      new URL('https://www.bloomberg.com/features/2022-the-crypto-story/'),
+    ])
+  })
+})
+
+describe('Load archive file', () => {
+  it('should find the URL of each row', async () => {
+    const urls: URL[] = []
+    const stream = fs.createReadStream('./test/matter/data/Archive.zip')
+    const stub = stubImportCtx()
+    stub.contentHandler = (
+      ctx: ImportContext,
+      url: URL,
+      title: string,
+      originalContent: string,
+      parseResult: Readability.ParseResult
+    ): Promise<void> => {
+      urls.push(url)
+      return Promise.resolve()
+    }
+
+    await importMatterArchive(stream, stub)
+    expect(stub.countFailed).to.equal(0)
+    expect(stub.countImported).to.equal(1)
     expect(urls).to.eql([
       new URL('https://www.bloomberg.com/features/2022-the-crypto-story/'),
     ])
diff --git a/packages/import-handler/test/util.ts b/packages/import-handler/test/util.ts
new file mode 100644
index 000000000..58cd9dd10
--- /dev/null
+++ b/packages/import-handler/test/util.ts
@@ -0,0 +1,22 @@
+import { Readability } from '@omnivore/readability'
+import { ImportContext } from '../src'
+
+export const stubImportCtx = () => {
+  return {
+    userId: '',
+    countImported: 0,
+    countFailed: 0,
+    urlHandler: (ctx: ImportContext, url: URL): Promise<void> => {
+      return Promise.resolve()
+    },
+    contentHandler: (
+      ctx: ImportContext,
+      url: URL,
+      title: string,
+      originalContent: string,
+      parseResult: Readability.ParseResult
+    ): Promise<void> => {
+      return Promise.resolve()
+    },
+  }
+}
diff --git a/packages/import-handler/tsconfig.json b/packages/import-handler/tsconfig.json
index f450acf38..5c4d5c778 100644
--- a/packages/import-handler/tsconfig.json
+++ b/packages/import-handler/tsconfig.json
@@ -1,9 +1,10 @@
 {
-  "extends": "@tsconfig/node14/tsconfig.json",
+  "extends": "./../../tsconfig.json",
+  "ts-node": {
+    "files": true
+  },
   "compilerOptions": {
-    "outDir": "build",
-    "rootDir": ".",
-    "lib": ["dom"]
+    "outDir": "dist"
   },
   "include": ["src", "test"]
 }