From 11cf9f7c286c9e57a91053fbac8f1e9c39651c40 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 14 Jun 2023 12:50:06 +0800 Subject: [PATCH 1/4] stringify csv before uploading --- packages/api/package.json | 2 ++ packages/api/src/routers/svc/integrations.ts | 13 ++++++++----- packages/api/test/routers/integrations.test.ts | 8 +++++++- yarn.lock | 12 ++++++++++++ 4 files changed, 29 insertions(+), 6 deletions(-) diff --git a/packages/api/package.json b/packages/api/package.json index d1aa221b8..ec47f1bcc 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -49,6 +49,7 @@ "cookie": "^0.5.0", "cookie-parser": "^1.4.5", "cors": "^2.8.5", + "csv-stringify": "^6.4.0", "dataloader": "^2.0.0", "diff-match-patch": "^1.0.5", "dompurify": "^2.0.17", @@ -107,6 +108,7 @@ "@types/chai-string": "^1.4.2", "@types/cookie": "^0.4.0", "@types/cookie-parser": "^1.4.2", + "@types/csv-stringify": "^3.1.0", "@types/diff-match-patch": "^1.0.32", "@types/dompurify": "^2.0.4", "@types/express": "^4.17.7", diff --git a/packages/api/src/routers/svc/integrations.ts b/packages/api/src/routers/svc/integrations.ts index efc8f85d0..bb891750c 100644 --- a/packages/api/src/routers/svc/integrations.ts +++ b/packages/api/src/routers/svc/integrations.ts @@ -1,6 +1,7 @@ /* eslint-disable @typescript-eslint/no-misused-promises */ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ /* eslint-disable @typescript-eslint/no-unsafe-member-access */ +import { stringify } from 'csv-stringify' import express from 'express' import { DateTime } from 'luxon' import { v4 as uuidv4 } from 'uuid' @@ -220,6 +221,12 @@ export function integrationsServiceRouter() { writeStream = file.createWriteStream({ contentType: 'text/csv', }) + // stringify the data and pipe it to the write_stream + const stringifier = stringify({ + header: false, + columns: ['url', 'state', 'labels'], + }) + stringifier.pipe(writeStream) let hasMore = true let offset = 0 @@ -236,11 +243,7 @@ export function integrationsServiceRouter() { break } // write the list of urls, state and labels to the stream - const csvData = retrievedData.map((page) => { - const { url, state, labels } = page - return [url, state, `"[${labels?.join(',') || ''}]"`].join(',') - }) - writeStream.write(csvData.join('\n')) + retrievedData.forEach((row) => stringifier.write(row)) hasMore = !!retrieved.hasMore offset += retrievedData.length diff --git a/packages/api/test/routers/integrations.test.ts b/packages/api/test/routers/integrations.test.ts index 4429bcc08..08267ab14 100644 --- a/packages/api/test/routers/integrations.test.ts +++ b/packages/api/test/routers/integrations.test.ts @@ -369,8 +369,13 @@ describe('Integrations routers', () => { complete: 1, list: { '123': { - given_url: 'https://omnivore.app/pocket-import-test', + given_url: 'https://omnivore.app/pocket-import-test,test', state: '0', + tags: { + '1234': { + tag: 'test', + }, + }, }, }, since: Date.now() / 1000, @@ -387,6 +392,7 @@ describe('Integrations routers', () => { after(async () => { sinon.restore() + nock.cleanAll() await deleteTestIntegrations(user.id, [integration.id]) }) diff --git a/yarn.lock b/yarn.lock index 0a239f537..e38aa98fa 100644 --- a/yarn.lock +++ b/yarn.lock @@ -8111,6 +8111,13 @@ resolved "https://registry.yarnpkg.com/@types/cors/-/cors-2.8.12.tgz#6b2c510a7ad7039e98e7b8d3d6598f4359e5c080" integrity sha512-vt+kDhq/M2ayberEtJcIN/hxXy1Pk+59g2FV/ZQceeaTyCtCucjL2Q7FXlFjtWn4n15KCr1NE2lNNFhp0lEThw== +"@types/csv-stringify@^3.1.0": + version "3.1.0" + resolved "https://registry.yarnpkg.com/@types/csv-stringify/-/csv-stringify-3.1.0.tgz#4c172ef462740e584a5bfe66ea78b67759f7bb32" + integrity sha512-jNRWx49wIc9UjJXukCaQt8iZRjyzDiEC1CGAAIZsydECWl5xM9oq4pSc5+Jhl4oATrRr+eGA9Vf0y9duDbKAvg== + dependencies: + csv-stringify "*" + "@types/debug@^4.0.0", "@types/debug@^4.1.0": version "4.1.7" resolved "https://registry.yarnpkg.com/@types/debug/-/debug-4.1.7.tgz#7cc0ea761509124709b8b2d1090d8f6c17aadb82" @@ -12689,6 +12696,11 @@ csv-parser@^3.0.0: dependencies: minimist "^1.2.0" +csv-stringify@*, csv-stringify@^6.4.0: + version "6.4.0" + resolved "https://registry.yarnpkg.com/csv-stringify/-/csv-stringify-6.4.0.tgz#6d006dca9194700e44f9fbc541bee8bbbd4f459c" + integrity sha512-HQsw0QXiN5fdlO+R8/JzCZnR3Fqp8E87YVnhHlaPtNGJjt6ffbV0LpOkieIb1x6V1+xt878IYq77SpXHWAqKkA== + cyclist@^1.0.1: version "1.0.1" resolved "https://registry.yarnpkg.com/cyclist/-/cyclist-1.0.1.tgz#596e9698fd0c80e12038c2b82d6eb1b35b6224d9" From d2e476a3ea4dc9745dfa6eb9d0b942ac11eeeedd Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 14 Jun 2023 12:53:43 +0800 Subject: [PATCH 2/4] add debug logs --- packages/puppeteer-parse/index.js | 1 + 1 file changed, 1 insertion(+) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 9d9144546..06b9f658d 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -461,6 +461,7 @@ async function fetchContent(req, res) { // mark import failed on the last failed retry const retryCount = req.headers['x-cloudtasks-taskretrycount']; if (retryCount == MAX_RETRY_COUNT) { + console.debug('max retry count reached'); importStatus = importStatus || 'failed'; } From 42305ee0b4c24f8ced932080c6819e6d52487e74 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 14 Jun 2023 13:17:59 +0800 Subject: [PATCH 3/4] use a separate queue for sending emails --- packages/import-handler/src/task.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/import-handler/src/task.ts b/packages/import-handler/src/task.ts index f4873376d..94b3352cc 100644 --- a/packages/import-handler/src/task.ts +++ b/packages/import-handler/src/task.ts @@ -16,9 +16,9 @@ export const CONTENT_FETCH_URL = process.env.CONTENT_FETCH_GCF_URL export const createCloudTask = async ( taskHandlerUrl: string | undefined, payload: unknown, - requestHeaders?: Record + requestHeaders?: Record, + queue = 'omnivore-import-queue' ) => { - const queue = 'omnivore-import-queue' const location = process.env.GCP_LOCATION const project = process.env.GCP_PROJECT_ID From 8ad937ed84d002ba4229d3565fbfa68f4673dc5d Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 14 Jun 2023 13:33:20 +0800 Subject: [PATCH 4/4] allow more labels format in the csv --- .../api/test/routers/integrations.test.ts | 3 +++ packages/import-handler/package.json | 1 - packages/import-handler/src/csv.ts | 26 ++++++++++++------- packages/import-handler/src/index.ts | 7 ++++- .../import-handler/test/csv/data/complex.csv | 4 +-- yarn.lock | 7 ----- 6 files changed, 28 insertions(+), 20 deletions(-) diff --git a/packages/api/test/routers/integrations.test.ts b/packages/api/test/routers/integrations.test.ts index 08267ab14..4843ebf70 100644 --- a/packages/api/test/routers/integrations.test.ts +++ b/packages/api/test/routers/integrations.test.ts @@ -375,6 +375,9 @@ describe('Integrations routers', () => { '1234': { tag: 'test', }, + '1235': { + tag: 'new', + }, }, }, }, diff --git a/packages/import-handler/package.json b/packages/import-handler/package.json index ec14ef577..ef7c0fb32 100644 --- a/packages/import-handler/package.json +++ b/packages/import-handler/package.json @@ -42,7 +42,6 @@ "@sentry/serverless": "^7.30.0", "@types/express": "^4.17.13", "axios": "^1.2.2", - "csv-parser": "^3.0.0", "dompurify": "^2.4.3", "fs-extra": "^11.1.0", "glob": "^8.1.0", diff --git a/packages/import-handler/src/csv.ts b/packages/import-handler/src/csv.ts index 70bb8b955..1df145e34 100644 --- a/packages/import-handler/src/csv.ts +++ b/packages/import-handler/src/csv.ts @@ -8,6 +8,22 @@ import { Stream } from 'stream' import { ImportContext } from '.' import { createMetrics, ImportStatus, updateMetrics } from './metrics' +const parseLabels = (labels: string): string[] => { + try { + // labels follows format: "[""label1"",""label2""]" + return JSON.parse(labels) as string[] + } catch (error) { + console.debug('invalid labels format', labels) + + // labels follows format: "[label1,label2]" + return labels + .slice(1, -1) + .split(',') + .map((l) => l.trim()) + .filter((l) => l !== '') + } +} + export const importCsv = async (ctx: ImportContext, stream: Stream) => { // create metrics in redis await createMetrics(ctx.redisClient, ctx.userId, ctx.taskId, 'csv-importer') @@ -18,15 +34,7 @@ export const importCsv = async (ctx: ImportContext, stream: Stream) => { try { const url = new URL(row[0]) const state = row.length > 1 && row[1] ? row[1] : undefined - // labels follows format: "[label1,label2]" - const labels = - row.length > 2 - ? (row[2] as string) - .slice(1, -1) - .split(',') - .map((l) => l.trim()) - .filter((l) => l !== '') - : undefined + const labels = row.length > 2 ? parseLabels(row[2]) : undefined // update total counter await updateMetrics( diff --git a/packages/import-handler/src/index.ts b/packages/import-handler/src/index.ts index 886aea8f5..29f6364e0 100644 --- a/packages/import-handler/src/index.ts +++ b/packages/import-handler/src/index.ts @@ -127,7 +127,12 @@ const createEmailCloudTask = async (userId: string, payload: unknown) => { Cookie: `auth=${authToken}`, } - return createCloudTask(emailUserUrl(), payload, headers) + return createCloudTask( + emailUserUrl(), + payload, + headers, + 'omnivore-email-queue' + ) } const sendImportFailedEmail = async (userId: string) => { diff --git a/packages/import-handler/test/csv/data/complex.csv b/packages/import-handler/test/csv/data/complex.csv index fa4ce8b83..6b8968660 100644 --- a/packages/import-handler/test/csv/data/complex.csv +++ b/packages/import-handler/test/csv/data/complex.csv @@ -1,3 +1,3 @@ -"https://omnivore.app",ARCHIVED,"[test]" -"https://google.com",SUCCEEDED,"[test,development]" +"https://omnivore.app",ARCHIVED,"[""test""]" +"https://google.com",SUCCEEDED,"[""test"",""development""]" https://test.com,SUCCEEDED,"[test, development]" diff --git a/yarn.lock b/yarn.lock index e38aa98fa..2655b1575 100644 --- a/yarn.lock +++ b/yarn.lock @@ -12689,13 +12689,6 @@ csstype@^3.0.2, csstype@^3.0.4: resolved "https://registry.yarnpkg.com/csstype/-/csstype-3.0.8.tgz#d2266a792729fb227cd216fb572f43728e1ad340" integrity sha512-jXKhWqXPmlUeoQnF/EhTtTl4C9SnrxSH/jZUih3jmO6lBKr99rP3/+FmrMj4EFpOXzMtXHAZkd3x0E6h6Fgflw== -csv-parser@^3.0.0: - version "3.0.0" - resolved "https://registry.yarnpkg.com/csv-parser/-/csv-parser-3.0.0.tgz#b88a6256d79e090a97a1b56451f9327b01d710e7" - integrity sha512-s6OYSXAK3IdKqYO33y09jhypG/bSDHPuyCme/IdEHfWpLf/jKcpitVFyOC6UemgGk8v7Q5u2XE0vvwmanxhGlQ== - dependencies: - minimist "^1.2.0" - csv-stringify@*, csv-stringify@^6.4.0: version "6.4.0" resolved "https://registry.yarnpkg.com/csv-stringify/-/csv-stringify-6.4.0.tgz#6d006dca9194700e44f9fbc541bee8bbbd4f459c"