Process labels and state in the csv file

This commit is contained in:
Hongbo Wu
2023-03-03 09:00:42 +08:00
parent 9a46e935d9
commit 4b578bbebb
6 changed files with 65 additions and 7 deletions

View File

@ -238,7 +238,7 @@ export function integrationsServiceRouter() {
// write the list of urls, state and labels to the stream
const csvData = retrievedData.map((page) => {
const { url, state, labels } = page
return [url, state, `[${labels?.join(',') || ''}]`].join(',')
return [url, state, `"[${labels?.join(',') || ''}]"`].join(',')
})
writeStream.write(csvData.join('\n'))

View File

@ -13,7 +13,10 @@ export const importCsv = async (ctx: ImportContext, stream: Stream) => {
for await (const row of parser) {
try {
const url = new URL(row[0])
await ctx.urlHandler(ctx, url)
const state = row.length > 1 ? row[1] : undefined
// labels follow the format: "[label1,label2]" (comma-separated, no spaces)
const labels = row.length > 2 ? row[2].slice(1, -1).split(',') : undefined
await ctx.urlHandler(ctx, url, state, labels)
ctx.countImported += 1
} catch (error) {
console.log('invalid url', row, error)

View File

@ -1,7 +1,7 @@
import { Storage } from '@google-cloud/storage'
import { importCsv } from './csv'
import * as path from 'path'
import { importMatterArchive, importMatterHistoryCsv } from './matterHistory'
import { importMatterArchive } from './matterHistory'
import { Stream } from 'node:stream'
import { v4 as uuid } from 'uuid'
import { CONTENT_FETCH_URL, createCloudTask, emailUserUrl } from './task'
@ -13,6 +13,8 @@ import { Readability } from '@omnivore/readability'
import * as Sentry from '@sentry/serverless'
/**
 * The set of state strings an item may carry: 'archived', 'saved' or
 * 'deleted'. Used as the type of the optional `state` argument of
 * `UrlHandler`.
 */
export type RetrievedDataState = 'archived' | 'saved' | 'deleted'
Sentry.GCPFunction.init({
dsn: process.env.SENTRY_DSN,
tracesSampleRate: 0,
@ -24,7 +26,12 @@ const storage = new Storage()
const CONTENT_TYPES = ['text/csv', 'application/zip']
export type UrlHandler = (ctx: ImportContext, url: URL) => Promise<void>
/**
 * Signature of the callback that receives a URL to import.
 *
 * @param ctx - the import context the handler is invoked with
 * @param url - the parsed URL to handle
 * @param state - optional state for the URL; omitted when the source
 *   provides none
 * @param labels - optional list of label names for the URL; omitted when
 *   the source provides none
 * @returns a promise that settles once the handler has finished
 */
export type UrlHandler = (
ctx: ImportContext,
url: URL,
state?: RetrievedDataState,
labels?: string[]
) => Promise<void>
export type ContentHandler = (
ctx: ImportContext,
url: URL,

View File

@ -4,7 +4,7 @@ import { expect } from 'chai'
import chaiString from 'chai-string'
import * as fs from 'fs'
import { importCsv } from '../../src/csv'
import { ImportContext } from '../../src'
import { ImportContext, RetrievedDataState } from '../../src'
import { stubImportCtx } from '../util'
chai.use(chaiString)
@ -28,3 +28,44 @@ describe('Load a simple CSV file', () => {
])
})
})
// Checks that importCsv hands the state and labels columns of every row
// to the URL handler along with the URL itself.
describe('Load a complex CSV file', () => {
  it('should call the handler for each URL, state and labels', async () => {
    // Shape of one recorded handler invocation.
    type HandlerCall = {
      url: URL
      state?: RetrievedDataState
      labels?: string[]
    }
    const recorded: HandlerCall[] = []

    const csvStream = fs.createReadStream('./test/csv/data/complex.csv')
    const importCtx = stubImportCtx()
    // Replace the stub's no-op handler with one that captures its arguments.
    importCtx.urlHandler = (
      ctx: ImportContext,
      url,
      state,
      labels
    ): Promise<void> => {
      const call: HandlerCall = { url: url, state: state, labels: labels }
      recorded.push(call)
      return Promise.resolve()
    }

    await importCsv(importCtx, csvStream)

    // Both fixture rows must import cleanly.
    expect(importCtx.countFailed).to.equal(0)
    expect(importCtx.countImported).to.equal(2)

    const firstRow = {
      url: new URL('https://omnivore.app'),
      state: 'archived',
      labels: ['test'],
    }
    const secondRow = {
      url: new URL('https://google.com'),
      state: 'saved',
      labels: ['test', 'development'],
    }
    expect(recorded).to.eql([firstRow, secondRow])
  })
})

View File

@ -0,0 +1,2 @@
"https://omnivore.app",archived,"[test]"
"https://google.com",saved,"[test,development]"
1 https://omnivore.app archived [test]
2 https://google.com saved [test,development]

View File

@ -1,12 +1,17 @@
import { Readability } from '@omnivore/readability'
import { ImportContext } from '../src'
import { ImportContext, RetrievedDataState } from '../src'
export const stubImportCtx = () => {
return {
userId: '',
countImported: 0,
countFailed: 0,
urlHandler: (ctx: ImportContext, url: URL): Promise<void> => {
urlHandler: (
ctx: ImportContext,
url: URL,
state?: RetrievedDataState,
labels?: string[]
): Promise<void> => {
return Promise.resolve()
},
contentHandler: (