From 54e7c52b5ea80d27e2e2b322063274d5ac1270b7 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 11 Apr 2024 18:07:53 +0800 Subject: [PATCH] always update the state of the pdf item to succeeded/failed, even if we fail to parse the content --- .../api/src/services/update_pdf_content.ts | 21 ++-- packages/api/src/services/upload_file.ts | 26 ++--- packages/pdf-handler/src/index.ts | 105 +++++++++++------- packages/pdf-handler/src/job.ts | 5 +- 4 files changed, 91 insertions(+), 66 deletions(-) diff --git a/packages/api/src/services/update_pdf_content.ts b/packages/api/src/services/update_pdf_content.ts index 75d3e8438..4c337cfb4 100644 --- a/packages/api/src/services/update_pdf_content.ts +++ b/packages/api/src/services/update_pdf_content.ts @@ -8,16 +8,17 @@ import { findUploadFileById, setFileUploadComplete } from './upload_file' export interface UpdateContentMessage { fileId: string - content: string + content?: string title?: string author?: string description?: string + state?: LibraryItemState } export const isUpdateContentMessage = ( data: any ): data is UpdateContentMessage => { - return 'fileId' in data && 'content' in data + return 'fileId' in data } export const updateContentForFileItem = async (msg: UpdateContentMessage) => { @@ -51,16 +52,13 @@ export const updateContentForFileItem = async (msg: UpdateContentMessage) => { } const itemToUpdate: QueryDeepPartialEntity = { + ...msg, originalContent: msg.content, + // This event is fired after the file is fully uploaded, + // so along with updating content, we mark it as + // succeeded or failed based on the message state + state: msg.state || LibraryItemState.Succeeded, } - if (msg.title) itemToUpdate.title = msg.title - if (msg.author) itemToUpdate.author = msg.author - if (msg.description) itemToUpdate.description = msg.description - - // This event is fired after the file is fully uploaded, - // so along with updating content, we mark it as - // succeeded. - itemToUpdate.state = LibraryItemState.Succeeded try { const uploadFileData = await setFileUploadComplete( @@ -80,7 +78,8 @@ export const updateContentForFileItem = async (msg: UpdateContentMessage) => { logger.info('Updating library item text', { id: libraryItem.id, result, - content: msg.content.substring(0, 20), + content: msg.content?.substring(0, 20), + state: msg.state, }) return true diff --git a/packages/api/src/services/upload_file.ts b/packages/api/src/services/upload_file.ts index 1d284916f..148614352 100644 --- a/packages/api/src/services/upload_file.ts +++ b/packages/api/src/services/upload_file.ts @@ -18,6 +18,7 @@ import { } from '../utils/uploads' import { validateUrl } from './create_page_save_request' import { createOrUpdateLibraryItem } from './library_item' +import { v4 as uuid } from 'uuid' const isFileUrl = (url: string): boolean => { const parsedUrl = new URL(url) @@ -90,8 +91,18 @@ export const uploadFile = async ( } } + let url = input.url + + const uploadFileId = uuid() + const uploadFilePathName = generateUploadFilePathName(uploadFileId, fileName) + // If this is a file URL, we swap in a special URL + if (isFileUrl(url)) { + url = `https://omnivore.app/attachments/${uploadFilePathName}` + } + const uploadFileData = await authTrx((t) => t.getRepository(UploadFile).save({ + id: uploadFileId, url: input.url, user: { id: uid }, fileName, @@ -99,24 +110,11 @@ export const uploadFile = async ( contentType: input.contentType, }) ) - const uploadFileId = uploadFileData.id - const uploadFilePathName = generateUploadFilePathName(uploadFileId, fileName) const uploadSignedUrl = await generateUploadSignedUrl( uploadFilePathName, input.contentType ) - // If this is a file URL, we swap in a special URL - const attachmentUrl = `https://omnivore.app/attachments/${uploadFilePathName}` - if (isFileUrl(input.url)) { - await authTrx(async (tx) => { - await tx.getRepository(UploadFile).update(uploadFileId, { - url: attachmentUrl, - status: UploadFileStatus.Initialized, - }) - }) - } - const itemType = itemTypeForContentType(input.contentType) if (input.createPageEntry) { // If we have a file:// URL, don't try to match it @@ -125,7 +123,7 @@ export const uploadFile = async ( const item = await createOrUpdateLibraryItem( { id: input.clientRequestId || undefined, - originalUrl: isFileUrl(input.url) ? attachmentUrl : input.url, + originalUrl: url, user: { id: uid }, title, readableContent: '', diff --git a/packages/pdf-handler/src/index.ts b/packages/pdf-handler/src/index.ts index e1623372b..cf3f3359c 100644 --- a/packages/pdf-handler/src/index.ts +++ b/packages/pdf-handler/src/index.ts @@ -1,7 +1,7 @@ import { GetSignedUrlConfig, Storage } from '@google-cloud/storage' import * as Sentry from '@sentry/serverless' import { parsePdf } from './pdf' -import { queueUpdatePageJob } from './job' +import { queueUpdatePageJob, State } from './job' Sentry.GCPFunction.init({ dsn: process.env.SENTRY_DSN, @@ -50,10 +50,11 @@ const getDocumentUrl = async ( export const updatePageContent = async ( fileId: string, - content: string, + content?: string, title?: string, author?: string, - description?: string + description?: string, + state?: State ): Promise => { const job = await queueUpdatePageJob({ fileId, @@ -61,6 +62,7 @@ export const updatePageContent = async ( title, author, description, + state, }) return job.id } @@ -86,45 +88,68 @@ export const pdfHandler = Sentry.GCPFunction.wrapHttpFunction( if ('message' in req.body && 'data' in req.body.message) { const pubSubMessage = req.body.message.data as string const data = getStorageEventData(pubSubMessage) - if (data) { - try { - if (shouldHandle(data)) { - console.log('handling pdf data', data) - - const url = await getDocumentUrl(data) - console.log('PDF url: ', url) - if (!url) { - console.log('Could not fetch PDF', data.bucket, data.name) - return res.status(404).send('Could not fetch PDF') - } - - const parsed = await parsePdf(url) - const result = await updatePageContent( - data.name, - parsed.content, - parsed.title, - parsed.author, - parsed.description - ) - console.log( - 'publish result', - result, - 'title', - parsed.title, - 'author', - parsed.author - ) - } else { - console.log('not handling pdf data', data) - } - } catch (err) { - console.log('error handling event', { err, data }) - return res.status(500).send('Error handling event') - } + if (!data) { + console.log('no data found in pubsub message') + return res.send('ok') + } + + if (!shouldHandle(data)) { + console.log('not handling pdf data', data) + return res.send('ok') + } + + console.log('handling pdf data', data) + + let content, + title, + author, + description, + state: State = 'SUCCEEDED' // Default to succeeded even if we fail to parse + + try { + const url = await getDocumentUrl(data) + console.log('PDF url: ', url) + if (!url) { + console.log('Could not fetch PDF', data.bucket, data.name) + // If we can't fetch the PDF, mark it as failed + state = 'FAILED' + + return res.status(404).send('Could not fetch PDF') + } + + // Parse the PDF to update the content and metadata + const parsed = await parsePdf(url) + content = parsed.content + title = parsed.title + author = parsed.author + description = parsed.description + } catch (err) { + console.log('error parsing pdf', { err, data }) + + return res.status(500).send('Error parsing pdf') + } finally { + // Always update the state, even if we fail to parse + const result = await updatePageContent( + data.name, + content, + title, + author, + description, + state + ) + console.log( + 'publish result', + result, + 'title', + title, + 'author', + author, + 'state', + state + ) } - } else { - console.log('no pubsub message') } + res.send('ok') } ) diff --git a/packages/pdf-handler/src/job.ts b/packages/pdf-handler/src/job.ts index d244079d3..ca404aa0f 100644 --- a/packages/pdf-handler/src/job.ts +++ b/packages/pdf-handler/src/job.ts @@ -8,12 +8,15 @@ const queue = new Queue(QUEUE_NAME, { connection: redisDataSource.queueRedisClient, }) +export type State = 'SUCCEEDED' | 'FAILED' + type UpdatePageJobData = { fileId: string - content: string + content?: string title?: string author?: string description?: string + state?: State } export const queueUpdatePageJob = async (data: UpdatePageJobData) => {