From 8cfa24a847f27d330eda5e34fc9f8d3ff0353d37 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 15 May 2024 21:46:22 +0800 Subject: [PATCH] allow downloading/uploading readable content --- packages/api/src/jobs/save_page.ts | 10 +++--- packages/api/src/jobs/upload_content.ts | 36 ++++++++++++++------ packages/api/src/routers/content_router.ts | 24 +++++++------ packages/api/src/services/library_item.ts | 18 ++++++++-- packages/api/src/utils/createTask.ts | 3 ++ packages/api/src/utils/highlightGenerator.ts | 2 +- packages/api/src/utils/uploads.ts | 26 +++++++++++--- 7 files changed, 84 insertions(+), 35 deletions(-) diff --git a/packages/api/src/jobs/save_page.ts b/packages/api/src/jobs/save_page.ts index bf338a167..332196e00 100644 --- a/packages/api/src/jobs/save_page.ts +++ b/packages/api/src/jobs/save_page.ts @@ -186,12 +186,12 @@ export const savePageJob = async (data: Data, attemptsMade: number) => { } // download the original content - const filePath = contentFilePath( + const filePath = contentFilePath({ userId, - articleSavingRequestId, - new Date(savedAt).getTime(), - 'original' - ) + libraryItemId: articleSavingRequestId, + format: 'original', + savedAt: new Date(savedAt), + }) const exists = await isFileExists(filePath) if (!exists) { logger.error('Original content file does not exist', { diff --git a/packages/api/src/jobs/upload_content.ts b/packages/api/src/jobs/upload_content.ts index 7159510b2..fbddd79ee 100644 --- a/packages/api/src/jobs/upload_content.ts +++ b/packages/api/src/jobs/upload_content.ts @@ -2,11 +2,15 @@ import { Highlight } from '../entity/highlight' import { findLibraryItemById } from '../services/library_item' import { logger } from '../utils/logger' import { htmlToHighlightedMarkdown, htmlToMarkdown } from '../utils/parser' -import { uploadToBucket } from '../utils/uploads' +import { isFileExists, uploadToBucket } from '../utils/uploads' export const UPLOAD_CONTENT_JOB = 'UPLOAD_CONTENT_JOB' -export type ContentFormat = 'markdown' | 'highlightedMarkdown' | 'original' +export type ContentFormat = + | 'markdown' + | 'highlightedMarkdown' + | 'original' + | 'readable' export interface UploadContentJobData { libraryItemId: string @@ -26,6 +30,7 @@ const convertContent = ( case 'highlightedMarkdown': return htmlToHighlightedMarkdown(content, highlights) case 'original': + case 'readable': return content default: throw new Error('Unsupported format') @@ -36,6 +41,7 @@ const CONTENT_TYPES = { markdown: 'text/markdown', highlightedMarkdown: 'text/markdown', original: 'text/html', + readable: 'text/html', } const getSelectOptions = ( @@ -43,6 +49,7 @@ const getSelectOptions = ( ): { column: 'readableContent' | 'originalContent'; highlights?: boolean } => { switch (format) { case 'markdown': + case 'readable': return { column: 'readableContent', } @@ -73,31 +80,40 @@ export const uploadContentJob = async (data: UploadContentJobData) => { }, }) if (!libraryItem) { - logger.error('Library item not found', data) + logger.error(`Library item not found: ${libraryItemId}`) throw new Error('Library item not found') } const content = libraryItem[column] if (!content) { - logger.error(`${column} not found`, data) + logger.error(`${column} not found`) throw new Error('Content not found') } - logger.info('Converting content', data) + logger.info('Converting content') const convertedContent = convertContent( content, format, libraryItem.highlights ) - console.time('uploadToBucket') - logger.info('Uploading content', data) + const exists = await isFileExists(filePath) + if (exists) { + logger.info(`File already exists: ${filePath}`) + return + } + + logger.info(`Uploading content: ${filePath}`) + logger.profile('Uploader') + await uploadToBucket(filePath, Buffer.from(convertedContent), { contentType: CONTENT_TYPES[format], - timeout: 60000, // 1 minute + timeout: 10_000, // 10 seconds }) - console.timeEnd('uploadToBucket') - logger.info('Content uploaded', data) + logger.profile('Uploader', { + level: 'info', + message: 'Content uploaded', + }) } diff --git a/packages/api/src/routers/content_router.ts b/packages/api/src/routers/content_router.ts index b3110d354..14643b89a 100644 --- a/packages/api/src/routers/content_router.ts +++ b/packages/api/src/routers/content_router.ts @@ -72,14 +72,13 @@ export function contentRouter() { // generate signed url for each library item const data = await Promise.all( libraryItems.map(async (libraryItem) => { - const date = - format === 'original' ? libraryItem.savedAt : libraryItem.updatedAt - const filePath = contentFilePath( + const filePath = contentFilePath({ userId, - libraryItem.id, - date.getTime(), - format - ) + libraryItemId: libraryItem.id, + format, + savedAt: libraryItem.savedAt, + updatedAt: libraryItem.updatedAt, + }) try { const downloadUrl = await generateDownloadSignedUrl(filePath, { @@ -89,7 +88,7 @@ export function contentRouter() { // check if file is already uploaded const exists = await isFileExists(filePath) if (exists) { - logger.info('File already exists', filePath) + logger.info(`File already exists: ${filePath}`) } return { @@ -109,7 +108,10 @@ export function contentRouter() { } }) ) - logger.info('Signed urls generated', data) + logger.info( + 'Signed urls generated', + data.map((d) => d.downloadUrl) + ) // skip uploading if there is an error or file already exists const uploadData = data.filter( @@ -117,8 +119,8 @@ export function contentRouter() { ) as UploadContentJobData[] if (uploadData.length > 0) { - await enqueueBulkUploadContentJob(uploadData) - logger.info('Bulk upload content job enqueued', uploadData) + const jobs = await enqueueBulkUploadContentJob(uploadData) + logger.info('Bulk upload content job enqueued', jobs) } res.send({ diff --git a/packages/api/src/services/library_item.ts b/packages/api/src/services/library_item.ts index f62d9d2a4..9ad7f8471 100644 --- a/packages/api/src/services/library_item.ts +++ b/packages/api/src/services/library_item.ts @@ -1695,14 +1695,21 @@ export const uploadOriginalContent = async ( userId: string, libraryItemId: string, savedAt: Date, - originalContent: string + originalContent: string, + timeout = 10_000 // 10 seconds ) => { await uploadToBucket( - contentFilePath(userId, libraryItemId, savedAt.getTime(), 'original'), + contentFilePath({ + userId, + libraryItemId, + savedAt, + format: 'original', + }), Buffer.from(originalContent), { public: false, contentType: 'text/html', + timeout, } ) } @@ -1713,6 +1720,11 @@ export const downloadOriginalContent = async ( savedAt: Date ) => { return downloadFromBucket( - contentFilePath(userId, libraryItemId, savedAt.getTime(), 'original') + contentFilePath({ + userId, + libraryItemId, + savedAt, + format: 'original', + }) ) } diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index 546b69d11..e06c1bb99 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -970,6 +970,9 @@ export const enqueueBulkUploadContentJob = async ( name: UPLOAD_CONTENT_JOB, data: d, opts: { + jobId: `${UPLOAD_CONTENT_JOB}_${d.filePath}_${JOB_VERSION}`, // dedupe by job id + removeOnComplete: true, + removeOnFail: true, attempts: 3, priority: getJobPriority(UPLOAD_CONTENT_JOB), }, diff --git a/packages/api/src/utils/highlightGenerator.ts b/packages/api/src/utils/highlightGenerator.ts index b3f4448ba..8f9da4e1f 100644 --- a/packages/api/src/utils/highlightGenerator.ts +++ b/packages/api/src/utils/highlightGenerator.ts @@ -49,7 +49,7 @@ type FillNodeResponse = { } function getTextNodesBetween(rootNode: Node, startNode: Node, endNode: Node) { - const maxTime = 1000 * 60 // 60 seconds + const maxTime = 10_000 // 10 seconds const start = Date.now() let textNodeStartingPoint = 0 let articleText = '' diff --git a/packages/api/src/utils/uploads.ts b/packages/api/src/utils/uploads.ts index 62acc9ce4..801f4f623 100644 --- a/packages/api/src/utils/uploads.ts +++ b/packages/api/src/utils/uploads.ts @@ -163,9 +163,25 @@ export const downloadFromBucket = async (filePath: string): Promise => { return data } -export const contentFilePath = ( - userId: string, - libraryItemId: string, - timestamp: number, +export const contentFilePath = ({ + userId, + libraryItemId, + format, + savedAt, + updatedAt, +}: { + userId: string + libraryItemId: string format: ContentFormat -) => `content/${userId}/${libraryItemId}.${timestamp}.${format}` + savedAt?: Date + updatedAt?: Date +}) => { + // Use updatedAt for highlightedMarkdown format because highlights are saved + const date = format === 'highlightedMarkdown' ? updatedAt : savedAt + + if (!date) { + throw new Error('Date not found') + } + + return `content/${userId}/${libraryItemId}.${date.getTime()}.${format}` +}