From 52ebf466e35837ec722f259380507749166c4d45 Mon Sep 17 00:00:00 2001
From: Hongbo Wu
Date: Fri, 17 May 2024 16:46:54 +0800
Subject: [PATCH] get content from cache first when saving url

---
 packages/api/src/jobs/save_page.ts            | 64 ++++++++++++++-----
 packages/content-fetch/src/job.ts             |  1 +
 packages/content-fetch/src/request_handler.ts | 63 +++++++-----------
 3 files changed, 73 insertions(+), 55 deletions(-)

diff --git a/packages/api/src/jobs/save_page.ts b/packages/api/src/jobs/save_page.ts
index 332196e00..b82f4efb8 100644
--- a/packages/api/src/jobs/save_page.ts
+++ b/packages/api/src/jobs/save_page.ts
@@ -6,6 +6,7 @@ import {
   ArticleSavingRequestStatus,
   CreateLabelInput,
 } from '../generated/graphql'
+import { redisDataSource } from '../redis_data_source'
 import { userRepository } from '../repository/user'
 import { saveFile } from '../services/save_file'
 import { savePage } from '../services/save_page'
@@ -43,6 +44,19 @@ interface Data {
   rssFeedUrl?: string
   publishedAt?: string
   taskId?: string
+  cacheKey?: string
+}
+
+const getCachedContent = async (key: string): Promise<string | null> => {
+  const result = await redisDataSource.redisClient?.get(key)
+  if (!result) {
+    logger.info('fetch result is not cached', { key })
+    return null
+  }
+
+  logger.info('content is cached', { key })
+
+  return result
 }
 
 const uploadPdf = async (
@@ -133,6 +147,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
     title,
     contentType,
     state,
+    cacheKey,
   } = data
   let isImported, isSaved
 
@@ -185,25 +200,42 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
       return true
     }
 
-    // download the original content
-    const filePath = contentFilePath({
-      userId,
-      libraryItemId: articleSavingRequestId,
-      format: 'original',
-      savedAt: new Date(savedAt),
-    })
-    const exists = await isFileExists(filePath)
-    if (!exists) {
-      logger.error('Original content file does not exist', {
-        finalUrl,
-        filePath,
-      })
+    let content
 
-      throw new Error('Original content file does not exist')
+    if (cacheKey) {
+      logger.info('fetching content from cache', {
+        cacheKey,
+      })
+      content = await getCachedContent(cacheKey)
+
+      logger.info('fetched content from cache')
     }
 
-    const content = (await downloadFromBucket(filePath)).toString()
-    console.log('Downloaded original content from:', filePath)
+    if (!content) {
+      logger.info('downloading content from GCS', {
+        url,
+      })
+
+      // download the original content
+      const filePath = contentFilePath({
+        userId,
+        libraryItemId: articleSavingRequestId,
+        format: 'original',
+        savedAt: new Date(savedAt),
+      })
+      const exists = await isFileExists(filePath)
+      if (!exists) {
+        logger.error('Original content file does not exist', {
+          finalUrl,
+          filePath,
+        })
+
+        throw new Error('Original content file does not exist')
+      }
+
+      content = (await downloadFromBucket(filePath)).toString()
+      logger.info('Downloaded original content from:', { filePath })
+    }
 
     // for non-pdf content, we need to save the content
     const result = await savePage(
diff --git a/packages/content-fetch/src/job.ts b/packages/content-fetch/src/job.ts
index de39d4bc0..c6ce525e0 100644
--- a/packages/content-fetch/src/job.ts
+++ b/packages/content-fetch/src/job.ts
@@ -20,6 +20,7 @@ interface SavePageJobData {
   taskId?: string
   title?: string
   contentType?: string
+  cacheKey?: string
 }
 
 interface SavePageJob {
diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts
index 8172673bb..ba5d8b461 100644
--- a/packages/content-fetch/src/request_handler.ts
+++ b/packages/content-fetch/src/request_handler.ts
@@ -49,13 +49,6 @@ interface LogRecord {
   totalTime?: number
 }
 
-interface FetchResult {
-  finalUrl: string
-  title?: string
-  content?: string
-  contentType?: string
-}
-
 const storage = process.env.GCS_UPLOAD_SA_KEY_FILE_PATH
   ? new Storage({ keyFilename: process.env.GCS_UPLOAD_SA_KEY_FILE_PATH })
   : new Storage()
@@ -87,37 +80,22 @@ const uploadOriginalContent = async (
 const cacheKey = (url: string, locale = '', timezone = '') =>
   `fetch-result:${url}:${locale}:${timezone}`
 
-const isFetchResult = (obj: unknown): obj is FetchResult => {
-  return typeof obj === 'object' && obj !== null && 'finalUrl' in obj
-}
-
-export const cacheFetchResult = async (
-  key: string,
-  fetchResult: FetchResult
-) => {
+export const cacheContent = async (key: string, content: string) => {
   // cache the fetch result for 24 hours
   const ttl = 24 * 60 * 60
-  const value = JSON.stringify(fetchResult)
-  return redisDataSource.cacheClient.set(key, value, 'EX', ttl, 'NX')
+  return redisDataSource.cacheClient.set(key, content, 'EX', ttl, 'NX')
 }
 
-const getCachedFetchResult = async (
-  key: string
-): Promise<FetchResult | null> => {
-  const result = await redisDataSource.cacheClient.get(key)
-  if (!result) {
+const getCachedContent = async (key: string): Promise<string | null> => {
+  const content = await redisDataSource.cacheClient.get(key)
+  if (!content) {
     console.info('fetch result is not cached', key)
     return null
   }
 
-  const fetchResult = JSON.parse(result) as unknown
-  if (!isFetchResult(fetchResult)) {
-    throw new Error('fetch result is not valid')
-  }
+  console.info('content is cached', key)
 
-  console.info('fetch result is cached', key)
-
-  return fetchResult
+  return content
 }
 
 export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
@@ -171,25 +149,31 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
   console.log(`Article parsing request`, logRecord)
 
   try {
-    const key = cacheKey(url, locale, timezone)
-    let fetchResult = await getCachedFetchResult(key)
-    if (!fetchResult) {
-      console.log(
-        'fetch result not found in cache, fetching content now...',
-        url
-      )
+    let finalUrl: string,
+      title: string | undefined,
+      contentType: string | undefined
 
-      fetchResult = await fetchContent(url, locale, timezone)
+    const key = cacheKey(url, locale, timezone)
+    let content = await getCachedContent(key)
+    if (!content) {
+      console.log('content not found in cache, fetching content now...', url)
+
+      const fetchResult = await fetchContent(url, locale, timezone)
       console.log('content has been fetched')
 
       if (fetchResult.content) {
-        const cacheResult = await cacheFetchResult(url, fetchResult)
+        content = fetchResult.content
+        const cacheResult = await cacheContent(key, content)
+
         console.log('cache result', cacheResult)
       }
+
+      finalUrl = fetchResult.finalUrl
+      title = fetchResult.title
+      contentType = fetchResult.contentType
     }
 
     const savedDate = savedAt ? new Date(savedAt) : new Date()
-    const { finalUrl, title, content, contentType } = fetchResult
     if (content) {
       await uploadOriginalContent(users, content, savedDate.getTime())
     }
@@ -211,6 +195,7 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
         taskId,
         title,
         contentType,
+        cacheKey: key,
       },
       isRss: !!rssFeedUrl,
       isImport: !!taskId,