From f3ce6f4d4e19ff7d91b0f606767a89d3f155fcd9 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 17 May 2024 15:55:28 +0800 Subject: [PATCH] cache content fetch result in redis --- packages/content-fetch/src/request_handler.ts | 65 ++++++++++++++++++- 1 file changed, 63 insertions(+), 2 deletions(-) diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts index e43a7d88a..1decbacb6 100644 --- a/packages/content-fetch/src/request_handler.ts +++ b/packages/content-fetch/src/request_handler.ts @@ -3,6 +3,7 @@ import { fetchContent } from '@omnivore/puppeteer-parse' import { RequestHandler } from 'express' import { analytics } from './analytics' import { queueSavePageJob } from './job' +import { redisDataSource } from './redis_data_source' interface UserConfig { id: string @@ -48,6 +49,13 @@ interface LogRecord { totalTime?: number } +interface FetchResult { + finalUrl: string + title?: string + content?: string + contentType?: string +} + const storage = process.env.GCS_UPLOAD_SA_KEY_FILE_PATH ? 
new Storage({ keyFilename: process.env.GCS_UPLOAD_SA_KEY_FILE_PATH }) : new Storage() @@ -76,6 +84,44 @@ const uploadOriginalContent = async ( ) } +const cacheKey = (url: string) => `fetch-result:${url}` + +const isFetchResult = (obj: unknown): obj is FetchResult => { + return typeof obj === 'object' && obj !== null && 'finalUrl' in obj +} + +export const cacheFetchResult = async ( + url: string, + fetchResult: FetchResult +) => { + // cache the fetch result for 24 hours + const ttl = 24 * 60 * 60 + const key = cacheKey(url) + const value = JSON.stringify(fetchResult) + return redisDataSource.cacheClient.set(key, value, 'EX', ttl, 'NX') +} + +const getCachedFetchResult = async ( + url: string +): Promise<FetchResult | null> => { + const key = cacheKey(url) + + const result = await redisDataSource.cacheClient.get(key) + if (!result) { + console.info('fetch result is not cached', url) + return null + } + + const fetchResult = JSON.parse(result) as unknown + if (!isFetchResult(fetchResult)) { + throw new Error('fetch result is not valid') + } + + console.info('fetch result is cached', url) + + return fetchResult +} + export const contentFetchRequestHandler: RequestHandler = async (req, res) => { const functionStartTime = Date.now() @@ -127,9 +173,24 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => { console.log(`Article parsing request`, logRecord) try { + let fetchResult = await getCachedFetchResult(url) + if (!fetchResult) { + console.log( + 'fetch result not found in cache, fetching content now...', + url + ) + + fetchResult = await fetchContent(url, locale, timezone) + console.log('content has been fetched') + + if (fetchResult.content) { + const cacheResult = await cacheFetchResult(url, fetchResult) + console.log('cache result', cacheResult) + } + } + const savedDate = savedAt ? 
new Date(savedAt) : new Date() - const fetchResult = await fetchContent(url, locale, timezone) - const { title, content, contentType, finalUrl } = fetchResult + const { finalUrl, title, content, contentType } = fetchResult if (content) { await uploadOriginalContent(users, content, savedDate.getTime()) }