Get content from cache first when saving a URL

This commit is contained in:
Hongbo Wu
2024-05-17 16:46:54 +08:00
parent 9c3d619ad5
commit 52ebf466e3
3 changed files with 73 additions and 55 deletions

View File

@ -6,6 +6,7 @@ import {
ArticleSavingRequestStatus,
CreateLabelInput,
} from '../generated/graphql'
import { redisDataSource } from '../redis_data_source'
import { userRepository } from '../repository/user'
import { saveFile } from '../services/save_file'
import { savePage } from '../services/save_page'
@ -43,6 +44,19 @@ interface Data {
rssFeedUrl?: string
publishedAt?: string
taskId?: string
cacheKey?: string
}
/**
 * Look up previously fetched page content in Redis.
 *
 * @param key - cache key produced by the fetcher (see the queue payload's `cacheKey`)
 * @returns the cached content string, or null when nothing is stored under
 *          `key` or when no Redis client is configured (optional chaining on
 *          `redisClient` yields undefined in that case).
 */
const getCachedContent = async (key: string): Promise<string | null> => {
  const cached = await redisDataSource.redisClient?.get(key)
  if (cached) {
    logger.info('content is cached', { key })
    return cached
  }
  logger.info('fetch result is not cached', { key })
  return null
}
const uploadPdf = async (
@ -133,6 +147,7 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
title,
contentType,
state,
cacheKey,
} = data
let isImported, isSaved
@ -185,25 +200,42 @@ export const savePageJob = async (data: Data, attemptsMade: number) => {
return true
}
// download the original content
const filePath = contentFilePath({
userId,
libraryItemId: articleSavingRequestId,
format: 'original',
savedAt: new Date(savedAt),
})
const exists = await isFileExists(filePath)
if (!exists) {
logger.error('Original content file does not exist', {
finalUrl,
filePath,
})
let content
throw new Error('Original content file does not exist')
if (cacheKey) {
logger.info('fetching content from cache', {
cacheKey,
})
content = await getCachedContent(cacheKey)
logger.info('fetched content from cache')
}
const content = (await downloadFromBucket(filePath)).toString()
console.log('Downloaded original content from:', filePath)
if (!content) {
logger.info('downloading content from GCS', {
url,
})
// download the original content
const filePath = contentFilePath({
userId,
libraryItemId: articleSavingRequestId,
format: 'original',
savedAt: new Date(savedAt),
})
const exists = await isFileExists(filePath)
if (!exists) {
logger.error('Original content file does not exist', {
finalUrl,
filePath,
})
throw new Error('Original content file does not exist')
}
content = (await downloadFromBucket(filePath)).toString()
logger.info('Downloaded original content from:', { filePath })
}
// for non-pdf content, we need to save the content
const result = await savePage(

View File

@ -20,6 +20,7 @@ interface SavePageJobData {
taskId?: string
title?: string
contentType?: string
cacheKey?: string
}
interface SavePageJob {

View File

@ -49,13 +49,6 @@ interface LogRecord {
totalTime?: number
}
// Result of fetching a page, as produced by fetchContent and cached in Redis.
interface FetchResult {
  // Final URL of the fetched page — presumably after redirects; TODO confirm
  // against fetchContent. Required: used as the discriminating property by
  // the isFetchResult type guard below.
  finalUrl: string
  // Parsed page title, when one could be extracted.
  title?: string
  // Raw fetched content; absent when the fetch produced no usable body.
  content?: string
  // MIME type of the fetched content, when known.
  contentType?: string
}
// GCS client: authenticate with the service-account key file when the env var
// is set; otherwise fall back to application-default credentials.
const storage = process.env.GCS_UPLOAD_SA_KEY_FILE_PATH
  ? new Storage({ keyFilename: process.env.GCS_UPLOAD_SA_KEY_FILE_PATH })
  : new Storage()
@ -87,37 +80,22 @@ const uploadOriginalContent = async (
/**
 * Build the Redis key under which the fetch result for a URL is cached.
 * Locale and timezone are part of the key because fetched content may vary
 * with reader settings; both default to the empty string.
 */
const cacheKey = (url: string, locale = '', timezone = ''): string =>
  ['fetch-result', url, locale, timezone].join(':')
/**
 * Runtime type guard for values deserialized from the cache: a value counts
 * as a FetchResult when it is a non-null object carrying a `finalUrl` key.
 */
const isFetchResult = (obj: unknown): obj is FetchResult => {
  if (obj === null || typeof obj !== 'object') {
    return false
  }
  return 'finalUrl' in obj
}
export const cacheFetchResult = async (
key: string,
fetchResult: FetchResult
) => {
export const cacheContent = async (key: string, content: string) => {
// cache the fetch result for 24 hours
const ttl = 24 * 60 * 60
const value = JSON.stringify(fetchResult)
return redisDataSource.cacheClient.set(key, value, 'EX', ttl, 'NX')
return redisDataSource.cacheClient.set(key, content, 'EX', ttl, 'NX')
}
const getCachedFetchResult = async (
key: string
): Promise<FetchResult | null> => {
const result = await redisDataSource.cacheClient.get(key)
if (!result) {
const getCachedContent = async (key: string): Promise<string | null> => {
const content = await redisDataSource.cacheClient.get(key)
if (!content) {
console.info('fetch result is not cached', key)
return null
}
const fetchResult = JSON.parse(result) as unknown
if (!isFetchResult(fetchResult)) {
throw new Error('fetch result is not valid')
}
console.info('content is cached', key)
console.info('fetch result is cached', key)
return fetchResult
return content
}
export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
@ -171,25 +149,31 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
console.log(`Article parsing request`, logRecord)
try {
const key = cacheKey(url, locale, timezone)
let fetchResult = await getCachedFetchResult(key)
if (!fetchResult) {
console.log(
'fetch result not found in cache, fetching content now...',
url
)
let finalUrl: string,
title: string | undefined,
contentType: string | undefined
fetchResult = await fetchContent(url, locale, timezone)
const key = cacheKey(url, locale, timezone)
let content = await getCachedContent(key)
if (!content) {
console.log('content not found in cache, fetching content now...', url)
const fetchResult = await fetchContent(url, locale, timezone)
console.log('content has been fetched')
if (fetchResult.content) {
const cacheResult = await cacheFetchResult(url, fetchResult)
content = fetchResult.content
const cacheResult = await cacheContent(key, content)
console.log('cache result', cacheResult)
}
finalUrl = fetchResult.finalUrl
title = fetchResult.title
contentType = fetchResult.contentType
}
const savedDate = savedAt ? new Date(savedAt) : new Date()
const { finalUrl, title, content, contentType } = fetchResult
if (content) {
await uploadOriginalContent(users, content, savedDate.getTime())
}
@ -211,6 +195,7 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
taskId,
title,
contentType,
cacheKey: key,
},
isRss: !!rssFeedUrl,
isImport: !!taskId,