Hongbo Wu
2024-05-14 20:18:18 +08:00
parent dbd7b7932f
commit 9dee510be1
9 changed files with 141 additions and 107 deletions

View File

@@ -13,6 +13,7 @@
     "ioredis": "^5.3.2",
     "posthog-node": "^3.6.3",
     "@google-cloud/functions-framework": "^3.0.0",
+    "@google-cloud/storage": "^7.0.1",
     "@omnivore/puppeteer-parse": "^1.0.0",
     "@sentry/serverless": "^7.77.0"
   },

View File

@@ -9,6 +9,7 @@ interface SavePageJobData {
   url: string
   finalUrl: string
   articleSavingRequestId: string
   state?: string
   labels?: string[]
   source: string
@@ -17,6 +18,8 @@ interface SavePageJobData {
   savedAt?: string
   publishedAt?: string
   taskId?: string
+  title?: string
+  contentType?: string
 }
 
 interface SavePageJob {

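For reference, a job payload carrying the two new fields might look like the following; all values are invented for illustration:

```ts
// Hypothetical SavePageJobData payload after this change (values invented).
const jobData = {
  url: 'https://example.com/article',
  finalUrl: 'https://example.com/article',
  articleSavingRequestId: 'req-123',
  source: 'api',
  savedAt: '2024-05-14T12:18:18.000Z',
  // new in this commit, carried over from the fetch result:
  title: 'An Example Article',
  contentType: 'text/html',
}
```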
View File

@@ -1,11 +1,12 @@
+import { Storage } from '@google-cloud/storage'
 import { fetchContent } from '@omnivore/puppeteer-parse'
 import { RequestHandler } from 'express'
 import { analytics } from './analytics'
 import { queueSavePageJob } from './job'
-import { redisDataSource } from './redis_data_source'
 
-interface User {
+interface UserConfig {
   id: string
+  libraryItemId: string
   folder?: string
 }
@@ -23,7 +24,7 @@ interface RequestBody {
   savedAt?: string
   publishedAt?: string
   folder?: string
-  users?: User[]
+  users?: UserConfig[]
   priority: 'high' | 'low'
 }
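A request that fans one URL out to several users would, under the renamed UserConfig shape, look roughly like this; the IDs are invented, and each user now carries its own libraryItemId:

```ts
// Hypothetical RequestBody sketch (ids invented for illustration).
const body = {
  url: 'https://example.com/article',
  users: [
    { id: 'user-1', libraryItemId: 'item-1' },
    { id: 'user-2', libraryItemId: 'item-2', folder: 'following' },
  ],
  priority: 'high',
}
```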
@@ -42,24 +43,37 @@ interface LogRecord {
   savedAt?: string
   publishedAt?: string
   folder?: string
-  users?: User[]
+  users?: UserConfig[]
   error?: string
   totalTime?: number
 }
-interface FetchResult {
-  finalUrl: string
-  title?: string
-  content?: string
-  contentType?: string
-}
+const storage = process.env.GCS_UPLOAD_SA_KEY_FILE_PATH
+  ? new Storage({ keyFilename: process.env.GCS_UPLOAD_SA_KEY_FILE_PATH })
+  : new Storage()
+const bucketName = process.env.GCS_UPLOAD_BUCKET || 'omnivore-files'
+
+const uploadToBucket = async (filePath: string, data: string) => {
+  await storage
+    .bucket(bucketName)
+    .file(filePath)
+    .save(data, { public: false, timeout: 30000 })
+}
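uploadToBucket writes through the official @google-cloud/storage client; `public: false` keeps the object private and `timeout: 30000` caps the write at 30 s. A minimal sketch of the matching read path, assuming the same bucket configuration (the helper name is hypothetical):

```ts
// Sketch of the read path for objects written by uploadToBucket,
// assuming the same GCS_UPLOAD_BUCKET configuration as above.
const downloadFromBucket = async (filePath: string): Promise<string> => {
  // file.download() resolves to a [Buffer] tuple
  const [data] = await storage.bucket(bucketName).file(filePath).download()
  return data.toString('utf-8')
}
```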
-export const cacheFetchResult = async (fetchResult: FetchResult) => {
-  // cache the fetch result for 24 hours
-  const ttl = 24 * 60 * 60
-  const key = `fetch-result:${fetchResult.finalUrl}`
-  const value = JSON.stringify(fetchResult)
-  return redisDataSource.cacheClient.set(key, value, 'EX', ttl, 'NX')
-}
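The removed cacheFetchResult used Redis SET with EX (24-hour expiry) and NX (set only if the key is absent), so concurrent fetches of the same finalUrl would not overwrite each other; with NX, ioredis resolves to 'OK' on a fresh write and null when the key already exists. The read side it paired with would have looked roughly like this (a sketch, assuming an ioredis client):

```ts
// Sketch of the lookup that paired with the removed cache writer,
// assuming redisDataSource.cacheClient is an ioredis instance.
const getCachedFetchResult = async (finalUrl: string) => {
  const cached = await redisDataSource.cacheClient.get(`fetch-result:${finalUrl}`)
  return cached ? (JSON.parse(cached) as FetchResult) : null
}
```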
+const uploadOriginalContent = async (
+  users: UserConfig[],
+  content: string,
+  savedTimestamp: number
+) => {
+  await Promise.all(
+    users.map(async (user) => {
+      const filePath = `content/${user.id}/${user.libraryItemId}.${savedTimestamp}.original`
+      await uploadToBucket(filePath, content)
+      console.log(`Original content uploaded to ${filePath}`)
+    })
+  )
+}
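uploadOriginalContent fans the same raw content out to one object per user, keyed as content/&lt;userId&gt;/&lt;libraryItemId&gt;.&lt;timestamp&gt;.original. Because the uploads run under Promise.all, one failed upload rejects the whole batch, which the handler's try/catch below will surface. A usage sketch with invented values:

```ts
// Usage sketch (values invented); in the handler, `users`, `content`,
// and the timestamp come from the request and the fetch result.
await uploadOriginalContent(
  [{ id: 'user-1', libraryItemId: 'item-1' }],
  '<html>…</html>',
  Date.now()
)
// writes content/user-1/item-1.<epoch-millis>.original
```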
 export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
@@ -76,6 +90,7 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
       {
         id: userId,
         folder: body.folder,
+        libraryItemId: body.saveRequestId,
       },
     ]
   }
@@ -112,8 +127,12 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
   console.log(`Article parsing request`, logRecord)
 
   try {
+    const savedDate = savedAt ? new Date(savedAt) : new Date()
     const fetchResult = await fetchContent(url, locale, timezone)
-    const finalUrl = fetchResult.finalUrl
+    const { title, content, contentType, finalUrl } = fetchResult
+    if (content) {
+      await uploadOriginalContent(users, content, savedDate.getTime())
+    }
 
     const savePageJobs = users.map((user) => ({
       userId: user.id,
@@ -121,24 +140,23 @@ export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
         userId: user.id,
         url,
         finalUrl,
-        articleSavingRequestId,
+        articleSavingRequestId: user.libraryItemId,
         state,
         labels,
         source,
         folder: user.folder,
         rssFeedUrl,
-        savedAt,
+        savedAt: savedDate.toISOString(),
         publishedAt,
         taskId,
+        title,
+        contentType,
       },
       isRss: !!rssFeedUrl,
       isImport: !!taskId,
       priority,
     }))
 
-    const cacheResult = await cacheFetchResult(fetchResult)
-    console.log('cacheFetchResult result', cacheResult)
     const jobs = await queueSavePageJob(savePageJobs)
     console.log('save-page jobs queued', jobs.length)
   } catch (error) {
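One subtle fix in the last hunk: the job previously received the caller's savedAt string verbatim, while it now gets savedDate.toISOString(), so downstream consumers always see a well-formed ISO-8601 timestamp that falls back to "now" when the request omits savedAt:

```ts
// savedAt normalization as done in the handler above
const savedAt: string | undefined = undefined // e.g. request omitted it
const savedDate = savedAt ? new Date(savedAt) : new Date()
console.log(savedDate.toISOString()) // "2024-05-14T12:18:18.000Z"-style output
```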