omnivore/packages/content-fetch/src/request_handler.ts

import { Storage } from '@google-cloud/storage'
import { fetchContent } from '@omnivore/puppeteer-parse'
import { RedisDataSource } from '@omnivore/utils'
import 'dotenv/config'
import { RequestHandler } from 'express'
import { analytics } from './analytics'
import { queueSavePageJob } from './job'

interface UserConfig {
  id: string
  libraryItemId: string
  folder?: string
}

interface RequestBody {
  url: string
  userId?: string
  saveRequestId: string
  state?: string
  labels?: string[]
  source?: string
  taskId?: string
  locale?: string
  timezone?: string
  rssFeedUrl?: string
  savedAt?: string
  publishedAt?: string
  folder?: string
  users?: UserConfig[]
  priority: 'high' | 'low'
}
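
/*
 * A representative request body (illustrative values only; the field
 * semantics are defined by the interface above):
 *
 *   {
 *     "url": "https://example.com/article",
 *     "userId": "user-123",
 *     "saveRequestId": "item-456",
 *     "source": "rss-feeder",
 *     "rssFeedUrl": "https://example.com/feed.xml",
 *     "priority": "low"
 *   }
 */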

interface LogRecord {
  url: string
  articleSavingRequestId: string
  labels: {
    source: string
  }
  state?: string
  labelsToAdd?: string[]
  taskId?: string
  locale?: string
  timezone?: string
  rssFeedUrl?: string
  savedAt?: string
  publishedAt?: string
  folder?: string
  users?: UserConfig[]
  error?: string
  totalTime?: number
}

interface FetchResult {
  finalUrl: string
  title?: string
  content?: string
  contentType?: string
}

const storage = process.env.GCS_UPLOAD_SA_KEY_FILE_PATH
  ? new Storage({ keyFilename: process.env.GCS_UPLOAD_SA_KEY_FILE_PATH })
  : new Storage()

const bucketName = process.env.GCS_UPLOAD_BUCKET || 'omnivore-files'

// URLs whose fetch results are never cached
const NO_CACHE_URLS = [
  'https://deviceandbrowserinfo.com/are_you_a_bot',
  'https://deviceandbrowserinfo.com/info_device',
]
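
/**
 * Persist a single object to the configured GCS bucket. Uploads are kept
 * private and capped at a 5-second timeout.
 */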
const uploadToBucket = async (filePath: string, data: string) => {
  await storage
    .bucket(bucketName)
    .file(filePath)
    .save(data, { public: false, timeout: 5000 })
}
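
/**
 * Upload the original (pre-parse) content once per target user, under
 * `content/<userId>/<libraryItemId>.<savedTimestamp>.original`.
 */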
const uploadOriginalContent = async (
  users: UserConfig[],
  content: string,
  savedTimestamp: number
) => {
  await Promise.all(
    users.map(async (user) => {
      const filePath = `content/${user.id}/${user.libraryItemId}.${savedTimestamp}.original`
      await uploadToBucket(filePath, content)
      console.log(`Original content uploaded to ${filePath}`)
    })
  )
}
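
// Cache keys include locale and timezone since the rendered result of a page
// can differ across both (fetchContent takes the same two parameters).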
const cacheKey = (url: string, locale = '', timezone = '') =>
  `fetch-result:${url}:${locale}:${timezone}`

const isFetchResult = (obj: unknown): obj is FetchResult => {
  return typeof obj === 'object' && obj !== null && 'finalUrl' in obj
}
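
/**
 * Cache a fetch result for 24 hours. `NX` means only the first writer for a
 * key wins, so concurrent fetches of the same URL will not overwrite each
 * other.
 *
 * @example
 * // a minimal sketch, assuming a connected RedisDataSource named `redis`:
 * // await cacheFetchResult(redis, cacheKey(url), { finalUrl: url })
 */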
export const cacheFetchResult = async (
  redisDataSource: RedisDataSource,
  key: string,
  fetchResult: FetchResult
) => {
  // cache the fetch result for 24 hours
  const ttl = 24 * 60 * 60
  const value = JSON.stringify(fetchResult)
  return redisDataSource.cacheClient.set(key, value, 'EX', ttl, 'NX')
}
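
/**
 * Look up a previously cached fetch result. Returns `undefined` on a cache
 * miss or when the cached value does not look like a `FetchResult`.
 */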
const getCachedFetchResult = async (
  redisDataSource: RedisDataSource,
  key: string
): Promise<FetchResult | undefined> => {
  const result = await redisDataSource.cacheClient.get(key)
  if (!result) {
    console.info('fetch result is not cached', key)
    return undefined
  }

  const fetchResult = JSON.parse(result) as unknown
  if (!isFetchResult(fetchResult)) {
    console.error('invalid fetch result in cache', key)
    return undefined
  }

  console.info('fetch result is cached', key)
  return fetchResult
}
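
// Per-domain failure tracking: a domain is blocked outright when hard-coded
// below, or temporarily once its recent fetch-failure count exceeds
// MAX_FEED_FETCH_FAILURES (default 10).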
const failureRedisKey = (domain: string) => `fetch-failure:${domain}`

const isDomainBlocked = async (
  redisDataSource: RedisDataSource,
  domain: string
) => {
  const blockedDomains = ['localhost', 'weibo.com']
  if (blockedDomains.includes(domain)) {
    return true
  }

  const key = failureRedisKey(domain)
  const redisClient = redisDataSource.cacheClient
  try {
    const result = await redisClient.get(key)
    // if the domain has failed to fetch more than the configured number of
    // times, block it
    const maxFailures = parseInt(
      process.env.MAX_FEED_FETCH_FAILURES ?? '10',
      10
    )
    if (result && parseInt(result, 10) > maxFailures) {
      console.info(`domain is blocked: ${domain}`)
      return true
    }
  } catch (error) {
    console.error('Failed to check domain block status', { domain, error })
  }

  return false
}
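
/**
 * Record one fetch failure for a domain. Note the key's TTL is reset to one
 * hour on every failure, so the counter only clears after a quiet hour.
 */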
const incrementContentFetchFailure = async (
  redisDataSource: RedisDataSource,
  domain: string
) => {
  const redisClient = redisDataSource.cacheClient
  const key = failureRedisKey(domain)
  try {
    const result = await redisClient.incr(key)
    // expire the key in 1 hour
    await redisClient.expire(key, 60 * 60)
    return result
  } catch (error) {
    console.error('Failed to increment failure in redis', { domain, error })
    return null
  }
}
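
/**
 * Express handler for content-fetch requests. The flow: resolve the target
 * users, skip blocked domains, serve the fetch result from cache or fetch it
 * fresh, upload the original content to GCS, then queue one save-page job per
 * user. Responds 200 even for blocked domains (the request is simply dropped)
 * and 500 on fetch or queueing errors.
 */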
export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
  const functionStartTime = Date.now()
  const body = req.body as RequestBody

  // users is used when saving an article for multiple users
  let users = body.users || []
  const userId = body.userId
  // userId is used when saving an article for a single user
  if (userId) {
    users = [
      {
        id: userId,
        folder: body.folder,
        libraryItemId: body.saveRequestId,
      },
    ]
  }

  const articleSavingRequestId = body.saveRequestId
  const state = body.state
  const labels = body.labels
  const source = body.source || 'puppeteer-parse'
  const taskId = body.taskId // taskId is used to update the import status
  const url = body.url
  const locale = body.locale
  const timezone = body.timezone
  const rssFeedUrl = body.rssFeedUrl
  const savedAt = body.savedAt
  const publishedAt = body.publishedAt
  const priority = body.priority

  const logRecord: LogRecord = {
    url,
    articleSavingRequestId,
    labels: {
      source,
    },
    state,
    labelsToAdd: labels,
    taskId: taskId,
    locale,
    timezone,
    rssFeedUrl,
    savedAt,
    publishedAt,
    users,
  }

  console.log(`Article parsing request`, logRecord)

  // create the redis data source
  const redisDataSource = new RedisDataSource({
    cache: {
      url: process.env.REDIS_URL,
      cert: process.env.REDIS_CERT,
    },
    mq: {
      url: process.env.MQ_REDIS_URL,
      cert: process.env.MQ_REDIS_CERT,
    },
  })

  try {
    const domain = new URL(url).hostname
    const isBlocked = await isDomainBlocked(redisDataSource, domain)
    if (isBlocked) {
      console.log('domain is blocked', domain)
      return res.sendStatus(200)
    }

    const key = cacheKey(url, locale, timezone)
    let fetchResult = await getCachedFetchResult(redisDataSource, key)
    if (!fetchResult) {
      console.log(
        'fetch result not found in cache, fetching content now...',
        url
      )

      try {
        fetchResult = await fetchContent(url, locale, timezone)
        console.log('content has been fetched')
      } catch (error) {
        await incrementContentFetchFailure(redisDataSource, domain)
        throw error
      }

      if (fetchResult.content && !NO_CACHE_URLS.includes(url)) {
        const cacheResult = await cacheFetchResult(
          redisDataSource,
          key,
          fetchResult
        )
        console.log('cache result', cacheResult)
      }
    }

    const savedDate = savedAt ? new Date(savedAt) : new Date()
    const { finalUrl, title, content, contentType } = fetchResult
    if (content) {
      await uploadOriginalContent(users, content, savedDate.getTime())
    }

    const savePageJobs = users.map((user) => ({
      userId: user.id,
      data: {
        userId: user.id,
        url,
        finalUrl,
        articleSavingRequestId: user.libraryItemId,
        state,
        labels,
        source,
        folder: user.folder,
        rssFeedUrl,
        savedAt: savedDate.toISOString(),
        publishedAt,
        taskId,
        title,
        contentType,
        cacheKey: key,
      },
      isRss: !!rssFeedUrl,
      isImport: !!taskId,
      priority,
    }))

    const jobs = await queueSavePageJob(redisDataSource, savePageJobs)
    console.log('save-page jobs queued', jobs.length)
  } catch (error) {
    if (error instanceof Error) {
      logRecord.error = error.message
    } else {
      logRecord.error = 'unknown error'
    }

    return res.sendStatus(500)
  } finally {
    logRecord.totalTime = Date.now() - functionStartTime
    console.log(`parse-page result`, logRecord)

    // capture analytics events
    analytics.capture(
      users.map((user) => user.id),
      {
        result: logRecord.error ? 'failure' : 'success',
        properties: {
          url,
          source,
          totalTime: logRecord.totalTime,
          errorMessage: logRecord.error,
        },
      }
    )

    await redisDataSource.shutdown()
  }

  res.sendStatus(200)
}
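
/*
 * A minimal sketch of mounting this handler, assuming a plain Express app
 * (the actual service wiring lives elsewhere in this package):
 *
 *   import express from 'express'
 *   import { contentFetchRequestHandler } from './request_handler'
 *
 *   const app = express()
 *   app.use(express.json())
 *   app.post('/', contentFetchRequestHandler)
 *   app.listen(8080)
 */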