use different queues for fast, slow and rss content fetch jobs
This commit is contained in:
@ -1,53 +0,0 @@
|
||||
FROM node:18.16-alpine
|
||||
|
||||
# Installs latest Chromium (92) package.
|
||||
RUN apk add --no-cache \
|
||||
chromium \
|
||||
nss \
|
||||
freetype \
|
||||
harfbuzz \
|
||||
ca-certificates \
|
||||
ttf-freefont \
|
||||
nodejs \
|
||||
yarn \
|
||||
g++ \
|
||||
make \
|
||||
python3
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
||||
ENV LAUNCH_HEADLESS=true
|
||||
ENV PORT 9090
|
||||
|
||||
COPY package.json .
|
||||
COPY yarn.lock .
|
||||
COPY tsconfig.json .
|
||||
COPY .prettierrc .
|
||||
COPY .eslintrc .
|
||||
|
||||
COPY /packages/content-fetch/package.json ./packages/content-fetch/package.json
|
||||
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
|
||||
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
|
||||
COPY /packages/utils/package.json ./packages/utils/package.json
|
||||
|
||||
RUN yarn install --pure-lockfile
|
||||
|
||||
ADD /packages/content-handler ./packages/content-handler
|
||||
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
|
||||
ADD /packages/content-fetch ./packages/content-fetch
|
||||
ADD /packages/utils ./packages/utils
|
||||
RUN yarn workspace @omnivore/utils build
|
||||
RUN yarn workspace @omnivore/content-handler build
|
||||
RUN yarn workspace @omnivore/puppeteer-parse build
|
||||
RUN yarn workspace @omnivore/content-fetch build
|
||||
|
||||
# After building, fetch the production dependencies
|
||||
RUN rm -rf /app/packages/content-fetch/node_modules
|
||||
RUN rm -rf /app/node_modules
|
||||
RUN yarn install --pure-lockfile --production
|
||||
|
||||
EXPOSE 9090
|
||||
|
||||
# USER pptruser
|
||||
ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch", "start_gcf"]
|
||||
@ -7,11 +7,9 @@
|
||||
"build/src"
|
||||
],
|
||||
"dependencies": {
|
||||
"@google-cloud/functions-framework": "^3.0.0",
|
||||
"@google-cloud/storage": "^7.0.1",
|
||||
"@omnivore/puppeteer-parse": "^1.0.0",
|
||||
"@omnivore/utils": "1.0.0",
|
||||
"@sentry/serverless": "^7.77.0",
|
||||
"bullmq": "^5.1.1",
|
||||
"dotenv": "^8.2.0",
|
||||
"express": "^4.17.1",
|
||||
@ -27,9 +25,7 @@
|
||||
"test:typecheck": "tsc --noEmit",
|
||||
"lint": "eslint src --ext ts,js,tsx,jsx",
|
||||
"build": "tsc",
|
||||
"start": "node build/src/app.js",
|
||||
"start_gcf": "functions-framework --port=9090 --target=puppeteer",
|
||||
"start_worker": "node build/src/worker.js"
|
||||
"start": "node build/src/app.js"
|
||||
},
|
||||
"volta": {
|
||||
"extends": "../../package.json"
|
||||
|
||||
@ -1,34 +1,138 @@
|
||||
import 'dotenv/config'
|
||||
import express from 'express'
|
||||
import { contentFetchRequestHandler } from './request_handler'
|
||||
import { RedisDataSource } from '@omnivore/utils'
|
||||
import { JobType } from 'bullmq'
|
||||
import express, { Express } from 'express'
|
||||
import asyncHandler from 'express-async-handler'
|
||||
import { createWorkers, getQueue } from './worker'
|
||||
|
||||
const app = express()
|
||||
const main = () => {
|
||||
console.log('[worker]: starting workers')
|
||||
|
||||
app.use(express.json())
|
||||
app.use(express.urlencoded({ extended: true }))
|
||||
const app: Express = express()
|
||||
const port = process.env.PORT || 3002
|
||||
|
||||
if (!process.env.VERIFICATION_TOKEN) {
|
||||
throw new Error('VERIFICATION_TOKEN environment variable is not set')
|
||||
// create redis source
|
||||
const redisDataSource = new RedisDataSource({
|
||||
cache: {
|
||||
url: process.env.REDIS_URL,
|
||||
cert: process.env.REDIS_CERT,
|
||||
},
|
||||
mq: {
|
||||
url: process.env.MQ_REDIS_URL,
|
||||
cert: process.env.MQ_REDIS_CERT,
|
||||
},
|
||||
})
|
||||
|
||||
const workers = createWorkers(redisDataSource)
|
||||
|
||||
const closeWorkers = async () => {
|
||||
await Promise.all(
|
||||
workers.map(async (worker) => {
|
||||
await worker.close()
|
||||
console.log('worker closed:', worker.name)
|
||||
})
|
||||
)
|
||||
}
|
||||
|
||||
// respond healthy to auto-scaler.
|
||||
app.get('/_ah/health', (req, res) => res.sendStatus(200))
|
||||
|
||||
app.get(
|
||||
'/lifecycle/prestop',
|
||||
asyncHandler(async (_req, res) => {
|
||||
console.log('prestop lifecycle hook called.')
|
||||
await closeWorkers()
|
||||
console.log('workers closed')
|
||||
|
||||
res.sendStatus(200)
|
||||
})
|
||||
)
|
||||
|
||||
app.get(
|
||||
'/metrics',
|
||||
asyncHandler(async (_, res) => {
|
||||
let output = ''
|
||||
|
||||
for (const worker of workers) {
|
||||
const queueName = worker.name
|
||||
const queue = await getQueue(
|
||||
redisDataSource.queueRedisClient,
|
||||
queueName
|
||||
)
|
||||
|
||||
const jobsTypes: Array<JobType> = [
|
||||
'active',
|
||||
'failed',
|
||||
'completed',
|
||||
'prioritized',
|
||||
]
|
||||
const counts = await queue.getJobCounts(...jobsTypes)
|
||||
|
||||
jobsTypes.forEach((metric) => {
|
||||
output += `# TYPE omnivore_queue_messages_${metric} gauge\n`
|
||||
output += `omnivore_queue_messages_${metric}{queue="${queueName}"} ${counts[metric]}\n`
|
||||
})
|
||||
|
||||
// Export the age of the oldest prioritized job in the queue
|
||||
const oldestJobs = await queue.getJobs(['prioritized'], 0, 1, true)
|
||||
if (oldestJobs.length > 0) {
|
||||
const currentTime = Date.now()
|
||||
const ageInSeconds = (currentTime - oldestJobs[0].timestamp) / 1000
|
||||
output += `# TYPE omnivore_queue_messages_oldest_job_age_seconds gauge\n`
|
||||
output += `omnivore_queue_messages_oldest_job_age_seconds{queue="${queueName}"} ${ageInSeconds}\n`
|
||||
} else {
|
||||
output += `# TYPE omnivore_queue_messages_oldest_job_age_seconds gauge\n`
|
||||
output += `omnivore_queue_messages_oldest_job_age_seconds{queue="${queueName}"} ${0}\n`
|
||||
}
|
||||
}
|
||||
|
||||
res.status(200).setHeader('Content-Type', 'text/plain').send(output)
|
||||
})
|
||||
)
|
||||
|
||||
const server = app.listen(port, () => {
|
||||
console.log(`[worker]: Workers started`)
|
||||
})
|
||||
|
||||
const gracefulShutdown = async (signal: string) => {
|
||||
console.log(`[worker]: Received ${signal}, closing server...`)
|
||||
await new Promise<void>((resolve) => {
|
||||
server.close((err) => {
|
||||
console.log('[worker]: Express server closed')
|
||||
if (err) {
|
||||
console.log('[worker]: error stopping server', { err })
|
||||
}
|
||||
|
||||
resolve()
|
||||
})
|
||||
})
|
||||
|
||||
await closeWorkers()
|
||||
console.log('[worker]: Workers closed')
|
||||
|
||||
await redisDataSource.shutdown()
|
||||
console.log('[worker]: Redis connection closed')
|
||||
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-misused-promises
|
||||
process.on('SIGINT', () => gracefulShutdown('SIGINT'))
|
||||
|
||||
// eslint-disable-next-line @typescript-eslint/no-misused-promises
|
||||
process.on('SIGTERM', () => gracefulShutdown('SIGTERM'))
|
||||
|
||||
process.on('uncaughtException', function (err) {
|
||||
// Handle the error safely
|
||||
console.error(err, 'Uncaught exception')
|
||||
})
|
||||
|
||||
process.on('unhandledRejection', (reason, promise) => {
|
||||
// Handle the error safely
|
||||
console.error({ promise, reason }, 'Unhandled Rejection at: Promise')
|
||||
})
|
||||
}
|
||||
|
||||
app.get('/_ah/health', (req, res) => res.sendStatus(200))
|
||||
|
||||
app.all('/', (req, res, next) => {
|
||||
if (req.method !== 'GET' && req.method !== 'POST') {
|
||||
console.error('request method is not GET or POST')
|
||||
return res.sendStatus(405)
|
||||
}
|
||||
|
||||
if (req.query.token !== process.env.VERIFICATION_TOKEN) {
|
||||
console.error('query does not include valid token')
|
||||
return res.sendStatus(403)
|
||||
}
|
||||
|
||||
return contentFetchRequestHandler(req, res, next)
|
||||
})
|
||||
|
||||
const PORT = process.env.PORT ? parseInt(process.env.PORT) : 8080
|
||||
app.listen(PORT, () => {
|
||||
console.log(`App listening on port ${PORT}`)
|
||||
console.log('Press Ctrl+C to quit.')
|
||||
})
|
||||
// only call main if the file was called from the CLI and wasn't required from another module
|
||||
if (require.main === module) {
|
||||
main()
|
||||
}
|
||||
|
||||
@ -1,31 +0,0 @@
|
||||
import { HttpFunction } from '@google-cloud/functions-framework'
|
||||
import * as Sentry from '@sentry/serverless'
|
||||
import 'dotenv/config'
|
||||
import { contentFetchRequestHandler } from './request_handler'
|
||||
|
||||
Sentry.GCPFunction.init({
|
||||
dsn: process.env.SENTRY_DSN,
|
||||
tracesSampleRate: 0,
|
||||
})
|
||||
|
||||
/**
|
||||
* Cloud Function entry point, HTTP trigger.
|
||||
* Loads the requested URL via Puppeteer, captures page content and sends it to backend
|
||||
*
|
||||
* @param {Object} req Cloud Function request context.
|
||||
* @param {Object} res Cloud Function response context.
|
||||
*/
|
||||
export const puppeteer = Sentry.GCPFunction.wrapHttpFunction(
|
||||
contentFetchRequestHandler as HttpFunction
|
||||
)
|
||||
|
||||
/**
|
||||
* Cloud Function entry point, HTTP trigger.
|
||||
* Loads the requested URL via Puppeteer and captures a screenshot of the provided element
|
||||
*
|
||||
* @param {Object} req Cloud Function request context.
|
||||
* Includes:
|
||||
* * url - URL address of the page to open
|
||||
* @param {Object} res Cloud Function response context.
|
||||
*/
|
||||
// exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview);
|
||||
@ -2,7 +2,6 @@ import { Storage } from '@google-cloud/storage'
|
||||
import { fetchContent } from '@omnivore/puppeteer-parse'
|
||||
import { RedisDataSource } from '@omnivore/utils'
|
||||
import 'dotenv/config'
|
||||
import { RequestHandler } from 'express'
|
||||
import { analytics } from './analytics'
|
||||
import { queueSavePageJob } from './job'
|
||||
|
||||
@ -321,29 +320,3 @@ export const processFetchContentJob = async (
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
export const contentFetchRequestHandler: RequestHandler = async (req, res) => {
|
||||
const data = <JobData>req.body
|
||||
|
||||
// create redis source
|
||||
const redisDataSource = new RedisDataSource({
|
||||
cache: {
|
||||
url: process.env.REDIS_URL,
|
||||
cert: process.env.REDIS_CERT,
|
||||
},
|
||||
mq: {
|
||||
url: process.env.MQ_REDIS_URL,
|
||||
cert: process.env.MQ_REDIS_CERT,
|
||||
},
|
||||
})
|
||||
|
||||
try {
|
||||
await processFetchContentJob(redisDataSource, data)
|
||||
} catch (error) {
|
||||
return res.sendStatus(500)
|
||||
} finally {
|
||||
await redisDataSource.shutdown()
|
||||
}
|
||||
|
||||
res.sendStatus(200)
|
||||
}
|
||||
|
||||
@ -1,15 +1,17 @@
|
||||
import { RedisDataSource } from '@omnivore/utils'
|
||||
import { Job, JobType, Queue, QueueEvents, RedisClient, Worker } from 'bullmq'
|
||||
import express, { Express } from 'express'
|
||||
import asyncHandler from 'express-async-handler'
|
||||
import { Job, Queue, QueueEvents, RedisClient, Worker } from 'bullmq'
|
||||
import { JobData, processFetchContentJob } from './request_handler'
|
||||
|
||||
const QUEUE_NAME = 'omnivore-content-fetch-queue'
|
||||
const FAST_QUEUE = 'omnivore-content-fetch-queue'
|
||||
const SLOW_QUEUE = 'omnivore-content-fetch-slow-queue'
|
||||
const RSS_QUEUE = 'omnivore-content-fetch-rss-queue'
|
||||
const QUEUE_NAMES = [FAST_QUEUE, SLOW_QUEUE, RSS_QUEUE] as const
|
||||
|
||||
export const getContentFetchQueue = async (
|
||||
connection: RedisClient
|
||||
export const getQueue = async (
|
||||
connection: RedisClient,
|
||||
queueName: string
|
||||
): Promise<Queue> => {
|
||||
const queue = new Queue(QUEUE_NAME, {
|
||||
const queue = new Queue(queueName, {
|
||||
connection,
|
||||
defaultJobOptions: {
|
||||
backoff: {
|
||||
@ -28,9 +30,29 @@ export const getContentFetchQueue = async (
|
||||
return queue
|
||||
}
|
||||
|
||||
const createWorker = (redisDataSource: RedisDataSource) => {
|
||||
return new Worker(
|
||||
QUEUE_NAME,
|
||||
const createWorker = (redisDataSource: RedisDataSource, queueName: string) => {
|
||||
const getLimiter = (queueName: string) => {
|
||||
switch (queueName) {
|
||||
case SLOW_QUEUE:
|
||||
return {
|
||||
max: 5,
|
||||
duration: 1000, // 1 second
|
||||
}
|
||||
case RSS_QUEUE:
|
||||
return {
|
||||
max: 3,
|
||||
duration: 1000, // 1 second
|
||||
}
|
||||
default:
|
||||
return {
|
||||
max: 10,
|
||||
duration: 1000, // 1 second
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const worker = new Worker(
|
||||
queueName,
|
||||
async (job: Job<JobData>) => {
|
||||
// process the job
|
||||
await processFetchContentJob(redisDataSource, job.data)
|
||||
@ -38,152 +60,39 @@ const createWorker = (redisDataSource: RedisDataSource) => {
|
||||
{
|
||||
connection: redisDataSource.queueRedisClient,
|
||||
autorun: true, // start processing jobs immediately
|
||||
limiter: {
|
||||
max: 50,
|
||||
duration: 1000, // 1 second
|
||||
},
|
||||
limiter: getLimiter(queueName),
|
||||
}
|
||||
)
|
||||
}
|
||||
|
||||
const main = () => {
|
||||
console.log('[worker]: starting worker')
|
||||
|
||||
const app: Express = express()
|
||||
const port = process.env.PORT || 3002
|
||||
|
||||
// create redis source
|
||||
const redisDataSource = new RedisDataSource({
|
||||
cache: {
|
||||
url: process.env.REDIS_URL,
|
||||
cert: process.env.REDIS_CERT,
|
||||
},
|
||||
mq: {
|
||||
url: process.env.MQ_REDIS_URL,
|
||||
cert: process.env.MQ_REDIS_CERT,
|
||||
},
|
||||
worker.on('error', (err) => {
|
||||
console.error('worker error:', err)
|
||||
})
|
||||
|
||||
// respond healthy to auto-scaler.
|
||||
app.get('/_ah/health', (req, res) => res.sendStatus(200))
|
||||
|
||||
app.get(
|
||||
'/lifecycle/prestop',
|
||||
asyncHandler(async (_req, res) => {
|
||||
console.log('prestop lifecycle hook called.')
|
||||
await worker.close()
|
||||
res.sendStatus(200)
|
||||
})
|
||||
)
|
||||
|
||||
app.get(
|
||||
'/metrics',
|
||||
asyncHandler(async (_, res) => {
|
||||
const queue = await getContentFetchQueue(redisDataSource.queueRedisClient)
|
||||
if (!queue) {
|
||||
res.sendStatus(400)
|
||||
return
|
||||
}
|
||||
|
||||
let output = ''
|
||||
const jobsTypes: JobType[] = [
|
||||
'active',
|
||||
'failed',
|
||||
'completed',
|
||||
'prioritized',
|
||||
]
|
||||
const counts = await queue.getJobCounts(...jobsTypes)
|
||||
|
||||
jobsTypes.forEach((metric) => {
|
||||
output += `# TYPE omnivore_queue_messages_${metric} gauge\n`
|
||||
output += `omnivore_queue_messages_${metric}{queue="${QUEUE_NAME}"} ${counts[metric]}\n`
|
||||
})
|
||||
|
||||
// Export the age of the oldest prioritized job in the queue
|
||||
const oldestJobs = await queue.getJobs(['prioritized'], 0, 1, true)
|
||||
if (oldestJobs.length > 0) {
|
||||
const currentTime = Date.now()
|
||||
const ageInSeconds = (currentTime - oldestJobs[0].timestamp) / 1000
|
||||
output += `# TYPE omnivore_queue_messages_oldest_job_age_seconds gauge\n`
|
||||
output += `omnivore_queue_messages_oldest_job_age_seconds{queue="${QUEUE_NAME}"} ${ageInSeconds}\n`
|
||||
} else {
|
||||
output += `# TYPE omnivore_queue_messages_oldest_job_age_seconds gauge\n`
|
||||
output += `omnivore_queue_messages_oldest_job_age_seconds{queue="${QUEUE_NAME}"} ${0}\n`
|
||||
}
|
||||
|
||||
res.status(200).setHeader('Content-Type', 'text/plain').send(output)
|
||||
})
|
||||
)
|
||||
|
||||
const server = app.listen(port, () => {
|
||||
console.log(`[worker]: started`)
|
||||
})
|
||||
|
||||
const worker = createWorker(redisDataSource)
|
||||
|
||||
const queueEvents = new QueueEvents(QUEUE_NAME, {
|
||||
const queueEvents = new QueueEvents(queueName, {
|
||||
connection: redisDataSource.queueRedisClient,
|
||||
})
|
||||
|
||||
queueEvents.on('added', (job) => {
|
||||
console.log('added job: ', job.jobId, job.name)
|
||||
console.log('added job:', job.jobId, job.name)
|
||||
})
|
||||
|
||||
queueEvents.on('removed', (job) => {
|
||||
console.log('removed job: ', job.jobId)
|
||||
console.log('removed job:', job.jobId)
|
||||
})
|
||||
|
||||
queueEvents.on('completed', (job) => {
|
||||
console.log('completed job: ', job.jobId)
|
||||
console.log('completed job:', job.jobId)
|
||||
})
|
||||
|
||||
queueEvents.on('failed', (job) => {
|
||||
console.log('failed job: ', job.jobId)
|
||||
console.log('failed job:', job.jobId)
|
||||
})
|
||||
|
||||
const gracefulShutdown = async (signal: string) => {
|
||||
console.log(`[worker]: Received ${signal}, closing server...`)
|
||||
await new Promise<void>((resolve) => {
|
||||
server.close((err) => {
|
||||
console.log('[worker]: Express server closed')
|
||||
if (err) {
|
||||
console.log('[worker]: error stopping server', { err })
|
||||
}
|
||||
|
||||
resolve()
|
||||
})
|
||||
})
|
||||
|
||||
await worker.close()
|
||||
console.log('[worker]: Worker closed')
|
||||
|
||||
await redisDataSource.shutdown()
|
||||
console.log('[worker]: Redis connection closed')
|
||||
|
||||
process.exit(0)
|
||||
}
|
||||
|
||||
const handleShutdown = (signal: string) => {
|
||||
return () => {
|
||||
void gracefulShutdown(signal)
|
||||
}
|
||||
}
|
||||
|
||||
process.on('SIGTERM', handleShutdown('SIGTERM'))
|
||||
process.on('SIGINT', handleShutdown('SIGINT'))
|
||||
|
||||
process.on('uncaughtException', (error) => {
|
||||
console.error('Uncaught Exception:', error)
|
||||
handleShutdown('uncaughtException')
|
||||
})
|
||||
|
||||
process.on('unhandledRejection', (reason, promise) => {
|
||||
console.error('Unhandled Rejection at:', promise, 'reason:', reason)
|
||||
handleShutdown('unhandledRejection')
|
||||
})
|
||||
return worker
|
||||
}
|
||||
|
||||
// only call main if the file was called from the CLI and wasn't required from another module
|
||||
if (require.main === module) {
|
||||
main()
|
||||
export const createWorkers = (redisDataSource: RedisDataSource) => {
|
||||
return QUEUE_NAMES.map((queueName) =>
|
||||
createWorker(redisDataSource, queueName)
|
||||
)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user