create a job for finding thumbnail
This commit is contained in:
@ -71,6 +71,7 @@
|
||||
"graphql-shield": "^7.5.0",
|
||||
"highlightjs": "^9.16.2",
|
||||
"html-entities": "^2.3.2",
|
||||
"image-size": "^1.0.2",
|
||||
"intercom-client": "^3.1.4",
|
||||
"ioredis": "^5.3.2",
|
||||
"jsonwebtoken": "^8.5.1",
|
||||
|
||||
174
packages/api/src/jobs/find_thumbnail.ts
Normal file
174
packages/api/src/jobs/find_thumbnail.ts
Normal file
@ -0,0 +1,174 @@
|
||||
import axios, { AxiosResponse } from 'axios'
|
||||
import sizeOf from 'image-size'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import {
|
||||
findLibraryItemById,
|
||||
updateLibraryItem,
|
||||
} from '../services/library_item'
|
||||
import { logger } from '../utils/logger'
|
||||
|
||||
/**
 * Input handed to `findThumbnail`: identifies the library item to process
 * and the user the lookup and update are performed for.
 */
interface Data {
  /** Id passed to `findLibraryItemById` / `updateLibraryItem`. */
  libraryItemId: string
  /** Id of the user passed alongside the library item id. */
  userId: string
}
|
||||
|
||||
/**
 * Measured dimensions of one image, keyed by the `src` it was fetched from.
 */
interface ImageSize {
  /** The `src` value the image was fetched from. */
  src: string
  /** Width reported by `image-size`. */
  width: number
  /** Height reported by `image-size`. */
  height: number
}
|
||||
|
||||
// Job name used both when enqueueing a thumbnail job and when the queue
// worker dispatches a job to `findThumbnail`.
export const THUMBNAIL_JOB = 'find-thumbnail'
|
||||
|
||||
/**
 * Downloads the image at `url` into memory.
 *
 * The request uses a 10 second timeout and rejects responses larger than
 * 20 MB. Any failure is logged and reported as `null` instead of throwing.
 *
 * NOTE(review): callers pass URLs taken from saved page content, so `url`
 * is effectively untrusted input; consider restricting the allowed
 * protocols/hosts before fetching — confirm against deployment setup.
 *
 * @param url - Address of the image to download.
 * @returns The axios response (body requested as an array buffer), or
 *   `null` if the request failed.
 */
const fetchImage = async (url: string): Promise<AxiosResponse | null> => {
  // use the shared logger (not console) so this line is consistent with the
  // rest of the job's logging
  logger.info('fetching image', { url })
  try {
    // get image file by url
    return await axios.get(url, {
      responseType: 'arraybuffer',
      timeout: 10000, // 10s
      maxContentLength: 20000000, // 20mb
    })
  } catch (e) {
    logger.error('fetch image error', e)
    return null
  }
}
|
||||
|
||||
/**
 * Fetches the image at `src` and reads its dimensions with `image-size`.
 *
 * @param src - URL of the image to measure.
 * @returns The source URL together with its width and height, or `null`
 *   when the image could not be fetched, when either dimension is missing
 *   or zero, or when measuring the image threw.
 */
const getImageSize = async (src: string): Promise<ImageSize | null> => {
  try {
    const response = await fetchImage(src)
    if (!response) {
      return null
    }

    // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
    const buffer = Buffer.from(response.data, 'binary')

    // get image size
    const { width, height } = sizeOf(buffer)
    if (!width || !height) {
      // no usable dimensions, so the image cannot be ranked
      return null
    }

    return { src, width, height }
  } catch (e) {
    // include the source URL so a failure can be traced to a specific image
    logger.error(`get image size error: ${src}`, e)
    return null
  }
}
|
||||
|
||||
/**
 * Parses `content` as HTML and measures every `<img>` element that carries
 * a `src` attribute.
 *
 * All images are requested concurrently. When the same `src` appears on
 * several elements it is downloaded only once and the result is shared, so
 * the returned array still has one entry per matched element, in the order
 * `querySelectorAll` returned them.
 *
 * @param content - HTML markup to scan for images.
 * @returns One entry per matched element: the measured size, or `null`
 *   when the element's `src` was empty or the image could not be measured.
 *   An empty array is returned when no element matched.
 */
export const fetchAllImageSizes = async (
  content: string
): Promise<(ImageSize | null)[]> => {
  const dom = parseHTML(content).document

  // fetch all images by src and get their sizes
  const images = Array.from(dom.querySelectorAll('img[src]'))
  if (images.length === 0) {
    logger.info('no images')
    return []
  }

  // one in-flight request per distinct src, so repeated images are not
  // downloaded again
  const pending = new Map<string, Promise<ImageSize | null>>()
  const measure = (src: string): Promise<ImageSize | null> => {
    let request = pending.get(src)
    if (!request) {
      request = getImageSize(src)
      pending.set(src, request)
    }
    return request
  }

  return Promise.all(
    images.map((image) => {
      const src = image.getAttribute('src')
      // an empty src attribute still matches `img[src]`; report it as null
      return src ? measure(src) : null
    })
  )
}
|
||||
|
||||
// credit: https://github.com/reddit-archive/reddit/blob/753b17407e9a9dca09558526805922de24133d53/r2/r2/lib/media.py#L706
/**
 * Chooses the thumbnail among measured images: the one with the largest
 * score, where the score starts as `width * height` and is reduced for
 * elongated images and for images whose `src` contains "sprite".
 *
 * With no `options` the scoring is: skip images whose area is below 5000,
 * divide the area by `ratio * 2` when the long/short side ratio exceeds
 * 1.5, and divide it by 10 for "sprite" sources. On equal scores the
 * earlier image wins.
 *
 * @param imagesSizes - Measured images; `null` entries are skipped.
 * @param options - Optional overrides for the scoring thresholds.
 * @param options.minArea - Images with a smaller area are ignored
 *   (default 5000).
 * @param options.maxAspectRatio - Long/short side ratio above which an
 *   image is penalized (default 1.5).
 * @param options.spritePenalty - Divisor applied to "sprite" images
 *   (default 10).
 * @returns The `src` of the winning image, or `''` when no image
 *   qualifies.
 */
export const _findThumbnail = (
  imagesSizes: readonly (ImageSize | null)[],
  options: {
    minArea?: number
    maxAspectRatio?: number
    spritePenalty?: number
  } = {}
): string => {
  const { minArea = 5000, maxAspectRatio = 1.5, spritePenalty = 10 } = options

  // find the largest and squarest image as the thumbnail
  let thumbnail = ''
  let largestArea = 0
  for (const imageSize of imagesSizes) {
    if (!imageSize) {
      continue
    }

    const { src, width, height } = imageSize
    let area = width * height

    // ignore small images
    if (area < minArea) {
      logger.info('ignore small', { src })
      continue
    }

    // penalize excessively long/wide images
    const ratio = Math.max(width, height) / Math.min(width, height)
    if (ratio > maxAspectRatio) {
      logger.info('penalizing long/wide', { src })
      area /= ratio * 2
    }

    // penalize images with "sprite" in their name
    if (src.toLowerCase().includes('sprite')) {
      logger.info('penalizing sprite', { src })
      area /= spritePenalty
    }

    // strict comparison: the first image with a given score is kept
    if (area > largestArea) {
      largestArea = area
      thumbnail = src
    }
  }

  return thumbnail
}
|
||||
|
||||
/**
 * Handler for the thumbnail job: makes sure a library item has a working
 * thumbnail, choosing one from the images in its readable content if needed.
 *
 * Steps:
 *  1. Load the item; give up when it does not exist.
 *  2. If a thumbnail is already set, fetch it; when that fails the thumbnail
 *     is treated as missing for the rest of this run.
 *  3. Fetch and measure every image in the item's readable content.
 *  4. When no usable thumbnail remains and images were found, pick one with
 *     `_findThumbnail` and store it on the item.
 *
 * @param data - Ids of the library item and of the user.
 * @returns `false` when the item was not found or no thumbnail could be
 *   chosen from its content; `true` otherwise.
 */
export const findThumbnail = async (data: Data) => {
  const { libraryItemId, userId } = data

  const item = await findLibraryItemById(libraryItemId, userId)
  if (!item) {
    logger.info('page not found')
    return false
  }

  if (item.thumbnail) {
    logger.info('thumbnail already set')
    // pre-cache thumbnail first if exists
    const existing = await fetchImage(item.thumbnail)
    if (!existing) {
      logger.info('thumbnail image not found')
      // forget the unreachable thumbnail so a replacement is searched below
      item.thumbnail = undefined
    }
  }

  logger.info('pre-caching all images...')
  // pre-cache all images in the content and get their sizes
  const imageSizes = await fetchAllImageSizes(item.readableContent)

  // nothing to do when a thumbnail is in place or there is no candidate
  const needsThumbnail = !item.thumbnail && imageSizes.length > 0
  if (!needsThumbnail) {
    return true
  }

  logger.info('finding thumbnail...')
  const candidate = _findThumbnail(imageSizes)
  if (!candidate) {
    logger.info('no thumbnail found from content')
    return false
  }

  // update page with thumbnail
  await updateLibraryItem(libraryItemId, { thumbnail: candidate }, userId)
  logger.info(`thumbnail updated: ${candidate}`)

  return true
}
|
||||
@ -2,17 +2,18 @@
|
||||
/* eslint-disable @typescript-eslint/restrict-template-expressions */
|
||||
/* eslint-disable @typescript-eslint/require-await */
|
||||
/* eslint-disable @typescript-eslint/no-misused-promises */
|
||||
import { Job, QueueEvents, Worker, Queue, JobType } from 'bullmq'
|
||||
import { Job, Queue, QueueEvents, Worker, JobType } from 'bullmq'
|
||||
import express, { Express } from 'express'
|
||||
import { SnakeNamingStrategy } from 'typeorm-naming-strategies'
|
||||
import { appDataSource } from './data_source'
|
||||
import { env } from './env'
|
||||
import { findThumbnail, THUMBNAIL_JOB } from './jobs/find_thumbnail'
|
||||
import { refreshAllFeeds } from './jobs/rss/refreshAllFeeds'
|
||||
import { refreshFeed } from './jobs/rss/refreshFeed'
|
||||
import { savePageJob } from './jobs/save_page'
|
||||
import { updatePDFContentJob } from './jobs/update_pdf_content'
|
||||
import { redisDataSource } from './redis_data_source'
|
||||
import { CustomTypeOrmLogger } from './utils/logger'
|
||||
import { updatePDFContentJob } from './jobs/update_pdf_content'
|
||||
|
||||
export const QUEUE_NAME = 'omnivore-backend-queue'
|
||||
|
||||
@ -119,8 +120,9 @@ const main = async () => {
|
||||
case 'update-pdf-content': {
|
||||
return updatePDFContentJob(job.data)
|
||||
}
|
||||
case THUMBNAIL_JOB:
|
||||
return findThumbnail(job.data)
|
||||
}
|
||||
return true
|
||||
},
|
||||
{
|
||||
connection: workerRedisClient,
|
||||
|
||||
@ -170,8 +170,8 @@ export const savePage = async (
|
||||
if (!isImported && !parseResult.parsedContent?.previewImage) {
|
||||
try {
|
||||
// create a task to update thumbnail and pre-cache all images
|
||||
const taskId = await enqueueThumbnailTask(user.id, slug)
|
||||
logger.info('Created thumbnail task', { taskId })
|
||||
const job = await enqueueThumbnailTask(user.id, clientRequestId)
|
||||
logger.info('Created thumbnail task', { job })
|
||||
} catch (e) {
|
||||
logger.error('Failed to create thumbnail task', e)
|
||||
}
|
||||
|
||||
@ -22,6 +22,8 @@ import { CreateTaskError } from './errors'
|
||||
import { stringToHash } from './helpers'
|
||||
import { logger } from './logger'
|
||||
import View = google.cloud.tasks.v2.Task.View
|
||||
import { getBackendQueue } from '../queue-processor'
|
||||
import { THUMBNAIL_JOB } from '../jobs/find_thumbnail'
|
||||
|
||||
// Instantiates a client.
|
||||
const client = new CloudTasksClient()
|
||||
@ -579,50 +581,19 @@ export const enqueueExportToIntegration = async (
|
||||
|
||||
export const enqueueThumbnailTask = async (
|
||||
userId: string,
|
||||
slug: string
|
||||
): Promise<string> => {
|
||||
const { GOOGLE_CLOUD_PROJECT } = process.env
|
||||
libraryItemId: string
|
||||
) => {
|
||||
const queue = await getBackendQueue()
|
||||
if (!queue) {
|
||||
return undefined
|
||||
}
|
||||
const payload = {
|
||||
userId,
|
||||
slug,
|
||||
libraryItemId,
|
||||
}
|
||||
|
||||
const headers = {
|
||||
Cookie: `auth=${generateVerificationToken({ id: userId })}`,
|
||||
}
|
||||
|
||||
// If there is no Google Cloud Project Id exposed, it means that we are in local environment
|
||||
if (env.dev.isLocal || !GOOGLE_CLOUD_PROJECT) {
|
||||
if (env.queue.thumbnailTaskHandlerUrl) {
|
||||
// Calling the handler function directly.
|
||||
setTimeout(() => {
|
||||
axios
|
||||
.post(env.queue.thumbnailTaskHandlerUrl, payload, {
|
||||
headers,
|
||||
})
|
||||
.catch((error) => {
|
||||
logError(error)
|
||||
})
|
||||
}, 0)
|
||||
}
|
||||
return ''
|
||||
}
|
||||
|
||||
const createdTasks = await createHttpTaskWithToken({
|
||||
payload,
|
||||
taskHandlerUrl: env.queue.thumbnailTaskHandlerUrl,
|
||||
requestHeaders: headers,
|
||||
queue: 'omnivore-thumbnail-queue',
|
||||
return queue.add(THUMBNAIL_JOB, payload, {
|
||||
priority: 100,
|
||||
})
|
||||
|
||||
if (!createdTasks || !createdTasks[0].name) {
|
||||
logger.error(`Unable to get the name of the task`, {
|
||||
payload,
|
||||
createdTasks,
|
||||
})
|
||||
throw new CreateTaskError(`Unable to get the name of the task`)
|
||||
}
|
||||
return createdTasks[0].name
|
||||
}
|
||||
|
||||
export interface RssSubscriptionGroup {
|
||||
|
||||
Reference in New Issue
Block a user