create a job for finding thumbnail

This commit is contained in:
Hongbo Wu
2024-01-25 11:18:18 +08:00
parent f3c1b9d9a0
commit e54c691c09
5 changed files with 193 additions and 45 deletions

View File

@ -71,6 +71,7 @@
"graphql-shield": "^7.5.0",
"highlightjs": "^9.16.2",
"html-entities": "^2.3.2",
"image-size": "^1.0.2",
"intercom-client": "^3.1.4",
"ioredis": "^5.3.2",
"jsonwebtoken": "^8.5.1",

View File

@ -0,0 +1,174 @@
import axios, { AxiosResponse } from 'axios'
import sizeOf from 'image-size'
import { parseHTML } from 'linkedom'
import {
findLibraryItemById,
updateLibraryItem,
} from '../services/library_item'
import { logger } from '../utils/logger'
/** Input accepted by the find-thumbnail job handler (`findThumbnail`). */
interface Data {
// id of the library item to look up and, when a thumbnail is found, update
libraryItemId: string
// id of the user; passed together with the item id to the library item lookup and update calls
userId: string
}
/** Dimensions measured for a single image, keyed by the source it was fetched from. */
interface ImageSize {
// the src string the image was fetched from (the value handed to getImageSize)
src: string
// width as reported by image-size; presumably pixels — confirm against the library docs
width: number
// height as reported by image-size; presumably pixels — confirm against the library docs
height: number
}
// Name of the queue job that is handled by findThumbnail.
export const THUMBNAIL_JOB = 'find-thumbnail'
/**
 * Downloads the image at `url` as raw bytes.
 *
 * The request is limited to 10 seconds and a 20 MB response body. Failures
 * (network error, timeout, oversized body, non-2xx status rejected by axios)
 * are logged and reported as `null` instead of being thrown, so callers can
 * treat a missing image as a normal outcome.
 *
 * @param url - location of the image to download
 * @returns the axios response with the body requested as an array buffer,
 *   or `null` when the request fails
 */
const fetchImage = async (url: string): Promise<AxiosResponse | null> => {
  // go through the shared logger (not console) so this message is handled
  // the same way as every other log line emitted by this job
  logger.info('fetching image', { url })
  try {
    // get image file by url
    return await axios.get(url, {
      responseType: 'arraybuffer',
      timeout: 10000, // 10s
      maxContentLength: 20000000, // 20mb
    })
  } catch (e) {
    logger.error('fetch image error', e)
    return null
  }
}
/**
 * Fetches the image at `src` and measures it with image-size.
 *
 * @param src - source string of the image to fetch and measure
 * @returns the source together with its width and height, or `null` when the
 *   image cannot be fetched, cannot be measured, or reports a missing/zero
 *   dimension
 */
const getImageSize = async (src: string): Promise<ImageSize | null> => {
  try {
    const fetched = await fetchImage(src)
    if (!fetched) {
      return null
    }

    // eslint-disable-next-line @typescript-eslint/no-unsafe-argument
    const bytes = Buffer.from(fetched.data, 'binary')

    // a falsy width or height means there is nothing usable to report
    const { width: w, height: h } = sizeOf(bytes)
    return w && h ? { src, width: w, height: h } : null
  } catch (e) {
    // measuring can throw on unsupported or corrupt data; report as "no size"
    logger.error(e)
    return null
  }
}
/**
 * Fetches every `<img>` that carries a `src` attribute in the given HTML and
 * measures each one.
 *
 * All lookups are started together and awaited as a group.
 *
 * @param content - HTML markup to scan for images
 * @returns one entry per matched image, in document order; an entry is `null`
 *   when the image has an empty `src` or could not be fetched/measured.
 *   Returns an empty array when the markup contains no matching images.
 */
export const fetchAllImageSizes = async (content: string) => {
  const { document: doc } = parseHTML(content)

  // only images that actually declare a src are worth fetching
  const imgElements = doc.querySelectorAll('img[src]')
  if (!imgElements || imgElements.length === 0) {
    console.log('no images')
    return []
  }

  const lookups = Array.from(imgElements, (img) => {
    const src = img.getAttribute('src')
    // an empty src attribute still matches the selector; keep its slot as null
    return src ? getImageSize(src) : null
  })

  return Promise.all(lookups)
}
// credit: https://github.com/reddit-archive/reddit/blob/753b17407e9a9dca09558526805922de24133d53/r2/r2/lib/media.py#L706
/**
 * Picks the best thumbnail candidate from a list of measured images.
 *
 * Each image is scored by its area (width * height), then:
 *  - images with an area below 5000 are skipped entirely;
 *  - images whose long side is more than 1.5x the short side have their score
 *    divided by twice that ratio;
 *  - images whose source contains "sprite" (case-insensitive) have their
 *    score divided by 10.
 * The highest score wins; on a tie the earliest image is kept.
 *
 * @param imagesSizes - measured images; `null` entries are ignored
 * @returns the source of the winning image, or an empty string when no image
 *   qualifies
 */
export const _findThumbnail = (imagesSizes: (ImageSize | null)[]) => {
  const winner = { src: '', score: 0 }

  for (const candidate of imagesSizes) {
    if (!candidate) {
      continue
    }

    const { src, width, height } = candidate
    let score = width * height

    // too small to be a meaningful thumbnail
    if (score < 5000) {
      logger.info('ignore small', { src })
      continue
    }

    // banners and skyscrapers lose out to squarer images
    const longSide = Math.max(width, height)
    const shortSide = Math.min(width, height)
    const aspect = longSide / shortSide
    if (aspect > 1.5) {
      logger.info('penalizing long/wide', { src })
      score /= aspect * 2
    }

    // sprite sheets are large but rarely depict the content
    if (src.toLowerCase().includes('sprite')) {
      logger.info('penalizing sprite', { src })
      score /= 10
    }

    // strict comparison keeps the first image when scores are equal
    if (score > winner.score) {
      winner.score = score
      winner.src = src
    }
  }

  return winner.src
}
/**
 * Job handler: fetches the images of a library item and, when the item has no
 * usable thumbnail, picks one from its content and saves it.
 *
 * Steps:
 *  1. Look up the item; stop if it does not exist.
 *  2. If the item already has a thumbnail, try to fetch it; when the fetch
 *     fails the thumbnail is treated as unset for the rest of the run.
 *  3. Fetch and measure every image in the item's readable content.
 *  4. If the item has no thumbnail and at least one image entry came back,
 *     choose the best candidate and store it on the item.
 *
 * @param data - ids of the library item and of the user
 * @returns `false` when the item is not found or no image in the content
 *   qualifies as a thumbnail; `true` otherwise (including when there was
 *   nothing to do)
 */
export const findThumbnail = async (data: Data) => {
  const { libraryItemId, userId } = data

  const item = await findLibraryItemById(libraryItemId, userId)
  if (!item) {
    logger.info('page not found')
    return false
  }

  const existingThumbnail = item.thumbnail
  if (existingThumbnail) {
    logger.info('thumbnail already set')
    // pre-cache thumbnail first if exists
    const fetched = await fetchImage(existingThumbnail)
    if (!fetched) {
      // unreachable thumbnail: fall through and look for a replacement
      logger.info('thumbnail image not found')
      item.thumbnail = undefined
    }
  }

  logger.info('pre-caching all images...')
  // pre-cache all images in the content and get their sizes
  const imageSizes = await fetchAllImageSizes(item.readableContent)

  // nothing to do when a thumbnail is in place or the content has no images
  const needsThumbnail = !item.thumbnail && imageSizes.length > 0
  if (!needsThumbnail) {
    return true
  }

  logger.info('finding thumbnail...')
  const picked = _findThumbnail(imageSizes)
  if (!picked) {
    logger.info('no thumbnail found from content')
    return false
  }

  // update page with thumbnail
  await updateLibraryItem(libraryItemId, { thumbnail: picked }, userId)
  logger.info(`thumbnail updated: ${picked}`)

  return true
}

View File

@ -2,17 +2,18 @@
/* eslint-disable @typescript-eslint/restrict-template-expressions */
/* eslint-disable @typescript-eslint/require-await */
/* eslint-disable @typescript-eslint/no-misused-promises */
import { Job, QueueEvents, Worker, Queue, JobType } from 'bullmq'
import { Job, Queue, QueueEvents, Worker, JobType } from 'bullmq'
import express, { Express } from 'express'
import { SnakeNamingStrategy } from 'typeorm-naming-strategies'
import { appDataSource } from './data_source'
import { env } from './env'
import { findThumbnail, THUMBNAIL_JOB } from './jobs/find_thumbnail'
import { refreshAllFeeds } from './jobs/rss/refreshAllFeeds'
import { refreshFeed } from './jobs/rss/refreshFeed'
import { savePageJob } from './jobs/save_page'
import { updatePDFContentJob } from './jobs/update_pdf_content'
import { redisDataSource } from './redis_data_source'
import { CustomTypeOrmLogger } from './utils/logger'
import { updatePDFContentJob } from './jobs/update_pdf_content'
export const QUEUE_NAME = 'omnivore-backend-queue'
@ -119,8 +120,9 @@ const main = async () => {
case 'update-pdf-content': {
return updatePDFContentJob(job.data)
}
case THUMBNAIL_JOB:
return findThumbnail(job.data)
}
return true
},
{
connection: workerRedisClient,

View File

@ -170,8 +170,8 @@ export const savePage = async (
if (!isImported && !parseResult.parsedContent?.previewImage) {
try {
// create a task to update thumbnail and pre-cache all images
const taskId = await enqueueThumbnailTask(user.id, slug)
logger.info('Created thumbnail task', { taskId })
const job = await enqueueThumbnailTask(user.id, clientRequestId)
logger.info('Created thumbnail task', { job })
} catch (e) {
logger.error('Failed to create thumbnail task', e)
}

View File

@ -22,6 +22,8 @@ import { CreateTaskError } from './errors'
import { stringToHash } from './helpers'
import { logger } from './logger'
import View = google.cloud.tasks.v2.Task.View
import { getBackendQueue } from '../queue-processor'
import { THUMBNAIL_JOB } from '../jobs/find_thumbnail'
// Instantiates a client.
const client = new CloudTasksClient()
@ -579,50 +581,19 @@ export const enqueueExportToIntegration = async (
export const enqueueThumbnailTask = async (
userId: string,
slug: string
): Promise<string> => {
const { GOOGLE_CLOUD_PROJECT } = process.env
libraryItemId: string
) => {
const queue = await getBackendQueue()
if (!queue) {
return undefined
}
const payload = {
userId,
slug,
libraryItemId,
}
const headers = {
Cookie: `auth=${generateVerificationToken({ id: userId })}`,
}
// If there is no Google Cloud Project Id exposed, it means that we are in local environment
if (env.dev.isLocal || !GOOGLE_CLOUD_PROJECT) {
if (env.queue.thumbnailTaskHandlerUrl) {
// Calling the handler function directly.
setTimeout(() => {
axios
.post(env.queue.thumbnailTaskHandlerUrl, payload, {
headers,
})
.catch((error) => {
logError(error)
})
}, 0)
}
return ''
}
const createdTasks = await createHttpTaskWithToken({
payload,
taskHandlerUrl: env.queue.thumbnailTaskHandlerUrl,
requestHeaders: headers,
queue: 'omnivore-thumbnail-queue',
return queue.add(THUMBNAIL_JOB, payload, {
priority: 100,
})
if (!createdTasks || !createdTasks[0].name) {
logger.error(`Unable to get the name of the task`, {
payload,
createdTasks,
})
throw new CreateTaskError(`Unable to get the name of the task`)
}
return createdTasks[0].name
}
export interface RssSubscriptionGroup {