From e54c691c09d2445d0cab12ef822030811decbf59 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Thu, 25 Jan 2024 11:18:18 +0800 Subject: [PATCH] create a job for finding thumbnail --- packages/api/package.json | 1 + packages/api/src/jobs/find_thumbnail.ts | 174 ++++++++++++++++++++++++ packages/api/src/queue-processor.ts | 8 +- packages/api/src/services/save_page.ts | 4 +- packages/api/src/utils/createTask.ts | 51 ++----- 5 files changed, 193 insertions(+), 45 deletions(-) create mode 100644 packages/api/src/jobs/find_thumbnail.ts diff --git a/packages/api/package.json b/packages/api/package.json index 7d5420686..0f1b60eba 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -71,6 +71,7 @@ "graphql-shield": "^7.5.0", "highlightjs": "^9.16.2", "html-entities": "^2.3.2", + "image-size": "^1.0.2", "intercom-client": "^3.1.4", "ioredis": "^5.3.2", "jsonwebtoken": "^8.5.1", diff --git a/packages/api/src/jobs/find_thumbnail.ts b/packages/api/src/jobs/find_thumbnail.ts new file mode 100644 index 000000000..06ed28a3e --- /dev/null +++ b/packages/api/src/jobs/find_thumbnail.ts @@ -0,0 +1,174 @@ +import axios, { AxiosResponse } from 'axios' +import sizeOf from 'image-size' +import { parseHTML } from 'linkedom' +import { + findLibraryItemById, + updateLibraryItem, +} from '../services/library_item' +import { logger } from '../utils/logger' + +interface Data { + libraryItemId: string + userId: string +} + +interface ImageSize { + src: string + width: number + height: number +} + +export const THUMBNAIL_JOB = 'find-thumbnail' + +const fetchImage = async (url: string): Promise => { + console.log('fetching image', url) + try { + // get image file by url + return await axios.get(url, { + responseType: 'arraybuffer', + timeout: 10000, // 10s + maxContentLength: 20000000, // 20mb + }) + } catch (e) { + logger.error('fetch image error', e) + return null + } +} + +const getImageSize = async (src: string): Promise => { + try { + const response = await fetchImage(src) + if (!response) { + return null + } + + // eslint-disable-next-line @typescript-eslint/no-unsafe-argument + const buffer = Buffer.from(response.data, 'binary') + + // get image size + const { width, height } = sizeOf(buffer) + + if (!width || !height) { + return null + } + + return { + src, + width, + height, + } + } catch (e) { + logger.error(e) + return null + } +} + +export const fetchAllImageSizes = async (content: string) => { + const dom = parseHTML(content).document + + // fetch all images by src and get their sizes + const images = dom.querySelectorAll('img[src]') + if (!images || images.length === 0) { + console.log('no images') + return [] + } + + return Promise.all( + Array.from(images).map((image) => { + const src = image.getAttribute('src') + if (!src) { + return null + } + + return getImageSize(src) + }) + ) +} + +// credit: https://github.com/reddit-archive/reddit/blob/753b17407e9a9dca09558526805922de24133d53/r2/r2/lib/media.py#L706 +export const _findThumbnail = (imagesSizes: (ImageSize | null)[]) => { + // find the largest and squarest image as the thumbnail + let thumbnail = '' + let largestArea = 0 + for (const imageSize of Array.from(imagesSizes)) { + if (!imageSize) { + continue + } + + let area = imageSize.width * imageSize.height + + // ignore small images + if (area < 5000) { + logger.info('ignore small', { src: imageSize.src }) + continue + } + + // penalize excessively long/wide images + const ratio = + Math.max(imageSize.width, imageSize.height) / + Math.min(imageSize.width, imageSize.height) + if (ratio > 1.5) { + logger.info('penalizing long/wide', { src: imageSize.src }) + area /= ratio * 2 + } + + // penalize images with "sprite" in their name + if (imageSize.src.toLowerCase().includes('sprite')) { + logger.info('penalizing sprite', { src: imageSize.src }) + area /= 10 + } + + if (area > largestArea) { + largestArea = area + thumbnail = imageSize.src + } + } + + return thumbnail +} + +export const findThumbnail = async (data: Data) => { + const { libraryItemId, userId } = data + + const item = await findLibraryItemById(libraryItemId, userId) + if (!item) { + logger.info('page not found') + return false + } + + const thumbnail = item.thumbnail + if (thumbnail) { + logger.info('thumbnail already set') + // pre-cache thumbnail first if exists + const image = await fetchImage(thumbnail) + if (!image) { + logger.info('thumbnail image not found') + item.thumbnail = undefined + } + } + + logger.info('pre-caching all images...') + // pre-cache all images in the content and get their sizes + const imageSizes = await fetchAllImageSizes(item.readableContent) + // find thumbnail from all images if thumbnail not set + if (!item.thumbnail && imageSizes.length > 0) { + logger.info('finding thumbnail...') + const thumbnail = _findThumbnail(imageSizes) + if (!thumbnail) { + logger.info('no thumbnail found from content') + return false + } + + // update page with thumbnail + await updateLibraryItem( + libraryItemId, + { + thumbnail, + }, + userId + ) + logger.info(`thumbnail updated: ${thumbnail}`) + } + + return true +} diff --git a/packages/api/src/queue-processor.ts b/packages/api/src/queue-processor.ts index 233bf36e7..2810ac6d6 100644 --- a/packages/api/src/queue-processor.ts +++ b/packages/api/src/queue-processor.ts @@ -2,17 +2,18 @@ /* eslint-disable @typescript-eslint/restrict-template-expressions */ /* eslint-disable @typescript-eslint/require-await */ /* eslint-disable @typescript-eslint/no-misused-promises */ -import { Job, QueueEvents, Worker, Queue, JobType } from 'bullmq' +import { Job, Queue, QueueEvents, Worker, JobType } from 'bullmq' import express, { Express } from 'express' import { SnakeNamingStrategy } from 'typeorm-naming-strategies' import { appDataSource } from './data_source' import { env } from './env' +import { findThumbnail, THUMBNAIL_JOB } from './jobs/find_thumbnail' import { refreshAllFeeds } from './jobs/rss/refreshAllFeeds' import { refreshFeed } from './jobs/rss/refreshFeed' import { savePageJob } from './jobs/save_page' +import { updatePDFContentJob } from './jobs/update_pdf_content' import { redisDataSource } from './redis_data_source' import { CustomTypeOrmLogger } from './utils/logger' -import { updatePDFContentJob } from './jobs/update_pdf_content' export const QUEUE_NAME = 'omnivore-backend-queue' @@ -119,8 +120,9 @@ const main = async () => { case 'update-pdf-content': { return updatePDFContentJob(job.data) } + case THUMBNAIL_JOB: + return findThumbnail(job.data) } - return true }, { connection: workerRedisClient, diff --git a/packages/api/src/services/save_page.ts b/packages/api/src/services/save_page.ts index 7931fc48e..7cd7b1768 100644 --- a/packages/api/src/services/save_page.ts +++ b/packages/api/src/services/save_page.ts @@ -170,8 +170,8 @@ export const savePage = async ( if (!isImported && !parseResult.parsedContent?.previewImage) { try { // create a task to update thumbnail and pre-cache all images - const taskId = await enqueueThumbnailTask(user.id, slug) - logger.info('Created thumbnail task', { taskId }) + const job = await enqueueThumbnailTask(user.id, clientRequestId) + logger.info('Created thumbnail task', { job }) } catch (e) { logger.error('Failed to create thumbnail task', e) } diff --git a/packages/api/src/utils/createTask.ts b/packages/api/src/utils/createTask.ts index 09c84b62d..88eb3a2d5 100644 --- a/packages/api/src/utils/createTask.ts +++ b/packages/api/src/utils/createTask.ts @@ -22,6 +22,8 @@ import { CreateTaskError } from './errors' import { stringToHash } from './helpers' import { logger } from './logger' import View = google.cloud.tasks.v2.Task.View +import { getBackendQueue } from '../queue-processor' +import { THUMBNAIL_JOB } from '../jobs/find_thumbnail' // Instantiates a client. const client = new CloudTasksClient() @@ -579,50 +581,19 @@ export const enqueueExportToIntegration = async ( export const enqueueThumbnailTask = async ( userId: string, - slug: string -): Promise => { - const { GOOGLE_CLOUD_PROJECT } = process.env + libraryItemId: string +) => { + const queue = await getBackendQueue() + if (!queue) { + return undefined + } const payload = { userId, - slug, + libraryItemId, } - - const headers = { - Cookie: `auth=${generateVerificationToken({ id: userId })}`, - } - - // If there is no Google Cloud Project Id exposed, it means that we are in local environment - if (env.dev.isLocal || !GOOGLE_CLOUD_PROJECT) { - if (env.queue.thumbnailTaskHandlerUrl) { - // Calling the handler function directly. - setTimeout(() => { - axios - .post(env.queue.thumbnailTaskHandlerUrl, payload, { - headers, - }) - .catch((error) => { - logError(error) - }) - }, 0) - } - return '' - } - - const createdTasks = await createHttpTaskWithToken({ - payload, - taskHandlerUrl: env.queue.thumbnailTaskHandlerUrl, - requestHeaders: headers, - queue: 'omnivore-thumbnail-queue', + return queue.add(THUMBNAIL_JOB, payload, { + priority: 100, }) - - if (!createdTasks || !createdTasks[0].name) { - logger.error(`Unable to get the name of the task`, { - payload, - createdTasks, - }) - throw new CreateTaskError(`Unable to get the name of the task`) - } - return createdTasks[0].name } export interface RssSubscriptionGroup {