From 94dd4be65909a1a8f20d71981aa4359d35d350aa Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 23 Jan 2024 16:47:42 +0800 Subject: [PATCH 1/5] fix: page content not saved when title is empty but content is not --- packages/puppeteer-parse/src/index.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index bfe7fde72..2629689c2 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -154,7 +154,7 @@ export const fetchContent = async ( let context: BrowserContext | undefined, page: Page | undefined, - title = '', + title: string | undefined, content: string | undefined, contentType: string | undefined, readabilityResult: Readability.ParseResult | null | undefined @@ -214,7 +214,7 @@ export const fetchContent = async ( const sbResult = await fetchContentWithScrapingBee(url) title = sbResult.title content = sbResult.domContent - } else if (result.title && result.domContent) { + } else { title = result.title content = result.domContent } From 1332cda356b7a4f51aae15314ed669e74e7895d8 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 23 Jan 2024 16:48:57 +0800 Subject: [PATCH 2/5] give highest priority to save-page job for adding items by links --- packages/content-fetch/src/job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/content-fetch/src/job.ts b/packages/content-fetch/src/job.ts index 3d3946b18..82529735c 100644 --- a/packages/content-fetch/src/job.ts +++ b/packages/content-fetch/src/job.ts @@ -29,7 +29,7 @@ const getPriority = (job: savePageJob): number => { return 100 } - return 5 + return 1 } const getAttempts = (job: savePageJob): number => { From fd0bb37d3a3d0a1200b353928a59e03cb786d37e Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 23 Jan 2024 16:54:35 +0800 Subject: [PATCH 3/5] make title optional for cache --- packages/content-fetch/src/request_handler.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/content-fetch/src/request_handler.ts b/packages/content-fetch/src/request_handler.ts index 9672344d3..0106d8058 100644 --- a/packages/content-fetch/src/request_handler.ts +++ b/packages/content-fetch/src/request_handler.ts @@ -47,7 +47,7 @@ interface LogRecord { interface FetchResult { finalUrl: string - title: string + title?: string content?: string contentType?: string readabilityResult?: unknown From 567ce172e6a77d4d9b7ca4aad1217b59a8ca6386 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 23 Jan 2024 16:55:17 +0800 Subject: [PATCH 4/5] make title optional for cache in save-page job --- packages/api/src/jobs/save_page.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/api/src/jobs/save_page.ts b/packages/api/src/jobs/save_page.ts index d48cd9f1c..9eccf5efb 100644 --- a/packages/api/src/jobs/save_page.ts +++ b/packages/api/src/jobs/save_page.ts @@ -70,7 +70,7 @@ interface SavePageResponse { interface FetchResult { finalUrl: string - title: string + title?: string content?: string contentType?: string readabilityResult?: Readability.ParseResult From f4a2edf8f4193a8ae71ee0504693d5c3672e7b44 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Tue, 23 Jan 2024 16:58:08 +0800 Subject: [PATCH 5/5] show detailed logs --- packages/content-fetch/src/job.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/packages/content-fetch/src/job.ts b/packages/content-fetch/src/job.ts index 82529735c..ba70478af 100644 --- a/packages/content-fetch/src/job.ts +++ b/packages/content-fetch/src/job.ts @@ -57,7 +57,7 @@ export const queueSavePageJob = async (savePageJobs: savePageJob[]) => { data: job.data, opts: getOpts(job), })) - console.log('queue save page jobs:', { jobs }) + console.log('queue save page jobs:', JSON.stringify(jobs, null, 2)) return queue.addBulk(jobs) }