From 29a5b20d2cbe24788d95624164c79c41d12bca41 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 24 Jul 2024 12:17:13 +0800 Subject: [PATCH 01/10] remove scrapingbee from content-fetch --- packages/content-fetch/src/job.ts | 5 +- packages/puppeteer-parse/package.json | 2 - packages/puppeteer-parse/src/index.ts | 101 ++++++-------------------- 3 files changed, 26 insertions(+), 82 deletions(-) diff --git a/packages/content-fetch/src/job.ts b/packages/content-fetch/src/job.ts index 68e1d3cd1..ae282bfb7 100644 --- a/packages/content-fetch/src/job.ts +++ b/packages/content-fetch/src/job.ts @@ -77,7 +77,10 @@ export const queueSavePageJob = async ( data: job.data, opts: getOpts(job), })) - console.log('queue save page jobs:', JSON.stringify(jobs, null, 2)) + console.log( + 'queue save page jobs:', + jobs.map((job) => job.data.finalUrl) + ) const queue = new Queue(QUEUE_NAME, { connection: redisDataSource.queueRedisClient, diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 14637c232..242602c51 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -10,10 +10,8 @@ "dependencies": { "@omnivore/content-handler": "1.0.0", "@omnivore/readability": "1.0.0", - "axios": "^1.4.0", "crypto": "^1.0.1", "dompurify": "^2.4.1", - "linkedom": "^0.14.9", "puppeteer-core": "^22.12.1", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-adblocker": "^2.13.6", diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index 78031e58c..c0993cd56 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -1,8 +1,6 @@ /* eslint-disable @typescript-eslint/no-unsafe-member-access */ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ import { preHandleContent } from '@omnivore/content-handler' -import axios from 'axios' -import { parseHTML } from 'linkedom' import path from 'path' import { BrowserContext, Page, Protocol } from 'puppeteer-core' import { getBrowser } from './browser' @@ -16,22 +14,6 @@ const ALLOWED_CONTENT_TYPES = [ 'application/pdf', ] -const fetchContentWithScrapingBee = async (url: string) => { - const response = await axios.get('https://app.scrapingbee.com/api/v1', { - params: { - api_key: process.env.SCRAPINGBEE_API_KEY, - url: url, - render_js: 'false', - premium_proxy: 'true', - country_code: 'us', - }, - timeout: 10_000, - }) - - const dom = parseHTML(response.data).document - return { title: dom.title, domContent: dom.documentElement.outerHTML, url } -} - const enableJavascriptForUrl = (url: string) => { try { const u = new URL(url) @@ -60,39 +42,28 @@ export const fetchContent = async ( } console.log(`content-fetch request`, logRecord) - let page: Page | undefined, - title: string | undefined, + let title: string | undefined, content: string | undefined, contentType: string | undefined, context: BrowserContext | undefined try { url = getUrl(url) - if (!url) { - throw new Error('Valid URL to parse not specified') - } // pre handle url with custom handlers try { const result = await preHandleContent(url) - if (result && result.url) { - validateUrlString(url) - url = result.url - } - if (result && result.title) { - title = result.title - } - if (result && result.content) { - content = result.content - } - if (result && result.contentType) { - contentType = result.contentType + if (result?.url) { + url = getUrl(result.url) } + title = result?.title + content = result?.content + contentType = result?.contentType } catch (e) { - console.info('error with handler: ', e) + console.error('error with handler: ', e) } - if ((!content || !title) && contentType !== 'application/pdf') { + if (contentType !== 'application/pdf' && (!content || !title)) { const result = await retrievePage( url, logRecord, @@ -100,59 +71,27 @@ export const fetchContent = async ( locale, timezone ) - if (result && result.context) { - context = result.context - } - if (result && result.page) { - page = result.page - } - if (result && result.finalUrl) { - url = result.finalUrl - } - if (result && result.contentType) { - contentType = result.contentType - } - } + context = result.context + url = result.finalUrl + contentType = result.contentType - if (contentType !== 'application/pdf') { - if (page && (!content || !title)) { + const page = result.page + if (page) { const result = await retrieveHtml(page, logRecord) - if (result.isBlocked) { - const sbResult = await fetchContentWithScrapingBee(url) - title = sbResult.title - content = sbResult.domContent - } else { - title = result.title - content = result.domContent - } - } else { - console.info('using prefetched content and title') + title = result.title + content = result.domContent } } } catch (e) { console.error(`Error while retrieving page ${url}`, e) - // fallback to scrapingbee for non pdf content - if (url && contentType !== 'application/pdf') { - console.info('fallback to scrapingbee', url) - - const sbResult = await fetchContentWithScrapingBee(url) - - return { - finalUrl: url, - title: sbResult.title, - content: sbResult.domContent, - contentType, - } - } - throw e } finally { // close browser context if it was created if (context) { - console.info('closing page...', url) + console.info('closing context...', url) await context.close() - console.info('page closed', url) + console.info('context closed', url) } console.info(`content-fetch result`, logRecord) @@ -519,8 +458,12 @@ async function retrieveHtml(page: Page, logRecord: Record) { throw e } + if (domContent === 'IS_BLOCKED') { - return { isBlocked: true } + logRecord.blockedByClient = true + + throw new Error('Page is blocked') } + return { domContent, title } } From 066883a84d2eb49c031c1ffd5f84502930cb41ff Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 24 Jul 2024 12:51:25 +0800 Subject: [PATCH 02/10] remove unused dependencies --- packages/puppeteer-parse/package.json | 8 +- packages/puppeteer-parse/src/index.ts | 1 - packages/puppeteer-parse/src/readability.d.ts | 173 ------------------ yarn.lock | 24 --- 4 files changed, 1 insertion(+), 205 deletions(-) delete mode 100644 packages/puppeteer-parse/src/readability.d.ts diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 242602c51..0683a93ee 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -9,18 +9,12 @@ ], "dependencies": { "@omnivore/content-handler": "1.0.0", - "@omnivore/readability": "1.0.0", - "crypto": "^1.0.1", - "dompurify": "^2.4.1", "puppeteer-core": "^22.12.1", "puppeteer-extra": "^3.3.6", "puppeteer-extra-plugin-adblocker": "^2.13.6", - "puppeteer-extra-plugin-stealth": "^2.11.2", - "urlsafe-base64": "^1.0.0" + "puppeteer-extra-plugin-stealth": "^2.11.2" }, "devDependencies": { - "@types/dompurify": "^3.0.5", - "@types/urlsafe-base64": "^1.0.31", "chai": "^4.3.6", "mocha": "^10.0.0" }, diff --git a/packages/puppeteer-parse/src/index.ts b/packages/puppeteer-parse/src/index.ts index c0993cd56..853c9a6ef 100644 --- a/packages/puppeteer-parse/src/index.ts +++ b/packages/puppeteer-parse/src/index.ts @@ -1,4 +1,3 @@ -/* eslint-disable @typescript-eslint/no-unsafe-member-access */ /* eslint-disable @typescript-eslint/no-unsafe-assignment */ import { preHandleContent } from '@omnivore/content-handler' import path from 'path' diff --git a/packages/puppeteer-parse/src/readability.d.ts b/packages/puppeteer-parse/src/readability.d.ts deleted file mode 100644 index 4722588cd..000000000 --- a/packages/puppeteer-parse/src/readability.d.ts +++ /dev/null @@ -1,173 +0,0 @@ -// Type definitions for non-npm package mozilla-readability 0.2 -// Project: https://github.com/mozilla/readability -// Definitions by: Charles Vandevoorde , Alex Wendland -// Definitions: https://github.com/DefinitelyTyped/DefinitelyTyped -// TypeScript Version: 2.2 - -declare module '@omnivore/readability' { - /** - * A standalone version of the readability library used for Firefox Reader View. - * - * Note that isProbablyReaderable() was moved into a separate file in https://github.com/mozilla/readability/commit/2620542dd1e8380220d82afa97a2c283ae636e40 - * and therefore is no longer part of the Readability class. - */ - class Readability { - /** - * ## Usage on the web - * - * To parse a document, you must create a new Readability object from a - * DOM document object, and then call parse(). Here's an example: - * - * ```js - * var article = new Readability(document).parse(); - * ``` - * - * If you're using Readability on the web, you will likely be able to - * use a document reference from elsewhere (e.g. fetched via XMLHttpRequest, - * in a same-origin