From 0261cafb7afa1531bbc07efa85a0c0ccfeb2731a Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Wed, 16 Nov 2022 10:37:14 +0800 Subject: [PATCH] Add puppeteer-extra plugins --- packages/puppeteer-parse/index.js | 13 +++++++++---- packages/puppeteer-parse/package.json | 3 +++ 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index cd72936af..a78e3033e 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -15,12 +15,17 @@ const { DateTime } = require('luxon'); const os = require('os'); const { Storage } = require('@google-cloud/storage'); const { parseHTML } = require('linkedom'); -const puppeteer = require('puppeteer-core'); const { preHandleContent } = require("@omnivore/content-handler"); +const puppeteer = require('puppeteer-extra'); + // Add stealth plugin to hide puppeteer usage -// const StealthPlugin = require('puppeteer-extra-plugin-stealth'); -// puppeteer.use(StealthPlugin()); +const StealthPlugin = require('puppeteer-extra-plugin-stealth'); +puppeteer.use(StealthPlugin()); + +// Add adblocker plugin to block all ads and trackers (saves bandwidth) +const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker') +puppeteer.use(AdblockerPlugin({ blockTrackers: true })); const storage = new Storage(); const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? 
process.env.ALLOWED_ORIGINS.split(',') : []; @@ -549,7 +554,7 @@ async function retrievePage(url) { }); try { - const response = await page.goto(url, { timeout: 8 * 1000, waitUntil: ['networkidle2'] }); + const response = await page.goto(url, { timeout: 30 * 1000, waitUntil: ['networkidle2'] }); const finalUrl = response.url(); const contentType = response.headers()['content-type']; diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index a9537f6f2..41378e1bf 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -12,6 +12,9 @@ "linkedom": "^0.14.9", "luxon": "^2.3.1", "puppeteer-core": "^16.1.0", + "puppeteer-extra": "^3.3.4", + "puppeteer-extra-plugin-adblocker": "^2.13.5", + "puppeteer-extra-plugin-stealth": "^2.11.1", "underscore": "^1.13.4", "winston": "^3.3.3" },