// Review notes (patch preamble: this text sits before the first "diff --git" header and is not part of any hunk).
// Purpose: stop registering puppeteer-extra-plugin-adblocker in packages/content-fetch/fetch-content.js and
// packages/puppeteer-parse/index.js, drop that dependency from packages/puppeteer-parse/package.json, and block
// unwanted requests explicitly instead.
// - Both JS files gain the same blockResources(page) helper. It sends the 'Network.setBlockedURLs' command through
//   page._client with a fixed list: asset patterns (favicon.ico, .css, .jpg, .jpeg, .png, .svg, .woff) followed by
//   analytics/ad hosts.
// - retrievePage() in both files now awaits blockResources(page) before page.setRequestInterception(true).
// - fetch-content.js: the previously commented-out 'request' handler is restored as live code.
//   index.js: the MathJax comment, setRequestInterception(true), the requestCount counter and the same handler are added.
//   The handler aborts 'font' and 'image' requests, aborts when `requestCount++ > 100` is true, aborts 'script'
//   requests whose lowercased URL contains 'mathjax', and continues every other request.
// NOTE(review): page._client is underscore-prefixed and looks like a non-public Puppeteer member - confirm it is
//   available on the puppeteer-core version in use (package.json context shows ^13.7.0) or switch to a public CDP session API.
// NOTE(review): blockResources is duplicated verbatim in two packages; consider sharing one implementation.
// NOTE(review): `requestCount++ > 100` compares the pre-increment value, so requests that reach the counter pass while
//   it reads 0..100 (101 requests); font/image requests return earlier and are not counted. Confirm the intended cap.
// NOTE(review): only packages/puppeteer-parse/package.json is changed here; whether packages/content-fetch also
//   declares the adblocker dependency is not visible in this patch.
diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js index c7493e364..38c19fcf1 100644 --- a/packages/content-fetch/fetch-content.js +++ b/packages/content-fetch/fetch-content.js @@ -31,9 +31,6 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl // Add stealth plugin to hide puppeteer usage const StealthPlugin = require('puppeteer-extra-plugin-stealth'); puppeteer.use(StealthPlugin()); -// Add adblocker plugin to block ads and trackers -const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker'); -puppeteer.use(AdblockerPlugin({ blockTrackers: true })); const userAgentForUrl = (url) => { @@ -351,6 +348,33 @@ function getUrl(req) { return parsed.href; } + +async function blockResources(page) { + const blockedResources = [ + // Assets + '*/favicon.ico', + '.css', + '.jpg', + '.jpeg', + '.png', + '.svg', + '.woff', + + // Analytics and other fluff + '*.optimizely.com', + 'everesttech.net', + 'userzoom.com', + 'doubleclick.net', + 'googleadservices.com', + 'adservice.google.com/*', + 'connect.facebook.com', + 'connect.facebook.net', + 'sp.analytics.yahoo.com', + ] + + await page._client.send('Network.setBlockedURLs', { urls: blockedResources }); +} + async function retrievePage(url) { validateUrlString(url); @@ -406,6 +430,8 @@ async function retrievePage(url) { } catch {} }); + await blockResources(page); + /* * Disallow MathJax from running in Puppeteer and modifying the document, * we shall instead run it in our frontend application to transform any @@ -413,24 +439,24 @@ async function retrievePage(url) { */ await page.setRequestInterception(true); let requestCount = 0; - // page.on('request', request => { - // if (request.resourceType() === 'font' || request.resourceType() === 'image') { - // request.abort(); - // return; - // } - // if (requestCount++ > 100) { - // request.abort(); - // return; - // } - // if ( - // request.resourceType() === 'script' && - // 
request.url().toLowerCase().indexOf('mathjax') > -1 - // ) { - // request.abort(); - // } else { - // request.continue(); - // } - // }); + page.on('request', request => { + if (request.resourceType() === 'font' || request.resourceType() === 'image') { + request.abort(); + return; + } + if (requestCount++ > 100) { + request.abort(); + return; + } + if ( + request.resourceType() === 'script' && + request.url().toLowerCase().indexOf('mathjax') > -1 + ) { + request.abort(); + } else { + request.continue(); + } + }); // Puppeteer fails during download of PDf files, // so record the failure and use those items diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 6cb5926ad..56a64afc2 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -29,9 +29,6 @@ const puppeteer = require('puppeteer-extra'); // Add stealth plugin to hide puppeteer usage const StealthPlugin = require('puppeteer-extra-plugin-stealth'); puppeteer.use(StealthPlugin()); -// Add adblocker plugin to block ads and trackers -const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker'); -puppeteer.use(AdblockerPlugin({ blockTrackers: true })); const storage = new Storage(); const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? 
process.env.ALLOWED_ORIGINS.split(',') : []; @@ -554,6 +551,32 @@ function getUrl(req) { } catch (e) {} } +async function blockResources(page) { + const blockedResources = [ + // Assets + '*/favicon.ico', + '.css', + '.jpg', + '.jpeg', + '.png', + '.svg', + '.woff', + + // Analytics and other fluff + '*.optimizely.com', + 'everesttech.net', + 'userzoom.com', + 'doubleclick.net', + 'googleadservices.com', + 'adservice.google.com/*', + 'connect.facebook.com', + 'connect.facebook.net', + 'sp.analytics.yahoo.com', + ] + + await page._client.send('Network.setBlockedURLs', { urls: blockedResources }); +} + async function retrievePage(url) { validateUrlString(url); @@ -609,6 +632,35 @@ async function retrievePage(url) { } catch {} }); + await blockResources(page); + + /* + * Disallow MathJax from running in Puppeteer and modifying the document, + * we shall instead run it in our frontend application to transform any + * mathjax content when present. + */ + await page.setRequestInterception(true); + let requestCount = 0; + page.on('request', request => { + if (request.resourceType() === 'font' || request.resourceType() === 'image') { + request.abort(); + return; + } + if (requestCount++ > 100) { + request.abort(); + return; + } + if ( + request.resourceType() === 'script' && + request.url().toLowerCase().indexOf('mathjax') > -1 + ) { + request.abort(); + } else { + request.continue(); + } + }); + + // Puppeteer fails during download of PDf files, // so record the failure and use those items let lastPdfUrl = undefined; diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 2159015e5..e37537415 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -15,7 +15,6 @@ "luxon": "^2.3.1", "puppeteer-core": "^13.7.0", "puppeteer-extra": "^3.2.3", - "puppeteer-extra-plugin-adblocker": "^2.12.0", "puppeteer-extra-plugin-stealth": "^2.9.0", "winston": "^3.3.3" },