Remove adblocker plugin; block resources by URL and also block the MathJax script

This commit is contained in:
Hongbo Wu
2022-05-11 22:04:47 +08:00
parent d542d31aed
commit 0984dca183
3 changed files with 102 additions and 25 deletions

View File

@ -31,9 +31,6 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block ads and trackers
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const userAgentForUrl = (url) => {
@ -351,6 +348,33 @@ function getUrl(req) {
return parsed.href;
}
/**
 * Ask the browser to refuse requests whose URL matches one of a fixed list of
 * patterns (static assets plus analytics/ad hosts) by sending the DevTools
 * protocol command `Network.setBlockedURLs` over the page's CDP session.
 *
 * @param {object} page - Puppeteer page; its internal `_client` member is used
 *   to reach the CDP session.
 * @returns {Promise<void>} Resolves once the command has been sent.
 */
async function blockResources(page) {
  const blockedResources = [
    // Assets
    '*/favicon.ico',
    '.css',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.woff',
    // Analytics and other fluff
    '*.optimizely.com',
    'everesttech.net',
    'userzoom.com',
    'doubleclick.net',
    'googleadservices.com',
    'adservice.google.com/*',
    'connect.facebook.com',
    'connect.facebook.net',
    'sp.analytics.yahoo.com',
  ];

  // `_client` is a Puppeteer internal, not public API. It is a plain session
  // object in the 13.x line this project pins, while newer Puppeteer releases
  // expose it as an accessor method instead. Accept both shapes so a
  // dependency bump does not surface as "send is not a function".
  const client = typeof page._client === 'function' ? page._client() : page._client;
  await client.send('Network.setBlockedURLs', { urls: blockedResources });
}
async function retrievePage(url) {
validateUrlString(url);
@ -406,6 +430,8 @@ async function retrievePage(url) {
} catch {}
});
await blockResources(page);
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
* we shall instead run it in our frontend application to transform any
@ -413,24 +439,24 @@ async function retrievePage(url) {
*/
await page.setRequestInterception(true);
let requestCount = 0;
// page.on('request', request => {
// if (request.resourceType() === 'font' || request.resourceType() === 'image') {
// request.abort();
// return;
// }
// if (requestCount++ > 100) {
// request.abort();
// return;
// }
// if (
// request.resourceType() === 'script' &&
// request.url().toLowerCase().indexOf('mathjax') > -1
// ) {
// request.abort();
// } else {
// request.continue();
// }
// });
page.on('request', request => {
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
request.abort();
return;
}
if (requestCount++ > 100) {
request.abort();
return;
}
if (
request.resourceType() === 'script' &&
request.url().toLowerCase().indexOf('mathjax') > -1
) {
request.abort();
} else {
request.continue();
}
});
// Puppeteer fails during download of PDF files,
// so record the failure and use those items

View File

@ -29,9 +29,6 @@ const puppeteer = require('puppeteer-extra');
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// Add adblocker plugin to block ads and trackers
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
@ -554,6 +551,32 @@ function getUrl(req) {
} catch (e) {}
}
/**
 * Ask the browser to refuse requests whose URL matches one of a fixed list of
 * patterns (static assets plus analytics/ad hosts) by sending the DevTools
 * protocol command `Network.setBlockedURLs` over the page's CDP session.
 *
 * @param {object} page - Puppeteer page; its internal `_client` member is used
 *   to reach the CDP session.
 * @returns {Promise<void>} Resolves once the command has been sent.
 */
async function blockResources(page) {
  const blockedResources = [
    // Assets
    '*/favicon.ico',
    '.css',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.woff',
    // Analytics and other fluff
    '*.optimizely.com',
    'everesttech.net',
    'userzoom.com',
    'doubleclick.net',
    'googleadservices.com',
    'adservice.google.com/*',
    'connect.facebook.com',
    'connect.facebook.net',
    'sp.analytics.yahoo.com',
  ];

  // `_client` is a Puppeteer internal, not public API. It is a plain session
  // object in the 13.x line this project pins, while newer Puppeteer releases
  // expose it as an accessor method instead. Accept both shapes so a
  // dependency bump does not surface as "send is not a function".
  const client = typeof page._client === 'function' ? page._client() : page._client;
  await client.send('Network.setBlockedURLs', { urls: blockedResources });
}
async function retrievePage(url) {
validateUrlString(url);
@ -609,6 +632,35 @@ async function retrievePage(url) {
} catch {}
});
await blockResources(page);
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
* we shall instead run it in our frontend application to transform any
* mathjax content when present.
*/
await page.setRequestInterception(true);
let requestCount = 0;
page.on('request', request => {
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
request.abort();
return;
}
if (requestCount++ > 100) {
request.abort();
return;
}
if (
request.resourceType() === 'script' &&
request.url().toLowerCase().indexOf('mathjax') > -1
) {
request.abort();
} else {
request.continue();
}
});
// Puppeteer fails during download of PDF files,
// so record the failure and use those items
let lastPdfUrl = undefined;

View File

@ -15,7 +15,6 @@
"luxon": "^2.3.1",
"puppeteer-core": "^13.7.0",
"puppeteer-extra": "^3.2.3",
"puppeteer-extra-plugin-adblocker": "^2.12.0",
"puppeteer-extra-plugin-stealth": "^2.9.0",
"winston": "^3.3.3"
},