Remove adblocker and block resources by url and also block mathJax script
This commit is contained in:
@ -31,9 +31,6 @@ const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/pl
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
// Add adblocker plugin to block ads and trackers
|
||||
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
|
||||
|
||||
|
||||
const userAgentForUrl = (url) => {
|
||||
@ -351,6 +348,33 @@ function getUrl(req) {
|
||||
return parsed.href;
|
||||
}
|
||||
|
||||
|
||||
/**
 * Asks the browser to block requests whose URL matches a fixed list of
 * static-asset patterns and analytics/advertising hosts.
 *
 * The list is sent to the page's DevTools session with the
 * `Network.setBlockedURLs` command.
 *
 * @param {object} page - Puppeteer page exposing its DevTools session through
 *   the private `_client` member.
 * @returns {Promise<void>} Resolves once the command has been acknowledged.
 */
async function blockResources(page) {
  const blockedResources = [
    // Assets
    '*/favicon.ico',
    '.css',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.woff',

    // Analytics and other fluff
    '*.optimizely.com',
    'everesttech.net',
    'userzoom.com',
    'doubleclick.net',
    'googleadservices.com',
    'adservice.google.com/*',
    'connect.facebook.com',
    'connect.facebook.net',
    'sp.analytics.yahoo.com',
  ];

  // `_client` is a private Puppeteer member: a plain property up to v13 and a
  // method from v14 onwards. Accept both so a dependency bump does not turn
  // this call into a TypeError.
  const client = typeof page._client === 'function' ? page._client() : page._client;

  await client.send('Network.setBlockedURLs', { urls: blockedResources });
}
|
||||
|
||||
async function retrievePage(url) {
|
||||
validateUrlString(url);
|
||||
|
||||
@ -406,6 +430,8 @@ async function retrievePage(url) {
|
||||
} catch {}
|
||||
});
|
||||
|
||||
await blockResources(page);
|
||||
|
||||
/*
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
@ -413,24 +439,24 @@ async function retrievePage(url) {
|
||||
*/
|
||||
await page.setRequestInterception(true);
|
||||
let requestCount = 0;
|
||||
// page.on('request', request => {
|
||||
// if (request.resourceType() === 'font' || request.resourceType() === 'image') {
|
||||
// request.abort();
|
||||
// return;
|
||||
// }
|
||||
// if (requestCount++ > 100) {
|
||||
// request.abort();
|
||||
// return;
|
||||
// }
|
||||
// if (
|
||||
// request.resourceType() === 'script' &&
|
||||
// request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
// ) {
|
||||
// request.abort();
|
||||
// } else {
|
||||
// request.continue();
|
||||
// }
|
||||
// });
|
||||
page.on('request', request => {
|
||||
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (requestCount++ > 100) {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
request.abort();
|
||||
} else {
|
||||
request.continue();
|
||||
}
|
||||
});
|
||||
|
||||
// Puppeteer fails during download of PDF files,
|
||||
// so record the failure and use those items
|
||||
|
||||
@ -29,9 +29,6 @@ const puppeteer = require('puppeteer-extra');
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
// Add adblocker plugin to block ads and trackers
|
||||
const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker');
|
||||
puppeteer.use(AdblockerPlugin({ blockTrackers: true }));
|
||||
|
||||
const storage = new Storage();
|
||||
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
||||
@ -554,6 +551,32 @@ function getUrl(req) {
|
||||
} catch (e) {}
|
||||
}
|
||||
|
||||
/**
 * Asks the browser to block requests whose URL matches a fixed list of
 * static-asset patterns and analytics/advertising hosts.
 *
 * The list is sent to the page's DevTools session with the
 * `Network.setBlockedURLs` command.
 *
 * @param {object} page - Puppeteer page exposing its DevTools session through
 *   the private `_client` member.
 * @returns {Promise<void>} Resolves once the command has been acknowledged.
 */
async function blockResources(page) {
  const blockedResources = [
    // Assets
    '*/favicon.ico',
    '.css',
    '.jpg',
    '.jpeg',
    '.png',
    '.svg',
    '.woff',

    // Analytics and other fluff
    '*.optimizely.com',
    'everesttech.net',
    'userzoom.com',
    'doubleclick.net',
    'googleadservices.com',
    'adservice.google.com/*',
    'connect.facebook.com',
    'connect.facebook.net',
    'sp.analytics.yahoo.com',
  ];

  // `_client` is a private Puppeteer member: a plain property up to v13 and a
  // method from v14 onwards. Accept both so a dependency bump does not turn
  // this call into a TypeError.
  const client = typeof page._client === 'function' ? page._client() : page._client;

  await client.send('Network.setBlockedURLs', { urls: blockedResources });
}
|
||||
|
||||
async function retrievePage(url) {
|
||||
validateUrlString(url);
|
||||
|
||||
@ -609,6 +632,35 @@ async function retrievePage(url) {
|
||||
} catch {}
|
||||
});
|
||||
|
||||
await blockResources(page);
|
||||
|
||||
/*
|
||||
* Disallow MathJax from running in Puppeteer and modifying the document,
|
||||
* we shall instead run it in our frontend application to transform any
|
||||
* mathjax content when present.
|
||||
*/
|
||||
await page.setRequestInterception(true);
|
||||
let requestCount = 0;
|
||||
page.on('request', request => {
|
||||
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (requestCount++ > 100) {
|
||||
request.abort();
|
||||
return;
|
||||
}
|
||||
if (
|
||||
request.resourceType() === 'script' &&
|
||||
request.url().toLowerCase().indexOf('mathjax') > -1
|
||||
) {
|
||||
request.abort();
|
||||
} else {
|
||||
request.continue();
|
||||
}
|
||||
});
|
||||
|
||||
|
||||
// Puppeteer fails during download of PDF files,
|
||||
// so record the failure and use those items
|
||||
let lastPdfUrl = undefined;
|
||||
|
||||
@ -15,7 +15,6 @@
|
||||
"luxon": "^2.3.1",
|
||||
"puppeteer-core": "^13.7.0",
|
||||
"puppeteer-extra": "^3.2.3",
|
||||
"puppeteer-extra-plugin-adblocker": "^2.12.0",
|
||||
"puppeteer-extra-plugin-stealth": "^2.9.0",
|
||||
"winston": "^3.3.3"
|
||||
},
|
||||
|
||||
Reference in New Issue
Block a user