Allow selectively disabling javascript on some hosts
Some hosts' readability is improved by disabling JavaScript.
This commit is contained in:
@ -24,6 +24,7 @@ const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) Apple
|
||||
// Desktop Chrome-on-macOS user-agent string.
// NOTE(review): presumably the UA sent to hosts that tolerate bot traffic — confirm against userAgentForUrl.
const BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36';
|
||||
// Desktop Chrome-on-macOS user-agent string.
// NOTE(review): this value is currently identical to BOT_DESKTOP_USER_AGENT — confirm whether that is intended.
const NON_BOT_DESKTOP_USER_AGENT =
  'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36';
|
||||
// Host names that get special user-agent handling.
// NOTE(review): presumably consulted by userAgentForUrl to pick the non-bot UA — confirm.
const NON_BOT_HOSTS = [
  'bloomberg.com',
  'forbes.com',
];
|
||||
// Hosts for which enableJavascriptForUrl reports that JavaScript should be
// disabled; entries are compared against the hostname of the requested URL.
const NON_SCRIPT_HOSTS = [
  'medium.com',
];
|
||||
|
||||
// MIME types treated as acceptable.
// NOTE(review): presumably checked against the response Content-Type before a page is processed — confirm at the usage site.
const ALLOWED_CONTENT_TYPES = [
  'text/html',
  'application/octet-stream',
  'text/plain',
  'application/pdf',
];
|
||||
|
||||
@ -46,6 +47,20 @@ const userAgentForUrl = (url) => {
|
||||
return DESKTOP_USER_AGENT
|
||||
};
|
||||
|
||||
/**
 * Decide whether JavaScript should stay enabled when loading `url`.
 *
 * Returns `false` when the URL's hostname is one of NON_SCRIPT_HOSTS or a
 * subdomain of one (e.g. `blog.medium.com` for `medium.com`), and `true`
 * otherwise. If `url` cannot be parsed, the error is logged and `true` is
 * returned so the page is loaded with the default (JavaScript enabled).
 *
 * This function is intentionally synchronous: its result is used directly in
 * an `if` condition, and a Promise (which an `async` function would return)
 * is always truthy. Callers that `await` the result keep working.
 *
 * @param {string} url - Absolute URL of the page about to be loaded.
 * @returns {boolean} `true` to keep JavaScript enabled, `false` to disable it.
 */
const enableJavascriptForUrl = (url) => {
  let hostname;
  try {
    ({ hostname } = new URL(url));
  } catch (e) {
    console.log('error getting hostname for url', url, e);
    return true;
  }

  // Match the host itself or a dot-delimited subdomain, so that unrelated
  // domains merely ending in the same characters (notmedium.com) don't match.
  const isNonScriptHost = NON_SCRIPT_HOSTS.some(
    (host) => hostname === host || hostname.endsWith(`.${host}`)
  );
  return !isNonScriptHost;
};
|
||||
|
||||
// launch Puppeteer
|
||||
const getBrowserPromise = (async () => {
|
||||
return puppeteer.launch({
|
||||
@ -406,6 +421,9 @@ async function retrievePage(url) {
|
||||
|
||||
const context = await browser.createIncognitoBrowserContext();
|
||||
const page = await context.newPage();
|
||||
if (!enableJavascriptForUrl(url)) {
|
||||
page.setJavaScriptEnabled(false)
|
||||
}
|
||||
await page.setUserAgent(userAgentForUrl(url));
|
||||
|
||||
const client = await page.target().createCDPSession();
|
||||
|
||||
Reference in New Issue
Block a user