Allow selectively disabling javascript on some hosts

Some hosts' readability is improved by disabling JavaScript.
This commit is contained in:
Jackson Harper
2022-06-10 13:25:10 -07:00
parent ca51cddf2a
commit b94215f1fc

View File

@ -24,6 +24,7 @@ const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) Apple
// Desktop user-agent strings.
// NOTE(review): BOT_DESKTOP_USER_AGENT and NON_BOT_DESKTOP_USER_AGENT are
// currently the exact same string — confirm whether they are meant to differ.
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
// Presumably the hosts that should receive NON_BOT_DESKTOP_USER_AGENT; the code
// that reads this list is not visible here — verify against userAgentForUrl.
const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
// Hosts checked by enableJavascriptForUrl: a URL whose hostname matches an
// entry makes that function return false.
const NON_SCRIPT_HOSTS= ['medium.com']
// Presumably the response content types accepted when retrieving a page; the
// code that reads this list is not visible here — verify against its usage.
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
@ -46,6 +47,20 @@ const userAgentForUrl = (url) => {
return DESKTOP_USER_AGENT
};
/**
 * Decide whether JavaScript should stay enabled when loading `url`.
 *
 * Returns false when the URL's hostname is one of NON_SCRIPT_HOSTS or a
 * subdomain of one; returns true otherwise, including when `url` cannot be
 * parsed (the parse error is logged and JavaScript is left enabled).
 *
 * This function is intentionally synchronous. It performs no asynchronous
 * work, and an `async` version would return a Promise, which is always truthy,
 * so a plain `if (!enableJavascriptForUrl(url))` check could never take the
 * "disable" branch. Callers that `await` the result keep working unchanged.
 *
 * @param {string} url - Absolute URL of the page about to be loaded.
 * @returns {boolean} false if JavaScript should be disabled, true otherwise.
 */
const enableJavascriptForUrl = (url) => {
  try {
    const { hostname } = new URL(url);
    // Match the host itself or a dot-delimited subdomain. A bare endsWith(host)
    // would also match unrelated domains such as "notmedium.com".
    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
      (host) => hostname === host || hostname.endsWith(`.${host}`),
    );
    if (isNonScriptHost) {
      return false;
    }
  } catch (e) {
    // Best effort: an unparseable URL keeps the default (JavaScript enabled).
    console.log('error getting hostname for url', url, e);
  }
  return true;
};
// launch Puppeteer
const getBrowserPromise = (async () => {
return puppeteer.launch({
@ -406,6 +421,9 @@ async function retrievePage(url) {
const context = await browser.createIncognitoBrowserContext();
const page = await context.newPage();
if (!enableJavascriptForUrl(url)) {
page.setJavaScriptEnabled(false)
}
await page.setUserAgent(userAgentForUrl(url));
const client = await page.target().createCDPSession();