diff --git a/packages/content-fetch/fetch-content.js b/packages/content-fetch/fetch-content.js
index fd9a51b56..f90c753ce 100644
--- a/packages/content-fetch/fetch-content.js
+++ b/packages/content-fetch/fetch-content.js
@@ -24,6 +24,8 @@ const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) Apple
 const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
 const NON_BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
 const NON_BOT_HOSTS = ['bloomberg.com', 'forbes.com']
+// Hosts whose pages are loaded with JavaScript disabled (see enableJavascriptForUrl).
+const NON_SCRIPT_HOSTS = ['medium.com']
 
 const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
 
@@ -46,6 +48,32 @@ const userAgentForUrl = (url) => {
   return DESKTOP_USER_AGENT
 };
 
+/**
+ * Decides whether a page should be loaded with JavaScript enabled.
+ *
+ * Synchronous on purpose: the caller tests the result directly, and a
+ * Promise (what an async function returns) would always be truthy.
+ *
+ * @param {string} url - URL about to be fetched.
+ * @returns {boolean} false when the URL's hostname is one of
+ *   NON_SCRIPT_HOSTS or a subdomain of one; true otherwise, including when
+ *   the URL cannot be parsed.
+ */
+const enableJavascriptForUrl = (url) => {
+  try {
+    const { hostname } = new URL(url);
+    // Match the host itself or a dot-delimited subdomain, so look-alike
+    // suffixes such as "notmedium.com" are not treated as listed hosts.
+    const isNonScriptHost = NON_SCRIPT_HOSTS.some(
+      (host) => hostname === host || hostname.endsWith(`.${host}`)
+    );
+    return !isNonScriptHost;
+  } catch (e) {
+    console.log('error getting hostname for url', url, e)
+  }
+  return true
+};
+
 // launch Puppeteer
 const getBrowserPromise = (async () => {
   return puppeteer.launch({
@@ -406,6 +434,10 @@ async function retrievePage(url) {
 
   const context = await browser.createIncognitoBrowserContext();
   const page = await context.newPage();
+  if (!enableJavascriptForUrl(url)) {
+    // setJavaScriptEnabled returns a Promise; await it so a rejection is not left unhandled.
+    await page.setJavaScriptEnabled(false);
+  }
   await page.setUserAgent(userAgentForUrl(url));
 
   const client = await page.target().createCDPSession();