Merge pull request #597 from omnivore-app/remove-chrome-aws-lambda

Optimize puppeteer and remove chrome-aws-lambda dependencies
This commit is contained in:
Jackson Harper
2022-05-13 16:12:24 -07:00
committed by GitHub
16 changed files with 146 additions and 336 deletions

View File

@ -9,11 +9,8 @@ const axios = require('axios');
const { promisify } = require('util');
const { DateTime } = require('luxon');
const os = require('os');
const jsdom = require("jsdom");
const { Cipher } = require('crypto');
const { JSDOM } = jsdom;
const { parseHTML } = require('linkedom');
exports.appleNewsHandler = {
@ -30,10 +27,10 @@ exports.appleNewsHandler = {
const response = await axios.get(url, { headers: { 'User-Agent': MOBILE_USER_AGENT } } );
const data = response.data;
const dom = new JSDOM(data);
const dom = parseHTML(data).document;
// make sure its a valid URL by wrapping in new URL
const u = new URL(dom.window.document.querySelector('span.click-here').parentNode.href);
const u = new URL(dom.querySelector('span.click-here').parentNode.href);
return { url: u.href };
}
}

View File

@ -6,8 +6,7 @@
require('dotenv').config();
const axios = require('axios');
const os = require('os');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const { parseHTML } = require('linkedom');
exports.bloombergHandler = {
@ -30,8 +29,8 @@ exports.bloombergHandler = {
'block_resources': false,
}
})
const dom = new JSDOM(response.data);
return { title: dom.window.document.title, content: dom.window.document.querySelector('body').innerHTML, url: url }
const dom = parseHTML(response.data).document;
return { title: dom.title, content: dom.querySelector('body').innerHTML, url: url }
} catch (error) {
console.error('error prehandling bloomberg url', error)
throw error

View File

@ -5,8 +5,7 @@
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
const { parseHTML } = require('linkedom');
exports.derstandardHandler = {
shouldPrehandle: (url, env) => {
@ -23,10 +22,14 @@ exports.derstandardHandler = {
});
const content = response.data;
const dom = new JSDOM(content)
const titleElement = dom.window.document.querySelector('.article-title')
titleElement?.remove()
// Pull the article title out of the fetched page. `title` stays undefined
// when the page has no `.article-title` element.
let title;
const dom = parseHTML(content).document;
const titleElement = dom.querySelector('.article-title');
// querySelector returns null when nothing matches, so only read and remove
// the element when it was actually found. (The previous guard was inverted:
// it dereferenced null on a miss and skipped extraction on a hit.)
if (titleElement) {
  title = titleElement.textContent;
  // Detach the title node from the document once its text has been captured.
  titleElement.remove();
}
return { content: dom.window.document.body.outerHTML, title: titleElement?.textContent };
return { content: dom.body.outerHTML, title: title };
}
}

View File

@ -5,7 +5,6 @@
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const Url = require('url');
const chromium = require('chrome-aws-lambda');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
@ -125,10 +124,34 @@ const userAgentForUrl = (url) => {
// launch Puppeteer
const getBrowserPromise = (async () => {
return puppeteer.launch({
args: chromium.args,
args: [
'--allow-running-insecure-content',
'--autoplay-policy=user-gesture-required',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
'--disable-print-preview',
'--disable-setuid-sandbox',
'--disable-site-isolation-trials',
'--disable-speech-api',
'--disable-web-security',
'--disk-cache-size=33554432',
'--enable-features=SharedArrayBuffer',
'--hide-scrollbars',
'--ignore-gpu-blocklist',
'--in-process-gpu',
'--mute-audio',
'--no-default-browser-check',
'--no-pings',
'--no-sandbox',
'--no-zygote',
'--use-gl=swiftshader',
'--window-size=1920,1080', // https://source.chromium.org/search?q=lang:cpp+symbol:kWindowSize&ss=chromium
process.env.LAUNCH_HEADLESS ? '--single-process' : '--start-maximized',
],
defaultViewport: { height: 1080, width: 1920 },
executablePath: process.env.CHROMIUM_PATH || (await chromium.executablePath),
headless: process.env.LAUNCH_HEADLESS ? true : chromium.headless,
executablePath: process.env.CHROMIUM_PATH,
headless: !!process.env.LAUNCH_HEADLESS,
timeout: 0,
userDataDir: '/tmp/puppeteer',
});
@ -642,7 +665,7 @@ async function retrievePage(url) {
await page.setRequestInterception(true);
let requestCount = 0;
page.on('request', request => {
if (request.resourceType() === 'font' || request.resourceType() === 'image') {
if (['font', 'image', 'media'].includes(request.resourceType())) {
request.abort();
return;
}
@ -655,9 +678,9 @@ async function retrievePage(url) {
request.url().toLowerCase().indexOf('mathjax') > -1
) {
request.abort();
} else {
request.continue();
return
}
request.continue();
});

View File

@ -6,8 +6,6 @@
require('dotenv').config();
const axios = require('axios');
const os = require('os');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
exports.mediumHandler = {

View File

@ -8,10 +8,9 @@
"@google-cloud/storage": "^5.18.1",
"@sentry/serverless": "^6.13.3",
"axios": "^0.26.0",
"chrome-aws-lambda": "^10.1.0",
"dotenv": "^8.2.0",
"jsdom": "^19.0.0",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"puppeteer-core": "^13.7.0",
"puppeteer-extra": "^3.2.3",