From 1f1698ea8127fc2ed732d42d57d0dd74c0d236ea Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 15 Jul 2022 15:11:41 +0800 Subject: [PATCH] sync changes to content-fetch-gcf --- packages/puppeteer-parse/Dockerfile | 10 +--- packages/puppeteer-parse/index.js | 66 ++++++++++++++++++--------- packages/puppeteer-parse/package.json | 2 - 3 files changed, 45 insertions(+), 33 deletions(-) diff --git a/packages/puppeteer-parse/Dockerfile b/packages/puppeteer-parse/Dockerfile index 76ab9c967..d3ce96e20 100644 --- a/packages/puppeteer-parse/Dockerfile +++ b/packages/puppeteer-parse/Dockerfile @@ -74,13 +74,6 @@ RUN apk add --no-cache \ nodejs \ yarn -# Tell Puppeteer to skip installing Chrome. We'll be using the installed package. -ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ - PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser - -# Puppeteer v10.0.0 works with Chromium 92. -RUN yarn add puppeteer@10.0.0 - # Add user so we don't need --no-sandbox. RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \ && mkdir -p /home/pptruser/Downloads /app \ @@ -90,7 +83,6 @@ RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \ # Run everything after as non-privileged user. WORKDIR /app -ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true ENV CHROMIUM_PATH /usr/bin/chromium-browser ENV LAUNCH_HEADLESS=true @@ -109,4 +101,4 @@ ADD /packages/puppeteer-parse ./packages/puppeteer-parse EXPOSE 8080 # USER pptruser -ENTRYPOINT ["yarn", "workspace", "@omnivore/puppeteer-parse", "start"] \ No newline at end of file +ENTRYPOINT ["yarn", "workspace", "@omnivore/puppeteer-parse", "start"] diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index a2933147e..efa00a589 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -5,7 +5,6 @@ /* eslint-disable @typescript-eslint/no-require-imports */ require('dotenv').config(); const Url = require('url'); -const chromium = require('chrome-aws-lambda'); const axios = require('axios'); const jwt = require('jsonwebtoken'); const { promisify } = require('util'); @@ -24,11 +23,11 @@ const { pdfHandler } = require('./pdf-handler'); const { mediumHandler } = require('./medium-handler'); const { derstandardHandler } = require('./derstandard-handler'); const { imageHandler } = require('./image-handler'); -const puppeteer = require('puppeteer-extra'); +const puppeteer = require('puppeteer-core'); // Add stealth plugin to hide puppeteer usage -const StealthPlugin = require('puppeteer-extra-plugin-stealth'); -puppeteer.use(StealthPlugin()); +// const StealthPlugin = require('puppeteer-extra-plugin-stealth'); +// puppeteer.use(StealthPlugin()); const storage = new Storage(); const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; @@ -125,12 +124,34 @@ const userAgentForUrl = (url) => { // launch Puppeteer const getBrowserPromise = (async () => { return puppeteer.launch({ - args: chromium.args, + args: [ + '--allow-running-insecure-content', + '--autoplay-policy=user-gesture-required', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process', + '--disable-print-preview', + '--disable-setuid-sandbox', + '--disable-site-isolation-trials', + '--disable-speech-api', + '--disable-web-security', + '--disk-cache-size=33554432', + '--enable-features=SharedArrayBuffer', + '--hide-scrollbars', + '--ignore-gpu-blocklist', + '--in-process-gpu', + '--mute-audio', + '--no-default-browser-check', + '--no-pings', + '--no-sandbox', + '--no-zygote', + '--use-gl=swiftshader', + '--window-size=1920,1080', + ].filter((item) => !!item), defaultViewport: { height: 1080, width: 1920 }, - executablePath: process.env.CHROMIUM_PATH || (await chromium.executablePath), - headless: process.env.LAUNCH_HEADLESS ? true : chromium.headless, + executablePath: process.env.CHROMIUM_PATH, + headless: !!process.env.LAUNCH_HEADLESS, timeout: 0, - userDataDir: '/tmp/puppeteer', }); })(); @@ -179,10 +200,10 @@ const getUploadIdAndSignedUrl = async (userId, url) => { return response.data.data.uploadFileRequest; }; -const uploadPdf = async (url, userId) => { +const uploadPdf = async (url, userId, articleSavingRequestId) => { validateUrlString(url); - const uploadResult = await getUploadIdAndSignedUrl(userId, url); + const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId); await uploadToSignedUrl(uploadResult, 'application/pdf', url); return uploadResult.id; }; @@ -258,7 +279,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => { let url = getUrl(req); const userId = req.body.userId || req.query.userId; - const articleSavingRequestId = req.body.saveRequestId || req.query.saveRequestId; + const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined); logRecord = { url, @@ -277,11 +298,11 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => { return res.sendStatus(400); } - if (!userId || !articleSavingRequestId) { - Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query }); - logger.error(`Invalid parameters`, logRecord); - return res.sendStatus(400); - } + // if (!userId || !articleSavingRequestId) { + // Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query }); + // logger.error(`Invalid parameters`, logRecord); + // return res.sendStatus(400); + // } // Before we run the regular handlers we check to see if we need tp // pre-resolve the URL. TODO: This should probably happen recursively, @@ -348,7 +369,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => { try { if (contentType === 'application/pdf') { - const uploadedFileId = await uploadPdf(finalUrl, userId); + const uploadedFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId); const l = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId); } else { if (!content || !title) { @@ -551,7 +572,7 @@ function getUrl(req) { } catch (e) {} } -async function blockResources(page) { +async function blockResources(client) { const blockedResources = [ // Assets // '*/favicon.ico', @@ -574,7 +595,7 @@ async function blockResources(page) { 'sp.analytics.yahoo.com', ] - await page._client.send('Network.setBlockedURLs', { urls: blockedResources }); + await client.send('Network.setBlockedURLs', { urls: blockedResources }); } async function retrievePage(url) { @@ -603,7 +624,7 @@ async function retrievePage(url) { const path = require('path'); const download_path = path.resolve('./download_dir/'); - await page._client.send('Page.setDownloadBehavior', { + await client.send('Page.setDownloadBehavior', { behavior: 'allow', userDataDir: './', downloadPath: download_path, @@ -632,7 +653,7 @@ async function retrievePage(url) { } catch {} }); - await blockResources(page); + await blockResources(client); /* * Disallow MathJax from running in Puppeteer and modifying the document, @@ -683,6 +704,7 @@ async function retrievePage(url) { if (lastPdfUrl) { return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' }; } + await context.close(); throw error; } } @@ -722,7 +744,7 @@ async function retrieveHtml(page) { } })(); }), - page.waitForTimeout(5000), //5 second timeout + await page.waitForTimeout(1000), //5 second timeout ]); logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart }; diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index c991f50d6..273792ccc 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -14,8 +14,6 @@ "linkedom": "^0.14.9", "luxon": "^2.3.1", "puppeteer-core": "^15.3.2", - "puppeteer-extra": "^3.2.3", - "puppeteer-extra-plugin-stealth": "^2.9.0", "winston": "^3.3.3" }, "devDependencies": {