sync changes to content-fetch-gcf

This commit is contained in:
Hongbo Wu
2022-07-15 15:11:41 +08:00
parent b2238ce7f2
commit 1f1698ea81
3 changed files with 45 additions and 33 deletions

View File

@ -74,13 +74,6 @@ RUN apk add --no-cache \
nodejs \
yarn
# Tell Puppeteer to skip installing Chrome. We'll be using the installed package.
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
# Puppeteer v10.0.0 works with Chromium 92.
RUN yarn add puppeteer@10.0.0
# Add user so we don't need --no-sandbox.
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
&& mkdir -p /home/pptruser/Downloads /app \
@ -90,7 +83,6 @@ RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
# Run everything after as non-privileged user.
WORKDIR /app
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
ENV CHROMIUM_PATH /usr/bin/chromium-browser
ENV LAUNCH_HEADLESS=true
@ -109,4 +101,4 @@ ADD /packages/puppeteer-parse ./packages/puppeteer-parse
EXPOSE 8080
# USER pptruser
ENTRYPOINT ["yarn", "workspace", "@omnivore/puppeteer-parse", "start"]
ENTRYPOINT ["yarn", "workspace", "@omnivore/puppeteer-parse", "start"]

View File

@ -5,7 +5,6 @@
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const Url = require('url');
const chromium = require('chrome-aws-lambda');
const axios = require('axios');
const jwt = require('jsonwebtoken');
const { promisify } = require('util');
@ -24,11 +23,11 @@ const { pdfHandler } = require('./pdf-handler');
const { mediumHandler } = require('./medium-handler');
const { derstandardHandler } = require('./derstandard-handler');
const { imageHandler } = require('./image-handler');
const puppeteer = require('puppeteer-extra');
const puppeteer = require('puppeteer-core');
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
// const StealthPlugin = require('puppeteer-extra-plugin-stealth');
// puppeteer.use(StealthPlugin());
const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
@ -125,12 +124,34 @@ const userAgentForUrl = (url) => {
// launch Puppeteer
const getBrowserPromise = (async () => {
return puppeteer.launch({
args: chromium.args,
args: [
'--allow-running-insecure-content',
'--autoplay-policy=user-gesture-required',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
'--disable-print-preview',
'--disable-setuid-sandbox',
'--disable-site-isolation-trials',
'--disable-speech-api',
'--disable-web-security',
'--disk-cache-size=33554432',
'--enable-features=SharedArrayBuffer',
'--hide-scrollbars',
'--ignore-gpu-blocklist',
'--in-process-gpu',
'--mute-audio',
'--no-default-browser-check',
'--no-pings',
'--no-sandbox',
'--no-zygote',
'--use-gl=swiftshader',
'--window-size=1920,1080',
].filter((item) => !!item),
defaultViewport: { height: 1080, width: 1920 },
executablePath: process.env.CHROMIUM_PATH || (await chromium.executablePath),
headless: process.env.LAUNCH_HEADLESS ? true : chromium.headless,
executablePath: process.env.CHROMIUM_PATH,
headless: !!process.env.LAUNCH_HEADLESS,
timeout: 0,
userDataDir: '/tmp/puppeteer',
});
})();
@ -179,10 +200,10 @@ const getUploadIdAndSignedUrl = async (userId, url) => {
return response.data.data.uploadFileRequest;
};
const uploadPdf = async (url, userId) => {
const uploadPdf = async (url, userId, articleSavingRequestId) => {
validateUrlString(url);
const uploadResult = await getUploadIdAndSignedUrl(userId, url);
const uploadResult = await getUploadIdAndSignedUrl(userId, url, articleSavingRequestId);
await uploadToSignedUrl(uploadResult, 'application/pdf', url);
return uploadResult.id;
};
@ -258,7 +279,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
let url = getUrl(req);
const userId = req.body.userId || req.query.userId;
const articleSavingRequestId = req.body.saveRequestId || req.query.saveRequestId;
const articleSavingRequestId = (req.query ? req.query.saveRequestId : undefined) || (req.body ? req.body.saveRequestId : undefined);
logRecord = {
url,
@ -277,11 +298,11 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
return res.sendStatus(400);
}
if (!userId || !articleSavingRequestId) {
Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query });
logger.error(`Invalid parameters`, logRecord);
return res.sendStatus(400);
}
// if (!userId || !articleSavingRequestId) {
// Object.assign(logRecord, { invalidParams: true, body: req.body, query: req.query });
// logger.error(`Invalid parameters`, logRecord);
// return res.sendStatus(400);
// }
// Before we run the regular handlers we check to see if we need to
// pre-resolve the URL. TODO: This should probably happen recursively,
@ -348,7 +369,7 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
try {
if (contentType === 'application/pdf') {
const uploadedFileId = await uploadPdf(finalUrl, userId);
const uploadedFileId = await uploadPdf(finalUrl, userId, articleSavingRequestId);
const l = await saveUploadedPdf(userId, finalUrl, uploadedFileId, articleSavingRequestId);
} else {
if (!content || !title) {
@ -551,7 +572,7 @@ function getUrl(req) {
} catch (e) {}
}
async function blockResources(page) {
async function blockResources(client) {
const blockedResources = [
// Assets
// '*/favicon.ico',
@ -574,7 +595,7 @@ async function blockResources(page) {
'sp.analytics.yahoo.com',
]
await page._client.send('Network.setBlockedURLs', { urls: blockedResources });
await client.send('Network.setBlockedURLs', { urls: blockedResources });
}
async function retrievePage(url) {
@ -603,7 +624,7 @@ async function retrievePage(url) {
const path = require('path');
const download_path = path.resolve('./download_dir/');
await page._client.send('Page.setDownloadBehavior', {
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
userDataDir: './',
downloadPath: download_path,
@ -632,7 +653,7 @@ async function retrievePage(url) {
} catch {}
});
await blockResources(page);
await blockResources(client);
/*
* Disallow MathJax from running in Puppeteer and modifying the document,
@ -683,6 +704,7 @@ async function retrievePage(url) {
if (lastPdfUrl) {
return { context, page, finalUrl: lastPdfUrl, contentType: 'application/pdf' };
}
await context.close();
throw error;
}
}
@ -722,7 +744,7 @@ async function retrieveHtml(page) {
}
})();
}),
page.waitForTimeout(5000), //5 second timeout
await page.waitForTimeout(1000), // 1 second timeout
]);
logRecord.timing = { ...logRecord.timing, pageScrolled: Date.now() - pageScrollingStart };

View File

@ -14,8 +14,6 @@
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"puppeteer-core": "^15.3.2",
"puppeteer-extra": "^3.2.3",
"puppeteer-extra-plugin-stealth": "^2.9.0",
"winston": "^3.3.3"
},
"devDependencies": {