Import puppeteer-parse in content-fetch

This commit is contained in:
Hongbo Wu
2022-10-10 11:42:58 +08:00
parent 00fed8a0fb
commit b18af10e75
11 changed files with 213 additions and 241 deletions

View File

@ -38,7 +38,6 @@ ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/content-handler ./packages/content-handler
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/puppeteer-parse build
# After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules

View File

@ -39,7 +39,6 @@ ADD /packages/content-handler ./packages/content-handler
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
ADD /packages/content-fetch ./packages/content-fetch
RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/puppeteer-parse build
# After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules

View File

@ -1,3 +1,4 @@
require('dotenv').config();
const express = require('express');
const app = express();

View File

@ -4,84 +4,14 @@
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const { config, format, loggers, transports } = require('winston');
const { LoggingWinston } = require('@google-cloud/logging-winston');
const { DateTime } = require('luxon');
const os = require('os');
const Sentry = require('@sentry/serverless');
const { Storage } = require('@google-cloud/storage');
const { fetchContent, getBrowserPromise, getUrl } = require("@omnivore/puppeteer-parse");
const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
const { fetchContent, preview } = require("@omnivore/puppeteer-parse");
Sentry.GCPFunction.init({
dsn: process.env.SENTRY_DSN,
tracesSampleRate: 0,
});
const filePath = `${os.tmpdir()}/previewImage.png`;
// Console styling per log level; localConfig passes this table to `format.colorize`.
// The keys mirror the syslog-style level names (the loggers below are created with
// `config.syslog.levels`).
const colors = {
emerg: 'inverse underline magenta',
alert: 'underline magenta',
crit: 'inverse underline red', // Any error that is forcing a shutdown of the service or application to prevent data loss.
error: 'underline red', // Any error which is fatal to the operation, but not the service or application
warning: 'underline yellow', // Anything that can potentially cause application oddities
notice: 'underline cyan', // Normal but significant condition
info: 'underline green', // Generally useful information to log
debug: 'underline gray',
};
// Base options for the LoggingWinston transport. buildLoggerTransport spreads these first,
// then overrides `logName` with the logger id and finally applies any per-call options.
const googleConfigs = {
level: 'info',
// Placeholder name: buildLoggerTransport always replaces it with the logger id.
logName: 'logger',
levels: config.syslog.levels,
resource: {
labels: {
// Both labels are read from the environment once, when this module is loaded.
function_name: process.env.FUNCTION_TARGET,
project_id: process.env.GCP_PROJECT,
},
type: 'cloud_function',
},
};
/**
 * Builds the options for the console transport used when running locally.
 *
 * Every entry is colourised, stamped with the current local time and printed as
 * `[<id>@<time>] <message>`, followed by any remaining fields as indented JSON.
 *
 * @param {string} id Logger identifier shown in the line prefix.
 * @returns {{level: string, format: *}} Options for `transports.Console`.
 */
function localConfig(id) {
  // Adds the wall-clock time to the entry in place and hands the same entry back.
  const attachTimestamp = entry => {
    entry.timestamp = DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS);
    return entry;
  };

  // Renders one entry; fields other than timestamp/message/level are dumped as JSON.
  const renderLine = entry => {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    const { timestamp, message, level, ...extras } = entry;
    const prefix = `[${id}@${timestamp}] ${message}`;
    if (Object.keys(extras).length === 0) {
      return prefix;
    }
    return `${prefix}${'\n' + JSON.stringify(extras, null, 4)}`;
  };

  return {
    level: 'debug',
    format: format.combine(
      format.colorize({ all: true, colors }),
      format(attachTimestamp)(),
      format.printf(renderLine),
    ),
  };
}
/**
 * Creates the transport a logger writes to.
 *
 * When the IS_LOCAL environment variable is set, a console transport configured by
 * localConfig(id) is returned; otherwise a LoggingWinston transport is built from
 * googleConfigs, with `logName` set to `id` and `options` applied last.
 *
 * @param {string} id Logger identifier (also used as the log name).
 * @param {Object} [options] Extra transport options; they override the defaults.
 * @returns {Object} A transport instance.
 */
function buildLoggerTransport(id, options) {
  if (process.env.IS_LOCAL) {
    return new transports.Console(localConfig(id));
  }

  const cloudOptions = { ...googleConfigs, logName: id, ...options };
  return new LoggingWinston(cloudOptions);
}
/**
 * Asks the winston logger container for the logger registered under `id`, supplying
 * syslog levels and a single transport built by buildLoggerTransport.
 *
 * NOTE(review): winston's `loggers.get` appears to hand back the already-registered logger
 * when `id` has been seen before; if so, the options (and the freshly built transport)
 * passed on later calls would be ignored — confirm that per-request options really apply.
 *
 * @param {string} id Logger identifier.
 * @param {Object} [options] Options forwarded to buildLoggerTransport.
 * @returns {Object} The logger returned by `loggers.get`.
 */
function buildLogger(id, options) {
  const loggerOptions = {
    levels: config.syslog.levels,
    transports: [buildLoggerTransport(id, options)],
  };
  return loggers.get(id, loggerOptions);
}
/**
* Cloud Function entry point, HTTP trigger.
* Loads the requested URL via Puppeteer, captures page content and sends it to backend
@ -100,122 +30,4 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(fetchContent);
* * url - URL address of the page to open
* @param {Object} res Cloud Function response context.
*/
exports.preview = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => {
const functionStartTime = Date.now();
// Grabbing execution and trace ids to attach logs to the appropriate function call
const execution_id = req.get('function-execution-id');
const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', {
trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
labels: {
execution_id: execution_id,
},
});
if (!process.env.PREVIEW_IMAGE_BUCKET) {
logger.error(`PREVIEW_IMAGE_BUCKET not set`)
return res.sendStatus(500);
}
const url = getUrl(req);
console.log('preview request url', url);
const logRecord = {
url,
query: req.query,
origin: req.get('Origin'),
labels: {
source: 'publicImagePreview',
},
};
logger.info(`Public preview image generation request`, logRecord);
if (!url) {
logRecord.urlIsInvalid = true;
logger.error(`Valid URL to parse is not specified`, logRecord);
return res.sendStatus(400);
}
const { origin } = new URL(url);
if (!ALLOWED_ORIGINS.some(o => o === origin)) {
logRecord.forbiddenOrigin = true;
logger.error(`This origin is not allowed: ${origin}`, logRecord);
return res.sendStatus(400);
}
const browser = await getBrowserPromise(process.env.PROXY_URL, process.env.CHROMIUM_PATH);
logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };
const page = await browser.newPage();
const pageLoadingStart = Date.now();
const modifiedUrl = new URL(url);
modifiedUrl.searchParams.append('fontSize', 24);
modifiedUrl.searchParams.append('adjustAspectRatio', 1.91);
try {
await page.goto(modifiedUrl);
logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
} catch (error) {
console.log('error going to page: ', modifiedUrl)
console.log(error)
throw error
}
// We lookup the destination path from our own page content and avoid trusting any passed query params
// selector - CSS selector of the element to get screenshot of
const selector = decodeURIComponent(
await page.$eval(
"head > meta[name='omnivore:preview_image_selector']",
element => element.content,
),
);
if (!selector) {
logRecord.selectorIsInvalid = true;
logger.error(`Valid element selector is not specified`, logRecord);
await page.close();
return res.sendStatus(400);
}
logRecord.selector = selector;
// destination - destination pathname for the image to save with
const destination = decodeURIComponent(
await page.$eval(
"head > meta[name='omnivore:preview_image_destination']",
element => element.content,
),
);
if (!destination) {
logRecord.destinationIsInvalid = true;
logger.error(`Valid file destination is not specified`, logRecord);
await page.close();
return res.sendStatus(400);
}
logRecord.destination = destination;
const screenshotTakingStart = Date.now();
try {
await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load
} catch (error) {
logRecord.elementNotFound = true;
logger.error(`Element is not presented on the page`, logRecord);
await page.close();
return res.sendStatus(400);
}
const element = await page.$(selector);
await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
await page.close();
try {
const [file] = await previewBucket.upload(filePath, {
destination,
metadata: logRecord,
});
logRecord.file = file.metadata;
} catch (e) {
console.log('error uploading to bucket, this is non-fatal', e)
}
logger.info(`preview-image`, logRecord);
return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
});
exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview);

View File

@ -4,26 +4,21 @@
"description": "Service that fetches page content from a URL",
"main": "index.js",
"dependencies": {
"axios": "^0.27.2",
"dotenv": "^8.2.0",
"express": "^4.17.1",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"puppeteer-core": "^16.1.0",
"underscore": "^1.13.4",
"@omnivore/puppeteer-parse": "^1.0.0",
"@google-cloud/logging-winston": "^4.1.2",
"@google-cloud/storage": "^5.18.1",
"@sentry/serverless": "^6.13.3",
"winston": "^3.3.3"
"@sentry/serverless": "^6.13.3"
},
"devDependencies": {
"@google-cloud/functions-framework": "^3.0.0"
"@google-cloud/functions-framework": "^3.0.0",
"chai": "^4.3.6",
"chai-string": "^1.5.0",
"mocha": "^10.0.0"
},
"scripts": {
"start": "node app.js",
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
"start_preview": "npx functions-framework --target=preview"
"start_preview": "npx functions-framework --target=preview",
"test": "mocha test/*.js"
}
}

View File

@ -1,3 +0,0 @@
const register = require('@babel/register').default
register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] })

View File

@ -0,0 +1,9 @@
const chai = require("chai");
const expect = chai.expect;
// Placeholder suite: a single assertion that always holds.
describe('Stub test', function () {
  it('should pass', function () {
    expect(true).to.be.true;
  });
});

View File

@ -1,13 +0,0 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import 'chai/register-should'
import chaiString from 'chai-string'
chai.use(chaiString)
// Placeholder suite: a single assertion that always holds.
describe('Stub test', function () {
  it('should pass', function () {
    expect(true).to.be.true;
  });
});

View File

@ -3,7 +3,6 @@
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const Url = require('url');
// const puppeteer = require('puppeteer-extra');
const axios = require('axios');
@ -27,6 +26,67 @@ const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
const filePath = `${os.tmpdir()}/previewImage.png`;
// Console styling per log level; localConfig passes this table to `format.colorize`.
// The keys mirror the syslog-style level names (the loggers below are created with
// `config.syslog.levels`).
const colors = {
emerg: 'inverse underline magenta',
alert: 'underline magenta',
crit: 'inverse underline red', // Any error that is forcing a shutdown of the service or application to prevent data loss.
error: 'underline red', // Any error which is fatal to the operation, but not the service or application
warning: 'underline yellow', // Anything that can potentially cause application oddities
notice: 'underline cyan', // Normal but significant condition
info: 'underline green', // Generally useful information to log
debug: 'underline gray',
};
// Base options for the LoggingWinston transport. buildLoggerTransport spreads these first,
// then overrides `logName` with the logger id and finally applies any per-call options.
const googleConfigs = {
level: 'info',
// Placeholder name: buildLoggerTransport always replaces it with the logger id.
logName: 'logger',
levels: config.syslog.levels,
resource: {
labels: {
// Both labels are read from the environment once, when this module is loaded.
function_name: process.env.FUNCTION_TARGET,
project_id: process.env.GCP_PROJECT,
},
type: 'cloud_function',
},
};
/**
 * Builds the options for the console transport used when running locally.
 *
 * Every entry is colourised, stamped with the current local time and printed as
 * `[<id>@<time>] <message>`, followed by any remaining fields as indented JSON.
 *
 * @param {string} id Logger identifier shown in the line prefix.
 * @returns {{level: string, format: *}} Options for `transports.Console`.
 */
function localConfig(id) {
  // Adds the wall-clock time to the entry in place and hands the same entry back.
  const attachTimestamp = entry => {
    entry.timestamp = DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS);
    return entry;
  };

  // Renders one entry; fields other than timestamp/message/level are dumped as JSON.
  const renderLine = entry => {
    // eslint-disable-next-line @typescript-eslint/no-unused-vars
    const { timestamp, message, level, ...extras } = entry;
    const prefix = `[${id}@${timestamp}] ${message}`;
    if (Object.keys(extras).length === 0) {
      return prefix;
    }
    return `${prefix}${'\n' + JSON.stringify(extras, null, 4)}`;
  };

  return {
    level: 'debug',
    format: format.combine(
      format.colorize({ all: true, colors }),
      format(attachTimestamp)(),
      format.printf(renderLine),
    ),
  };
}
/**
 * Creates the transport a logger writes to.
 *
 * When the IS_LOCAL environment variable is set, a console transport configured by
 * localConfig(id) is returned; otherwise a LoggingWinston transport is built from
 * googleConfigs, with `logName` set to `id` and `options` applied last.
 *
 * @param {string} id Logger identifier (also used as the log name).
 * @param {Object} [options] Extra transport options; they override the defaults.
 * @returns {Object} A transport instance.
 */
function buildLoggerTransport(id, options) {
  if (process.env.IS_LOCAL) {
    return new transports.Console(localConfig(id));
  }

  const cloudOptions = { ...googleConfigs, logName: id, ...options };
  return new LoggingWinston(cloudOptions);
}
/**
 * Asks the winston logger container for the logger registered under `id`, supplying
 * syslog levels and a single transport built by buildLoggerTransport.
 *
 * NOTE(review): winston's `loggers.get` appears to hand back the already-registered logger
 * when `id` has been seen before; if so, the options (and the freshly built transport)
 * passed on later calls would be ignored — confirm that per-request options really apply.
 *
 * @param {string} id Logger identifier.
 * @param {Object} [options] Options forwarded to buildLoggerTransport.
 * @returns {Object} The logger returned by `loggers.get`.
 */
function buildLogger(id, options) {
  const loggerOptions = {
    levels: config.syslog.levels,
    transports: [buildLoggerTransport(id, options)],
  };
  return loggers.get(id, loggerOptions);
}
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
@ -208,7 +268,16 @@ const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId
};
async function fetchContent(req, res) {
functionStartTime = Date.now();
const functionStartTime = Date.now();
// Grabbing execution and trace ids to attach logs to the appropriate function call
const execution_id = req.get('function-execution-id');
const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', {
trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
labels: {
execution_id: execution_id,
},
});
let url = getUrl(req);
const userId = req.body.userId || req.query.userId;
@ -616,9 +685,128 @@ async function retrieveHtml(page) {
return { domContent, title };
}
/**
 * HTTP handler that renders a page from an allowed origin, screenshots the element the page
 * itself designates, uploads the image to the preview bucket and redirects to its CDN location.
 *
 * The screenshot selector and the upload destination are read from the page's own
 * `omnivore:preview_image_selector` / `omnivore:preview_image_destination` meta tags rather
 * than from request parameters.
 *
 * Responses: 500 when PREVIEW_IMAGE_BUCKET is unset; 400 for a missing or unparseable URL, a
 * disallowed origin, a missing/empty meta tag, or an element that does not show up within
 * 3 seconds; otherwise a redirect to `${PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`.
 * A failed bucket upload is logged and does not prevent the redirect.
 *
 * @param {Object} req Request context; the target URL is obtained through getUrl(req).
 * @param {Object} res Response context.
 * @returns {Promise<*>} The value returned by res.sendStatus / res.redirect.
 * @throws Navigation and screenshot errors are rethrown after the page has been closed.
 */
async function preview(req, res) {
  const functionStartTime = Date.now();
  // Grabbing execution and trace ids to attach logs to the appropriate function call
  const execution_id = req.get('function-execution-id');
  const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0];
  const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', {
    trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`,
    labels: {
      execution_id: execution_id,
    },
  });

  if (!process.env.PREVIEW_IMAGE_BUCKET) {
    logger.error(`PREVIEW_IMAGE_BUCKET not set`);
    return res.sendStatus(500);
  }

  const url = getUrl(req);
  console.log('preview request url', url);

  const logRecord = {
    url,
    query: req.query,
    origin: req.get('Origin'),
    labels: {
      source: 'publicImagePreview',
    },
  };
  logger.info(`Public preview image generation request`, logRecord);

  if (!url) {
    logRecord.urlIsInvalid = true;
    logger.error(`Valid URL to parse is not specified`, logRecord);
    return res.sendStatus(400);
  }

  // An unparseable URL is a client error, not a crash: answer 400 like a missing URL.
  let origin;
  try {
    ({ origin } = new URL(url));
  } catch (error) {
    logRecord.urlIsInvalid = true;
    logger.error(`Valid URL to parse is not specified`, logRecord);
    return res.sendStatus(400);
  }

  if (!ALLOWED_ORIGINS.includes(origin)) {
    logRecord.forbiddenOrigin = true;
    logger.error(`This origin is not allowed: ${origin}`, logRecord);
    return res.sendStatus(400);
  }

  const browser = await getBrowserPromise(process.env.PROXY_URL, process.env.CHROMIUM_PATH);
  logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime };

  const page = await browser.newPage();

  // Content of a <head> meta tag, or '' when the tag is absent. `page.$eval` throws when
  // nothing matches, which would bypass the 400 handling below, so `$$eval` is used instead.
  const readMetaContent = (name) =>
    page.$$eval(`head > meta[name='${name}']`, (elements) =>
      elements.length > 0 ? elements[0].content : '',
    );

  // Loads the page and writes the screenshot to `filePath`.
  // Resolves to the upload destination, or to null when the request must be answered with 400
  // (the reason has already been logged).
  const captureScreenshot = async () => {
    const pageLoadingStart = Date.now();
    const modifiedUrl = new URL(url);
    modifiedUrl.searchParams.append('fontSize', '24');
    modifiedUrl.searchParams.append('adjustAspectRatio', '1.91');

    try {
      await page.goto(modifiedUrl.href);
      logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart };
    } catch (error) {
      console.log('error going to page: ', modifiedUrl);
      console.log(error);
      throw error;
    }

    // We lookup the destination path from our own page content and avoid trusting any passed query params
    // selector - CSS selector of the element to get screenshot of
    const selector = decodeURIComponent(await readMetaContent('omnivore:preview_image_selector'));
    if (!selector) {
      logRecord.selectorIsInvalid = true;
      logger.error(`Valid element selector is not specified`, logRecord);
      return null;
    }
    logRecord.selector = selector;

    // destination - destination pathname for the image to save with
    const destinationPath = decodeURIComponent(
      await readMetaContent('omnivore:preview_image_destination'),
    );
    if (!destinationPath) {
      logRecord.destinationIsInvalid = true;
      logger.error(`Valid file destination is not specified`, logRecord);
      return null;
    }
    logRecord.destination = destinationPath;

    const screenshotTakingStart = Date.now();
    let element = null;
    try {
      // Use the handle returned by the wait itself, so a node that disappears between the
      // wait and a second lookup cannot leave us with a null element.
      element = await page.waitForSelector(selector, { timeout: 3000 });
    } catch (error) {
      element = null;
    }
    if (!element) {
      logRecord.elementNotFound = true;
      logger.error(`Element is not presented on the page`, logRecord);
      return null;
    }

    // NOTE(review): `filePath` is a single module-level path, so overlapping requests would
    // write to the same file — confirm this handler never runs concurrently in one instance.
    await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer
    logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart };
    return destinationPath;
  };

  let destination;
  try {
    destination = await captureScreenshot();
  } finally {
    // Always release the page, including when navigation or the screenshot throws.
    await page.close();
  }

  if (destination === null) {
    return res.sendStatus(400);
  }

  try {
    const [file] = await previewBucket.upload(filePath, {
      destination,
      metadata: logRecord,
    });
    logRecord.file = file.metadata;
  } catch (e) {
    console.log('error uploading to bucket, this is non-fatal', e);
  }

  logger.info(`preview-image`, logRecord);
  return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`);
}
// Functions exposed to callers that require this module.
module.exports = {
fetchContent,
getBrowserPromise,
getUrl,
preview,
};

View File

@ -8,7 +8,6 @@
"@google-cloud/storage": "^5.18.1",
"@omnivore/content-handler": "1.0.0",
"axios": "^0.27.2",
"dotenv": "^8.2.0",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9",
"luxon": "^2.3.1",

View File

@ -2444,7 +2444,7 @@
google-gax "^2.24.1"
protobufjs "^6.8.6"
"@google-cloud/functions-framework@3.1.2", "@google-cloud/functions-framework@^3.1.2":
"@google-cloud/functions-framework@3.1.2", "@google-cloud/functions-framework@^3.0.0":
version "3.1.2"
resolved "https://registry.yarnpkg.com/@google-cloud/functions-framework/-/functions-framework-3.1.2.tgz#2cd92ce4307bf7f32555d028dca22e398473b410"
integrity sha512-pYvEH65/Rqh1JNPdcBmorcV7Xoom2/iOSmbtYza8msro7Inl+qOYxbyMiQfySD2gwAyn38WyWPRqsDRcf/BFLg==
@ -10907,13 +10907,6 @@ chownr@^2.0.0:
resolved "https://registry.yarnpkg.com/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece"
integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==
chrome-aws-lambda@^10.1.0:
version "10.1.0"
resolved "https://registry.yarnpkg.com/chrome-aws-lambda/-/chrome-aws-lambda-10.1.0.tgz#ac43b4cdfc1fbb2275c62effada560858099501e"
integrity sha512-NZQVf+J4kqG4sVhRm3WNmOfzY0OtTSm+S8rg77pwePa9RCYHzhnzRs8YvNI6L9tALIW6RpmefWiPURt3vURXcw==
dependencies:
lambdafs "^2.0.3"
chrome-trace-event@^1.0.2:
version "1.0.3"
resolved "https://registry.yarnpkg.com/chrome-trace-event/-/chrome-trace-event-1.0.3.tgz#1015eced4741e15d06664a957dbbf50d041e26ac"
@ -17677,13 +17670,6 @@ kuler@^2.0.0:
resolved "https://registry.yarnpkg.com/kuler/-/kuler-2.0.0.tgz#e2c570a3800388fb44407e851531c1d670b061b3"
integrity sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==
lambdafs@^2.0.3:
version "2.1.1"
resolved "https://registry.yarnpkg.com/lambdafs/-/lambdafs-2.1.1.tgz#4bf8d3037b6c61bbb4a22ab05c73ee47964c25ed"
integrity sha512-x5k8JcoJWkWLvCVBzrl4pzvkEHSgSBqFjg3Dpsc4AcTMq7oUMym4cL/gRTZ6VM4mUMY+M0dIbQ+V1c1tsqqanQ==
dependencies:
tar-fs "^2.1.1"
language-subtag-registry@~0.3.2:
version "0.3.21"
resolved "https://registry.yarnpkg.com/language-subtag-registry/-/language-subtag-registry-0.3.21.tgz#04ac218bea46f04cb039084602c6da9e788dd45a"
@ -23988,7 +23974,7 @@ tar-fs@2.0.0:
pump "^3.0.0"
tar-stream "^2.0.0"
tar-fs@2.1.1, tar-fs@^2.1.1:
tar-fs@2.1.1:
version "2.1.1"
resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784"
integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==