diff --git a/packages/content-fetch/Dockerfile b/packages/content-fetch/Dockerfile index b9bf619cc..9ca9f7dee 100644 --- a/packages/content-fetch/Dockerfile +++ b/packages/content-fetch/Dockerfile @@ -38,7 +38,6 @@ ADD /packages/content-fetch ./packages/content-fetch ADD /packages/content-handler ./packages/content-handler ADD /packages/puppeteer-parse ./packages/puppeteer-parse RUN yarn workspace @omnivore/content-handler build -RUN yarn workspace @omnivore/puppeteer-parse build # After building, fetch the production dependencies RUN rm -rf /app/packages/content-fetch/node_modules diff --git a/packages/content-fetch/Dockerfile-gcf b/packages/content-fetch/Dockerfile-gcf index dec0fa93a..8c355485f 100644 --- a/packages/content-fetch/Dockerfile-gcf +++ b/packages/content-fetch/Dockerfile-gcf @@ -39,7 +39,6 @@ ADD /packages/content-handler ./packages/content-handler ADD /packages/puppeteer-parse ./packages/puppeteer-parse ADD /packages/content-fetch ./packages/content-fetch RUN yarn workspace @omnivore/content-handler build -RUN yarn workspace @omnivore/puppeteer-parse build # After building, fetch the production dependencies RUN rm -rf /app/packages/content-fetch/node_modules diff --git a/packages/content-fetch/app.js b/packages/content-fetch/app.js index 0c2cc10fb..252671626 100644 --- a/packages/content-fetch/app.js +++ b/packages/content-fetch/app.js @@ -1,3 +1,4 @@ +require('dotenv').config(); const express = require('express'); const app = express(); diff --git a/packages/content-fetch/index.js b/packages/content-fetch/index.js index 2c18d99f8..a209413e3 100644 --- a/packages/content-fetch/index.js +++ b/packages/content-fetch/index.js @@ -4,84 +4,14 @@ /* eslint-disable @typescript-eslint/no-var-requires */ /* eslint-disable @typescript-eslint/no-require-imports */ require('dotenv').config(); -const { config, format, loggers, transports } = require('winston'); -const { LoggingWinston } = require('@google-cloud/logging-winston'); -const { DateTime } = require('luxon'); -const os = require('os'); const Sentry = require('@sentry/serverless'); -const { Storage } = require('@google-cloud/storage'); -const { fetchContent, getBrowserPromise, getUrl } = require("@omnivore/puppeteer-parse"); - -const storage = new Storage(); -const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; -const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined; +const { fetchContent, preview } = require("@omnivore/puppeteer-parse"); Sentry.GCPFunction.init({ dsn: process.env.SENTRY_DSN, tracesSampleRate: 0, }); -const filePath = `${os.tmpdir()}/previewImage.png`; - -const colors = { - emerg: 'inverse underline magenta', - alert: 'underline magenta', - crit: 'inverse underline red', // Any error that is forcing a shutdown of the service or application to prevent data loss. - error: 'underline red', // Any error which is fatal to the operation, but not the service or application - warning: 'underline yellow', // Anything that can potentially cause application oddities - notice: 'underline cyan', // Normal but significant condition - info: 'underline green', // Generally useful information to log - debug: 'underline gray', -}; - -const googleConfigs = { - level: 'info', - logName: 'logger', - levels: config.syslog.levels, - resource: { - labels: { - function_name: process.env.FUNCTION_TARGET, - project_id: process.env.GCP_PROJECT, - }, - type: 'cloud_function', - }, -}; - -function localConfig(id) { - return { - level: 'debug', - format: format.combine( - format.colorize({ all: true, colors }), - format(info => - Object.assign(info, { - timestamp: DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS), - }), - )(), - format.printf(info => { - // eslint-disable-next-line @typescript-eslint/no-unused-vars - const { timestamp, message, level, ...meta } = info; - - return `[${id}@${info.timestamp}] ${info.message}${ - Object.keys(meta).length ? '\n' + JSON.stringify(meta, null, 4) : '' - }`; - }), - ), - }; -} - -function buildLoggerTransport(id, options) { - return process.env.IS_LOCAL - ? new transports.Console(localConfig(id)) - : new LoggingWinston({ ...googleConfigs, ...{ logName: id }, ...options }); -} - -function buildLogger(id, options) { - return loggers.get(id, { - levels: config.syslog.levels, - transports: [buildLoggerTransport(id, options)], - }); -} - /** * Cloud Function entry point, HTTP trigger. * Loads the requested URL via Puppeteer, captures page content and sends it to backend @@ -100,122 +30,4 @@ exports.puppeteer = Sentry.GCPFunction.wrapHttpFunction(fetchContent); * * url - URL address of the page to open * @param {Object} res Cloud Function response context. */ -exports.preview = Sentry.GCPFunction.wrapHttpFunction(async (req, res) => { - const functionStartTime = Date.now(); - // Grabbing execution and trace ids to attach logs to the appropriate function call - const execution_id = req.get('function-execution-id'); - const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0]; - const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', { - trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`, - labels: { - execution_id: execution_id, - }, - }); - - if (!process.env.PREVIEW_IMAGE_BUCKET) { - logger.error(`PREVIEW_IMAGE_BUCKET not set`) - return res.sendStatus(500); - } - - const url = getUrl(req); - console.log('preview request url', url); - - const logRecord = { - url, - query: req.query, - origin: req.get('Origin'), - labels: { - source: 'publicImagePreview', - }, - }; - - logger.info(`Public preview image generation request`, logRecord); - - if (!url) { - logRecord.urlIsInvalid = true; - logger.error(`Valid URL to parse is not specified`, logRecord); - return res.sendStatus(400); - } - const { origin } = new URL(url); - if (!ALLOWED_ORIGINS.some(o => o === origin)) { - logRecord.forbiddenOrigin = true; - logger.error(`This origin is not allowed: ${origin}`, logRecord); - return res.sendStatus(400); - } - - const browser = await getBrowserPromise(process.env.PROXY_URL, process.env.CHROMIUM_PATH); - logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime }; - - const page = await browser.newPage(); - const pageLoadingStart = Date.now(); - const modifiedUrl = new URL(url); - modifiedUrl.searchParams.append('fontSize', 24); - modifiedUrl.searchParams.append('adjustAspectRatio', 1.91); - try { - await page.goto(modifiedUrl); - logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart }; - } catch (error) { - console.log('error going to page: ', modifiedUrl) - console.log(error) - throw error - } - - // We lookup the destination path from our own page content and avoid trusting any passed query params - // selector - CSS selector of the element to get screenshot of - const selector = decodeURIComponent( - await page.$eval( - "head > meta[name='omnivore:preview_image_selector']", - element => element.content, - ), - ); - if (!selector) { - logRecord.selectorIsInvalid = true; - logger.error(`Valid element selector is not specified`, logRecord); - await page.close(); - return res.sendStatus(400); - } - logRecord.selector = selector; - - // destination - destination pathname for the image to save with - const destination = decodeURIComponent( - await page.$eval( - "head > meta[name='omnivore:preview_image_destination']", - element => element.content, - ), - ); - if (!destination) { - logRecord.destinationIsInvalid = true; - logger.error(`Valid file destination is not specified`, logRecord); - await page.close(); - return res.sendStatus(400); - } - logRecord.destination = destination; - - const screenshotTakingStart = Date.now(); - try { - await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load - } catch (error) { - logRecord.elementNotFound = true; - logger.error(`Element is not presented on the page`, logRecord); - await page.close(); - return res.sendStatus(400); - } - const element = await page.$(selector); - await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer - logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart }; - - await page.close(); - - try { - const [file] = await previewBucket.upload(filePath, { - destination, - metadata: logRecord, - }); - logRecord.file = file.metadata; - } catch (e) { - console.log('error uploading to bucket, this is non-fatal', e) - } - - logger.info(`preview-image`, logRecord); - return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`); -}); +exports.preview = Sentry.GCPFunction.wrapHttpFunction(preview); diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index 45de11581..6facc37a9 100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -4,26 +4,21 @@ "description": "Service that fetches page content from a URL", "main": "index.js", "dependencies": { - "axios": "^0.27.2", "dotenv": "^8.2.0", "express": "^4.17.1", - "jsonwebtoken": "^8.5.1", - "linkedom": "^0.14.9", - "luxon": "^2.3.1", - "puppeteer-core": "^16.1.0", - "underscore": "^1.13.4", "@omnivore/puppeteer-parse": "^1.0.0", - "@google-cloud/logging-winston": "^4.1.2", - "@google-cloud/storage": "^5.18.1", - "@sentry/serverless": "^6.13.3", - "winston": "^3.3.3" + "@sentry/serverless": "^6.13.3" }, "devDependencies": { - "@google-cloud/functions-framework": "^3.0.0" + "@google-cloud/functions-framework": "^3.0.0", + "chai": "^4.3.6", + "chai-string": "^1.5.0", + "mocha": "^10.0.0" }, "scripts": { "start": "node app.js", "start_gcf": "npx functions-framework --port=9090 --target=puppeteer", - "start_preview": "npx functions-framework --target=preview" + "start_preview": "npx functions-framework --target=preview", + "test": "mocha test/*.js" } } diff --git a/packages/content-fetch/test/babel-register.js b/packages/content-fetch/test/babel-register.js deleted file mode 100644 index a6f65f60a..000000000 --- a/packages/content-fetch/test/babel-register.js +++ /dev/null @@ -1,3 +0,0 @@ -const register = require('@babel/register').default - -register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] }) diff --git a/packages/content-fetch/test/stub.test.js b/packages/content-fetch/test/stub.test.js new file mode 100644 index 000000000..317d21b52 --- /dev/null +++ b/packages/content-fetch/test/stub.test.js @@ -0,0 +1,9 @@ +const chai = require("chai"); + +const expect = chai.expect; + +describe('Stub test', () => { + it('should pass', () => { + expect(true).to.be.true + }) +}) diff --git a/packages/content-fetch/test/stub.test.ts b/packages/content-fetch/test/stub.test.ts deleted file mode 100644 index 173ca4917..000000000 --- a/packages/content-fetch/test/stub.test.ts +++ /dev/null @@ -1,13 +0,0 @@ -import 'mocha' -import * as chai from 'chai' -import { expect } from 'chai' -import 'chai/register-should' -import chaiString from 'chai-string' - -chai.use(chaiString) - -describe('Stub test', () => { - it('should pass', () => { - expect(true).to.be.true - }) -}) diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 9dbe583c8..0624eb8dd 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -3,7 +3,6 @@ /* eslint-disable @typescript-eslint/explicit-function-return-type */ /* eslint-disable @typescript-eslint/no-var-requires */ /* eslint-disable @typescript-eslint/no-require-imports */ -require('dotenv').config(); const Url = require('url'); // const puppeteer = require('puppeteer-extra'); const axios = require('axios'); @@ -27,6 +26,67 @@ const storage = new Storage(); const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined; +const filePath = `${os.tmpdir()}/previewImage.png`; + +const colors = { + emerg: 'inverse underline magenta', + alert: 'underline magenta', + crit: 'inverse underline red', // Any error that is forcing a shutdown of the service or application to prevent data loss. + error: 'underline red', // Any error which is fatal to the operation, but not the service or application + warning: 'underline yellow', // Anything that can potentially cause application oddities + notice: 'underline cyan', // Normal but significant condition + info: 'underline green', // Generally useful information to log + debug: 'underline gray', +}; + +const googleConfigs = { + level: 'info', + logName: 'logger', + levels: config.syslog.levels, + resource: { + labels: { + function_name: process.env.FUNCTION_TARGET, + project_id: process.env.GCP_PROJECT, + }, + type: 'cloud_function', + }, +}; + +function localConfig(id) { + return { + level: 'debug', + format: format.combine( + format.colorize({ all: true, colors }), + format(info => + Object.assign(info, { + timestamp: DateTime.local().toLocaleString(DateTime.TIME_24_WITH_SECONDS), + }), + )(), + format.printf(info => { + // eslint-disable-next-line @typescript-eslint/no-unused-vars + const { timestamp, message, level, ...meta } = info; + + return `[${id}@${info.timestamp}] ${info.message}${ + Object.keys(meta).length ? '\n' + JSON.stringify(meta, null, 4) : '' + }`; + }), + ), + }; +} + +function buildLoggerTransport(id, options) { + return process.env.IS_LOCAL + ? new transports.Console(localConfig(id)) + : new LoggingWinston({ ...googleConfigs, ...{ logName: id }, ...options }); +} + +function buildLogger(id, options) { + return loggers.get(id, { + levels: config.syslog.levels, + transports: [buildLoggerTransport(id, options)], + }); +} + const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' @@ -208,7 +268,16 @@ const saveUploadedPdf = async (userId, url, uploadFileId, articleSavingRequestId }; async function fetchContent(req, res) { - functionStartTime = Date.now(); + const functionStartTime = Date.now(); + // Grabbing execution and trace ids to attach logs to the appropriate function call + const execution_id = req.get('function-execution-id'); + const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0]; + const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', { + trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`, + labels: { + execution_id: execution_id, + }, + }); let url = getUrl(req); const userId = req.body.userId || req.query.userId; @@ -616,9 +685,128 @@ async function retrieveHtml(page) { return { domContent, title }; } +async function preview(req, res) { + const functionStartTime = Date.now(); + // Grabbing execution and trace ids to attach logs to the appropriate function call + const execution_id = req.get('function-execution-id'); + const traceId = (req.get('x-cloud-trace-context') || '').split('/')[0]; + const logger = buildLogger('cloudfunctions.googleapis.com%2Fcloud-functions', { + trace: `projects/${process.env.GCLOUD_PROJECT}/traces/${traceId}`, + labels: { + execution_id: execution_id, + }, + }); + + if (!process.env.PREVIEW_IMAGE_BUCKET) { + logger.error(`PREVIEW_IMAGE_BUCKET not set`) + return res.sendStatus(500); + } + + const url = getUrl(req); + console.log('preview request url', url); + + const logRecord = { + url, + query: req.query, + origin: req.get('Origin'), + labels: { + source: 'publicImagePreview', + }, + }; + + logger.info(`Public preview image generation request`, logRecord); + + if (!url) { + logRecord.urlIsInvalid = true; + logger.error(`Valid URL to parse is not specified`, logRecord); + return res.sendStatus(400); + } + const { origin } = new URL(url); + if (!ALLOWED_ORIGINS.some(o => o === origin)) { + logRecord.forbiddenOrigin = true; + logger.error(`This origin is not allowed: ${origin}`, logRecord); + return res.sendStatus(400); + } + + const browser = await getBrowserPromise(process.env.PROXY_URL, process.env.CHROMIUM_PATH); + logRecord.timing = { ...logRecord.timing, browserOpened: Date.now() - functionStartTime }; + + const page = await browser.newPage(); + const pageLoadingStart = Date.now(); + const modifiedUrl = new URL(url); + modifiedUrl.searchParams.append('fontSize', 24); + modifiedUrl.searchParams.append('adjustAspectRatio', 1.91); + try { + await page.goto(modifiedUrl); + logRecord.timing = { ...logRecord.timing, pageLoaded: Date.now() - pageLoadingStart }; + } catch (error) { + console.log('error going to page: ', modifiedUrl) + console.log(error) + throw error + } + + // We lookup the destination path from our own page content and avoid trusting any passed query params + // selector - CSS selector of the element to get screenshot of + const selector = decodeURIComponent( + await page.$eval( + "head > meta[name='omnivore:preview_image_selector']", + element => element.content, + ), + ); + if (!selector) { + logRecord.selectorIsInvalid = true; + logger.error(`Valid element selector is not specified`, logRecord); + await page.close(); + return res.sendStatus(400); + } + logRecord.selector = selector; + + // destination - destination pathname for the image to save with + const destination = decodeURIComponent( + await page.$eval( + "head > meta[name='omnivore:preview_image_destination']", + element => element.content, + ), + ); + if (!destination) { + logRecord.destinationIsInvalid = true; + logger.error(`Valid file destination is not specified`, logRecord); + await page.close(); + return res.sendStatus(400); + } + logRecord.destination = destination; + + const screenshotTakingStart = Date.now(); + try { + await page.waitForSelector(selector, { timeout: 3000 }); // wait for the selector to load + } catch (error) { + logRecord.elementNotFound = true; + logger.error(`Element is not presented on the page`, logRecord); + await page.close(); + return res.sendStatus(400); + } + const element = await page.$(selector); + await element.screenshot({ path: filePath }); // take screenshot of the element in puppeteer + logRecord.timing = { ...logRecord.timing, screenshotTaken: Date.now() - screenshotTakingStart }; + + await page.close(); + + try { + const [file] = await previewBucket.upload(filePath, { + destination, + metadata: logRecord, + }); + logRecord.file = file.metadata; + } catch (e) { + console.log('error uploading to bucket, this is non-fatal', e) + } + + logger.info(`preview-image`, logRecord); + return res.redirect(`${process.env.PREVIEW_IMAGE_CDN_ORIGIN}/${destination}`); +} + module.exports = { fetchContent, - getBrowserPromise, - getUrl, + preview, }; diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index cc7755c0b..172eb4d99 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -8,7 +8,6 @@ "@google-cloud/storage": "^5.18.1", "@omnivore/content-handler": "1.0.0", "axios": "^0.27.2", - "dotenv": "^8.2.0", "jsonwebtoken": "^8.5.1", "linkedom": "^0.14.9", "luxon": "^2.3.1", diff --git a/yarn.lock b/yarn.lock index b1b9a214d..3f3923d58 100644 --- a/yarn.lock +++ b/yarn.lock @@ -2444,7 +2444,7 @@ google-gax "^2.24.1" protobufjs "^6.8.6" -"@google-cloud/functions-framework@3.1.2", "@google-cloud/functions-framework@^3.1.2": +"@google-cloud/functions-framework@3.1.2", "@google-cloud/functions-framework@^3.0.0": version "3.1.2" resolved "https://registry.yarnpkg.com/@google-cloud/functions-framework/-/functions-framework-3.1.2.tgz#2cd92ce4307bf7f32555d028dca22e398473b410" integrity sha512-pYvEH65/Rqh1JNPdcBmorcV7Xoom2/iOSmbtYza8msro7Inl+qOYxbyMiQfySD2gwAyn38WyWPRqsDRcf/BFLg== @@ -10907,13 +10907,6 @@ chownr@^2.0.0: resolved "https://registry.yarnpkg.com/chownr/-/chownr-2.0.0.tgz#15bfbe53d2eab4cf70f18a8cd68ebe5b3cb1dece" integrity sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ== -chrome-aws-lambda@^10.1.0: - version "10.1.0" - resolved "https://registry.yarnpkg.com/chrome-aws-lambda/-/chrome-aws-lambda-10.1.0.tgz#ac43b4cdfc1fbb2275c62effada560858099501e" - integrity sha512-NZQVf+J4kqG4sVhRm3WNmOfzY0OtTSm+S8rg77pwePa9RCYHzhnzRs8YvNI6L9tALIW6RpmefWiPURt3vURXcw== - dependencies: - lambdafs "^2.0.3" - chrome-trace-event@^1.0.2: version "1.0.3" resolved "https://registry.yarnpkg.com/chrome-trace-event/-/chrome-trace-event-1.0.3.tgz#1015eced4741e15d06664a957dbbf50d041e26ac" @@ -17677,13 +17670,6 @@ kuler@^2.0.0: resolved "https://registry.yarnpkg.com/kuler/-/kuler-2.0.0.tgz#e2c570a3800388fb44407e851531c1d670b061b3" integrity sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A== -lambdafs@^2.0.3: - version "2.1.1" - resolved "https://registry.yarnpkg.com/lambdafs/-/lambdafs-2.1.1.tgz#4bf8d3037b6c61bbb4a22ab05c73ee47964c25ed" - integrity sha512-x5k8JcoJWkWLvCVBzrl4pzvkEHSgSBqFjg3Dpsc4AcTMq7oUMym4cL/gRTZ6VM4mUMY+M0dIbQ+V1c1tsqqanQ== - dependencies: - tar-fs "^2.1.1" - language-subtag-registry@~0.3.2: version "0.3.21" resolved "https://registry.yarnpkg.com/language-subtag-registry/-/language-subtag-registry-0.3.21.tgz#04ac218bea46f04cb039084602c6da9e788dd45a" @@ -23988,7 +23974,7 @@ tar-fs@2.0.0: pump "^3.0.0" tar-stream "^2.0.0" -tar-fs@2.1.1, tar-fs@^2.1.1: +tar-fs@2.1.1: version "2.1.1" resolved "https://registry.yarnpkg.com/tar-fs/-/tar-fs-2.1.1.tgz#489a15ab85f1f0befabb370b7de4f9eb5cbe8784" integrity sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==