Remove content-fetch-gcf and create a Dockerfile for the cloud function

This commit is contained in:
Hongbo Wu
2022-10-07 18:42:01 +08:00
parent 623bb8780c
commit 00fed8a0fb
15 changed files with 82 additions and 187 deletions

View File

@ -1 +0,0 @@
previewImage.*

View File

@ -1,112 +0,0 @@
# FROM node:14-slim
# # Taken from pu
# # Install latest chrome dev package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
# # Note: this installs the necessary libs to make the bundled version of Chromium that Puppeteer
# # installs, work.
# RUN apt-get update \
# && apt-get install -y wget gnupg \
# && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
# && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
# && apt-get update \
# && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
# --no-install-recommends \
# && rm -rf /var/lib/apt/lists/*
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
# ENV CHROMIUM_PATH "/usr/bin/google-chrome-stable"
# ------------------------
# FROM --platform=linux/arm64 node:14.18
# RUN apt-get update \
# && apt-get install -y chromium \
# && apt-get install -y ca-certificates \
# fonts-liberation \
# libappindicator3-1 \
# libasound2 \
# libatk-bridge2.0-0 \
# libatk1.0-0 \
# libc6 \
# libcairo2 \
# libcups2 \
# libdbus-1-3 \
# libexpat1 \
# libfontconfig1 \
# libgbm1 \
# libgcc1 \
# libglib2.0-0 \
# libgtk-3-0 \
# libnspr4 \
# libnss3 \
# libpango-1.0-0 \
# libpangocairo-1.0-0 \
# libstdc++6 \
# libx11-6 \
# libx11-xcb1 \
# libxcb1 \
# libxcomposite1 \
# libxcursor1 \
# libxdamage1 \
# libxext6 \
# libxfixes3 \
# libxi6 \
# libxrandr2 \
# libxrender1 \
# libxss1 \
# libxtst6 \
# lsb-release \
# wget \
# xdg-utils
# Image for the @omnivore/content-fetch-gcf workspace: Alpine-based Node.js
# with a system Chromium package that Puppeteer is pointed at.
FROM node:14.18-alpine

# Install Chromium plus the libraries and fonts it is paired with here.
# NOTE(review): the original comment claims this resolves to Chromium 92 —
# confirm against the Alpine release behind this base tag.
RUN apk add --no-cache \
      chromium \
      nss \
      freetype \
      harfbuzz \
      ca-certificates \
      ttf-freefont \
      nodejs \
      yarn

# Skip Puppeteer's own Chromium download and point both Puppeteer and the
# application (CHROMIUM_PATH) at the packaged browser. These must be set
# before any yarn step that installs Puppeteer.
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
    PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser \
    CHROMIUM_PATH=/usr/bin/chromium-browser \
    LAUNCH_HEADLESS=true

# Puppeteer v10.0.0 works with Chromium 92.
RUN yarn add puppeteer@10.0.0

# Create an unprivileged user and hand it its home directory and /app.
# The image still runs as root because the USER switch at the bottom is
# disabled.
# NOTE(review): re-enable USER or confirm Chromium is launched with
# --no-sandbox.
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
    && mkdir -p /home/pptruser/Downloads /app \
    && chown -R pptruser:pptruser /home/pptruser \
    && chown -R pptruser:pptruser /app

WORKDIR /app

# Copy only the manifests first so the dependency layer is reused until
# they change.
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/content-fetch-gcf/package.json ./packages/content-fetch-gcf/package.json

RUN yarn install --pure-lockfile

# COPY rather than ADD: this is a plain local directory, so none of ADD's
# extra behaviours (archive extraction, URL fetch) are wanted.
COPY /packages/content-fetch-gcf ./packages/content-fetch-gcf

# The workspace "start" script launches functions-framework with --port=9090,
# so that is the port the container actually listens on.
EXPOSE 9090

# USER pptruser

ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch-gcf", "start"]

View File

@ -1,25 +0,0 @@
{
"name": "@omnivore/content-fetch-gcf",
"version": "1.0.0",
"description": "Google Cloud Function that accepts URL of the article and parses its content",
"main": "index.js",
"dependencies": {
"@google-cloud/logging-winston": "^4.1.2",
"@google-cloud/storage": "^5.18.1",
"@sentry/serverless": "^6.13.3",
"axios": "^0.26.0",
"dotenv": "^8.2.0",
"jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"winston": "^3.3.3",
"@omnivore/puppeteer-parse": "^1.0.0"
},
"devDependencies": {
"@google-cloud/functions-framework": "^3.0.0"
},
"scripts": {
"start": "npx functions-framework --port=9090 --target=puppeteer",
"start_preview": "npx functions-framework --target=preview"
}
}

View File

@ -6,3 +6,13 @@ REST_BACKEND_ENDPOINT=http://localhost:4000/api
# set for local development # set for local development
IS_LOCAL=true IS_LOCAL=true
VERIFICATION_TOKEN='123456'
CHROMIUM_PATH=/opt/homebrew/bin/chromium
PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
LAUNCH_HEADLESS=true
TWITTER_BEARER_TOKEN=token
PORT=9090

View File

@ -30,12 +30,15 @@ COPY .prettierrc .
COPY .eslintrc . COPY .eslintrc .
COPY /packages/content-handler/package.json ./packages/content-handler/package.json COPY /packages/content-handler/package.json ./packages/content-handler/package.json
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
RUN yarn install --pure-lockfile RUN yarn install --pure-lockfile
ADD /packages/content-fetch ./packages/content-fetch ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/content-handler ./packages/content-handler ADD /packages/content-handler ./packages/content-handler
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
RUN yarn workspace @omnivore/content-handler build RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/puppeteer-parse build
# After building, fetch the production dependencies # After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules RUN rm -rf /app/packages/content-fetch/node_modules

View File

@ -0,0 +1,52 @@
# Image that runs the @omnivore/content-fetch workspace through its
# "start_gcf" script, on Alpine-based Node.js with a system Chromium package.
FROM node:14.18-alpine

# Install Chromium plus the libraries and fonts it is paired with here.
# NOTE(review): the original comment claims this resolves to Chromium 92 —
# confirm against the Alpine release behind this base tag.
RUN apk add --no-cache \
      chromium \
      nss \
      freetype \
      harfbuzz \
      ca-certificates \
      ttf-freefont \
      nodejs \
      yarn

# Create an unprivileged user and hand it its home directory and /app.
# The image still runs as root because the USER switch at the bottom is
# disabled.
# NOTE(review): re-enable USER or confirm Chromium is launched with
# --no-sandbox.
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
    && mkdir -p /home/pptruser/Downloads /app \
    && chown -R pptruser:pptruser /home/pptruser \
    && chown -R pptruser:pptruser /app

WORKDIR /app

# Runtime configuration: browser binary location, headless flag and the
# listening port (kept in sync with EXPOSE below).
ENV CHROMIUM_PATH=/usr/bin/chromium-browser \
    LAUNCH_HEADLESS=true \
    PORT=9090

# Copy only the manifests first so the dependency layer is reused until
# they change.
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json

# Full install (including dev dependencies) needed for the builds below.
RUN yarn install --pure-lockfile

# COPY rather than ADD: these are plain local directories, so none of ADD's
# extra behaviours (archive extraction, URL fetch) are wanted.
COPY /packages/content-handler ./packages/content-handler
COPY /packages/puppeteer-parse ./packages/puppeteer-parse
COPY /packages/content-fetch ./packages/content-fetch

RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/puppeteer-parse build

# After building, drop the build-time modules and reinstall production
# dependencies only. Done in a single RUN so the removal and the reinstall
# land in the same layer.
# NOTE(review): "start_gcf" runs `npx functions-framework`, which is declared
# under devDependencies — verify it is still resolvable after this
# --production install.
RUN rm -rf /app/packages/content-fetch/node_modules /app/node_modules \
    && yarn install --pure-lockfile --production

EXPOSE 9090

# USER pptruser

ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch", "start_gcf"]

View File

@ -4,7 +4,6 @@
"description": "Service that fetches page content from a URL", "description": "Service that fetches page content from a URL",
"main": "index.js", "main": "index.js",
"dependencies": { "dependencies": {
"@omnivore/content-handler": "1.0.0",
"axios": "^0.27.2", "axios": "^0.27.2",
"dotenv": "^8.2.0", "dotenv": "^8.2.0",
"express": "^4.17.1", "express": "^4.17.1",
@ -13,9 +12,18 @@
"luxon": "^2.3.1", "luxon": "^2.3.1",
"puppeteer-core": "^16.1.0", "puppeteer-core": "^16.1.0",
"underscore": "^1.13.4", "underscore": "^1.13.4",
"@omnivore/puppeteer-parse": "^1.0.0" "@omnivore/puppeteer-parse": "^1.0.0",
"@google-cloud/logging-winston": "^4.1.2",
"@google-cloud/storage": "^5.18.1",
"@sentry/serverless": "^6.13.3",
"winston": "^3.3.3"
},
"devDependencies": {
"@google-cloud/functions-framework": "^3.0.0"
}, },
"scripts": { "scripts": {
"start": "node app.js" "start": "node app.js",
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
"start_preview": "npx functions-framework --target=preview"
} }
} }

View File

@ -1,4 +0,0 @@
node_modules
.env*
Dockerfile
.dockerignore

View File

@ -5,7 +5,7 @@
/* eslint-disable @typescript-eslint/no-require-imports */ /* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config(); require('dotenv').config();
const Url = require('url'); const Url = require('url');
const puppeteer = require('puppeteer-extra'); // const puppeteer = require('puppeteer-extra');
const axios = require('axios'); const axios = require('axios');
const jwt = require('jsonwebtoken'); const jwt = require('jsonwebtoken');
const { promisify } = require('util'); const { promisify } = require('util');
@ -14,10 +14,8 @@ const { config, format, loggers, transports } = require('winston');
const { LoggingWinston } = require('@google-cloud/logging-winston'); const { LoggingWinston } = require('@google-cloud/logging-winston');
const { DateTime } = require('luxon'); const { DateTime } = require('luxon');
const os = require('os'); const os = require('os');
const Sentry = require('@sentry/serverless');
const { Storage } = require('@google-cloud/storage'); const { Storage } = require('@google-cloud/storage');
const { parseHTML } = require('linkedom');
const chromium = require('chrome-aws-lambda');
const puppeteer = require('puppeteer-core'); const puppeteer = require('puppeteer-core');
const { preHandleContent } = require("@omnivore/content-handler"); const { preHandleContent } = require("@omnivore/content-handler");
@ -29,11 +27,6 @@ const storage = new Storage();
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined; const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
Sentry.GCPFunction.init({
dsn: process.env.SENTRY_DSN,
tracesSampleRate: 0,
});
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
@ -44,12 +37,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];
const path = require("path"); const path = require("path");
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf']; const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
const { parseHTML } = require('linkedom');
// Add stealth plugin to hide puppeteer usage
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
puppeteer.use(StealthPlugin());
const userAgentForUrl = (url) => { const userAgentForUrl = (url) => {
try { try {
const u = new URL(url); const u = new URL(url);
@ -94,8 +81,7 @@ const enableJavascriptForUrl = (url) => {
}; };
// launch Puppeteer // launch Puppeteer
const getBrowserPromise = (async (proxyUrl, chromiumPath) => { const getBrowserPromise = (async () => {
console.log("starting with proxy url", proxyUrl)
return puppeteer.launch({ return puppeteer.launch({
args: [ args: [
'--allow-running-insecure-content', '--allow-running-insecure-content',
@ -122,8 +108,8 @@ const getBrowserPromise = (async (proxyUrl, chromiumPath) => {
'--window-size=1920,1080', '--window-size=1920,1080',
].filter((item) => !!item), ].filter((item) => !!item),
defaultViewport: { height: 1080, width: 1920 }, defaultViewport: { height: 1080, width: 1920 },
executablePath: chromiumPath, executablePath: process.env.CHROMIUM_PATH,
headless: true, headless: !!process.env.HEADLESS,
timeout: 120000, // 2 minutes timeout: 120000, // 2 minutes
}); });
})(); })();
@ -237,7 +223,7 @@ async function fetchContent(req, res) {
}, },
}; };
console.log(`Article parsing request`, logRecord); logger.info(`Article parsing request`, logRecord);
if (!url) { if (!url) {
logRecord.urlIsInvalid = true; logRecord.urlIsInvalid = true;

View File

@ -4,13 +4,10 @@
"description": "Accepts URL of the article and parses its content", "description": "Accepts URL of the article and parses its content",
"main": "index.js", "main": "index.js",
"dependencies": { "dependencies": {
"@google-cloud/functions-framework": "^3.1.2",
"@google-cloud/logging-winston": "^5.1.1", "@google-cloud/logging-winston": "^5.1.1",
"@google-cloud/storage": "^5.18.1", "@google-cloud/storage": "^5.18.1",
"@omnivore/content-handler": "1.0.0", "@omnivore/content-handler": "1.0.0",
"@sentry/serverless": "^6.13.3",
"axios": "^0.27.2", "axios": "^0.27.2",
"chrome-aws-lambda": "^10.1.0",
"dotenv": "^8.2.0", "dotenv": "^8.2.0",
"jsonwebtoken": "^8.5.1", "jsonwebtoken": "^8.5.1",
"linkedom": "^0.14.9", "linkedom": "^0.14.9",
@ -18,8 +15,5 @@
"puppeteer-core": "^16.1.0", "puppeteer-core": "^16.1.0",
"underscore": "^1.13.4", "underscore": "^1.13.4",
"winston": "^3.3.3" "winston": "^3.3.3"
},
"scripts": {
"test": "yarn mocha"
} }
} }

View File

@ -1,3 +0,0 @@
// Bootstrap file: call @babel/register's default export with the file
// extensions it should handle.
// NOTE(review): presumably loaded by mocha via a --require option — confirm
// against the test runner configuration.
const { default: register } = require('@babel/register')
const extensions = ['.ts', '.tsx', '.js', '.jsx']
register({ extensions })

View File

@ -1,13 +0,0 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import 'chai/register-should'
import chaiString from 'chai-string'
// Register the chai-string plugin on the shared chai instance.
chai.use(chaiString)

// Placeholder suite with a single always-true assertion.
describe('Stub test', function () {
  it('should pass', function () {
    expect(true).to.be.true
  })
})