Remove content-fetch-gcf and create a Dockerfile for the cloud function
This commit is contained in:
1
packages/content-fetch-gcf/.gitignore
vendored
1
packages/content-fetch-gcf/.gitignore
vendored
@ -1 +0,0 @@
|
|||||||
previewImage.*
|
|
||||||
@ -1,112 +0,0 @@
|
|||||||
# FROM node:14-slim
|
|
||||||
|
|
||||||
# # Taken from pu
|
|
||||||
|
|
||||||
# # Install latest chrome dev package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
|
|
||||||
# # Note: this installs the necessary libs to make the bundled version of Chromium that Puppeteer
|
|
||||||
# # installs, work.
|
|
||||||
# RUN apt-get update \
|
|
||||||
# && apt-get install -y wget gnupg \
|
|
||||||
# && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
|
||||||
# && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
|
|
||||||
# && apt-get update \
|
|
||||||
# && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
|
|
||||||
# --no-install-recommends \
|
|
||||||
# && rm -rf /var/lib/apt/lists/*
|
|
||||||
|
|
||||||
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
|
|
||||||
# ENV CHROMIUM_PATH "/usr/bin/google-chrome-stable"
|
|
||||||
|
|
||||||
# ------------------------
|
|
||||||
|
|
||||||
# FROM --platform=linux/arm64 node:14.18
|
|
||||||
|
|
||||||
# RUN apt-get update \
|
|
||||||
# && apt-get install -y chromium \
|
|
||||||
# && apt-get install -y ca-certificates \
|
|
||||||
# fonts-liberation \
|
|
||||||
# libappindicator3-1 \
|
|
||||||
# libasound2 \
|
|
||||||
# libatk-bridge2.0-0 \
|
|
||||||
# libatk1.0-0 \
|
|
||||||
# libc6 \
|
|
||||||
# libcairo2 \
|
|
||||||
# libcups2 \
|
|
||||||
# libdbus-1-3 \
|
|
||||||
# libexpat1 \
|
|
||||||
# libfontconfig1 \
|
|
||||||
# libgbm1 \
|
|
||||||
# libgcc1 \
|
|
||||||
# libglib2.0-0 \
|
|
||||||
# libgtk-3-0 \
|
|
||||||
# libnspr4 \
|
|
||||||
# libnss3 \
|
|
||||||
# libpango-1.0-0 \
|
|
||||||
# libpangocairo-1.0-0 \
|
|
||||||
# libstdc++6 \
|
|
||||||
# libx11-6 \
|
|
||||||
# libx11-xcb1 \
|
|
||||||
# libxcb1 \
|
|
||||||
# libxcomposite1 \
|
|
||||||
# libxcursor1 \
|
|
||||||
# libxdamage1 \
|
|
||||||
# libxext6 \
|
|
||||||
# libxfixes3 \
|
|
||||||
# libxi6 \
|
|
||||||
# libxrandr2 \
|
|
||||||
# libxrender1 \
|
|
||||||
# libxss1 \
|
|
||||||
# libxtst6 \
|
|
||||||
# lsb-release \
|
|
||||||
# wget \
|
|
||||||
# xdg-utils
|
|
||||||
|
|
||||||
FROM node:14.18-alpine
|
|
||||||
|
|
||||||
# Installs latest Chromium (92) package.
|
|
||||||
RUN apk add --no-cache \
|
|
||||||
chromium \
|
|
||||||
nss \
|
|
||||||
freetype \
|
|
||||||
harfbuzz \
|
|
||||||
ca-certificates \
|
|
||||||
ttf-freefont \
|
|
||||||
nodejs \
|
|
||||||
yarn
|
|
||||||
|
|
||||||
# Tell Puppeteer to skip installing Chrome. We'll be using the installed package.
|
|
||||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
|
|
||||||
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
|
|
||||||
|
|
||||||
# Puppeteer v10.0.0 works with Chromium 92.
|
|
||||||
RUN yarn add puppeteer@10.0.0
|
|
||||||
|
|
||||||
# Add user so we don't need --no-sandbox.
|
|
||||||
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
|
|
||||||
&& mkdir -p /home/pptruser/Downloads /app \
|
|
||||||
&& chown -R pptruser:pptruser /home/pptruser \
|
|
||||||
&& chown -R pptruser:pptruser /app
|
|
||||||
|
|
||||||
# Run everything after as non-privileged user.
|
|
||||||
WORKDIR /app
|
|
||||||
|
|
||||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
|
|
||||||
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
|
||||||
ENV LAUNCH_HEADLESS=true
|
|
||||||
|
|
||||||
COPY package.json .
|
|
||||||
COPY yarn.lock .
|
|
||||||
COPY tsconfig.json .
|
|
||||||
COPY .prettierrc .
|
|
||||||
COPY .eslintrc .
|
|
||||||
|
|
||||||
COPY /packages/content-fetch-gcf/package.json ./packages/content-fetch-gcf/package.json
|
|
||||||
|
|
||||||
RUN yarn install --pure-lockfile
|
|
||||||
|
|
||||||
ADD /packages/content-fetch-gcf ./packages/content-fetch-gcf
|
|
||||||
|
|
||||||
EXPOSE 8080
|
|
||||||
|
|
||||||
# USER pptruser
|
|
||||||
ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch-gcf", "start"]
|
|
||||||
@ -1,25 +0,0 @@
|
|||||||
{
|
|
||||||
"name": "@omnivore/content-fetch-gcf",
|
|
||||||
"version": "1.0.0",
|
|
||||||
"description": "Google Cloud Function that accepts URL of the article and parses its content",
|
|
||||||
"main": "index.js",
|
|
||||||
"dependencies": {
|
|
||||||
"@google-cloud/logging-winston": "^4.1.2",
|
|
||||||
"@google-cloud/storage": "^5.18.1",
|
|
||||||
"@sentry/serverless": "^6.13.3",
|
|
||||||
"axios": "^0.26.0",
|
|
||||||
"dotenv": "^8.2.0",
|
|
||||||
"jsonwebtoken": "^8.5.1",
|
|
||||||
"linkedom": "^0.14.9",
|
|
||||||
"luxon": "^2.3.1",
|
|
||||||
"winston": "^3.3.3",
|
|
||||||
"@omnivore/puppeteer-parse": "^1.0.0"
|
|
||||||
},
|
|
||||||
"devDependencies": {
|
|
||||||
"@google-cloud/functions-framework": "^3.0.0"
|
|
||||||
},
|
|
||||||
"scripts": {
|
|
||||||
"start": "npx functions-framework --port=9090 --target=puppeteer",
|
|
||||||
"start_preview": "npx functions-framework --target=preview"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
@ -6,3 +6,13 @@ REST_BACKEND_ENDPOINT=http://localhost:4000/api
|
|||||||
|
|
||||||
# set for local development
|
# set for local development
|
||||||
IS_LOCAL=true
|
IS_LOCAL=true
|
||||||
|
|
||||||
|
VERIFICATION_TOKEN='123456'
|
||||||
|
|
||||||
|
CHROMIUM_PATH=/opt/homebrew/bin/chromium
|
||||||
|
PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
|
||||||
|
LAUNCH_HEADLESS=true
|
||||||
|
|
||||||
|
TWITTER_BEARER_TOKEN=token
|
||||||
|
|
||||||
|
PORT=9090
|
||||||
@ -30,12 +30,15 @@ COPY .prettierrc .
|
|||||||
COPY .eslintrc .
|
COPY .eslintrc .
|
||||||
|
|
||||||
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
|
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
|
||||||
|
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
|
||||||
|
|
||||||
RUN yarn install --pure-lockfile
|
RUN yarn install --pure-lockfile
|
||||||
|
|
||||||
ADD /packages/content-fetch ./packages/content-fetch
|
ADD /packages/content-fetch ./packages/content-fetch
|
||||||
ADD /packages/content-handler ./packages/content-handler
|
ADD /packages/content-handler ./packages/content-handler
|
||||||
|
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
|
||||||
RUN yarn workspace @omnivore/content-handler build
|
RUN yarn workspace @omnivore/content-handler build
|
||||||
|
RUN yarn workspace @omnivore/puppeteer-parse build
|
||||||
|
|
||||||
# After building, fetch the production dependencies
|
# After building, fetch the production dependencies
|
||||||
RUN rm -rf /app/packages/content-fetch/node_modules
|
RUN rm -rf /app/packages/content-fetch/node_modules
|
||||||
|
|||||||
52
packages/content-fetch/Dockerfile-gcf
Normal file
52
packages/content-fetch/Dockerfile-gcf
Normal file
@ -0,0 +1,52 @@
|
|||||||
|
FROM node:14.18-alpine
|
||||||
|
|
||||||
|
# Installs latest Chromium (92) package.
|
||||||
|
RUN apk add --no-cache \
|
||||||
|
chromium \
|
||||||
|
nss \
|
||||||
|
freetype \
|
||||||
|
harfbuzz \
|
||||||
|
ca-certificates \
|
||||||
|
ttf-freefont \
|
||||||
|
nodejs \
|
||||||
|
yarn
|
||||||
|
|
||||||
|
# Add user so we don't need --no-sandbox.
|
||||||
|
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
|
||||||
|
&& mkdir -p /home/pptruser/Downloads /app \
|
||||||
|
&& chown -R pptruser:pptruser /home/pptruser \
|
||||||
|
&& chown -R pptruser:pptruser /app
|
||||||
|
|
||||||
|
# Run everything after as non-privileged user.
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
||||||
|
ENV LAUNCH_HEADLESS=true
|
||||||
|
ENV PORT 9090
|
||||||
|
|
||||||
|
COPY package.json .
|
||||||
|
COPY yarn.lock .
|
||||||
|
COPY tsconfig.json .
|
||||||
|
COPY .prettierrc .
|
||||||
|
COPY .eslintrc .
|
||||||
|
|
||||||
|
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
|
||||||
|
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
|
||||||
|
|
||||||
|
RUN yarn install --pure-lockfile
|
||||||
|
|
||||||
|
ADD /packages/content-handler ./packages/content-handler
|
||||||
|
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
|
||||||
|
ADD /packages/content-fetch ./packages/content-fetch
|
||||||
|
RUN yarn workspace @omnivore/content-handler build
|
||||||
|
RUN yarn workspace @omnivore/puppeteer-parse build
|
||||||
|
|
||||||
|
# After building, fetch the production dependencies
|
||||||
|
RUN rm -rf /app/packages/content-fetch/node_modules
|
||||||
|
RUN rm -rf /app/node_modules
|
||||||
|
RUN yarn install --pure-lockfile --production
|
||||||
|
|
||||||
|
EXPOSE 9090
|
||||||
|
|
||||||
|
# USER pptruser
|
||||||
|
ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch", "start_gcf"]
|
||||||
@ -4,7 +4,6 @@
|
|||||||
"description": "Service that fetches page content from a URL",
|
"description": "Service that fetches page content from a URL",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@omnivore/content-handler": "1.0.0",
|
|
||||||
"axios": "^0.27.2",
|
"axios": "^0.27.2",
|
||||||
"dotenv": "^8.2.0",
|
"dotenv": "^8.2.0",
|
||||||
"express": "^4.17.1",
|
"express": "^4.17.1",
|
||||||
@ -13,9 +12,18 @@
|
|||||||
"luxon": "^2.3.1",
|
"luxon": "^2.3.1",
|
||||||
"puppeteer-core": "^16.1.0",
|
"puppeteer-core": "^16.1.0",
|
||||||
"underscore": "^1.13.4",
|
"underscore": "^1.13.4",
|
||||||
"@omnivore/puppeteer-parse": "^1.0.0"
|
"@omnivore/puppeteer-parse": "^1.0.0",
|
||||||
|
"@google-cloud/logging-winston": "^4.1.2",
|
||||||
|
"@google-cloud/storage": "^5.18.1",
|
||||||
|
"@sentry/serverless": "^6.13.3",
|
||||||
|
"winston": "^3.3.3"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"@google-cloud/functions-framework": "^3.0.0"
|
||||||
},
|
},
|
||||||
"scripts": {
|
"scripts": {
|
||||||
"start": "node app.js"
|
"start": "node app.js",
|
||||||
|
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
|
||||||
|
"start_preview": "npx functions-framework --target=preview"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,4 +0,0 @@
|
|||||||
node_modules
|
|
||||||
.env*
|
|
||||||
Dockerfile
|
|
||||||
.dockerignore
|
|
||||||
@ -5,7 +5,7 @@
|
|||||||
/* eslint-disable @typescript-eslint/no-require-imports */
|
/* eslint-disable @typescript-eslint/no-require-imports */
|
||||||
require('dotenv').config();
|
require('dotenv').config();
|
||||||
const Url = require('url');
|
const Url = require('url');
|
||||||
const puppeteer = require('puppeteer-extra');
|
// const puppeteer = require('puppeteer-extra');
|
||||||
const axios = require('axios');
|
const axios = require('axios');
|
||||||
const jwt = require('jsonwebtoken');
|
const jwt = require('jsonwebtoken');
|
||||||
const { promisify } = require('util');
|
const { promisify } = require('util');
|
||||||
@ -14,10 +14,8 @@ const { config, format, loggers, transports } = require('winston');
|
|||||||
const { LoggingWinston } = require('@google-cloud/logging-winston');
|
const { LoggingWinston } = require('@google-cloud/logging-winston');
|
||||||
const { DateTime } = require('luxon');
|
const { DateTime } = require('luxon');
|
||||||
const os = require('os');
|
const os = require('os');
|
||||||
const Sentry = require('@sentry/serverless');
|
|
||||||
const { Storage } = require('@google-cloud/storage');
|
const { Storage } = require('@google-cloud/storage');
|
||||||
|
const { parseHTML } = require('linkedom');
|
||||||
const chromium = require('chrome-aws-lambda');
|
|
||||||
const puppeteer = require('puppeteer-core');
|
const puppeteer = require('puppeteer-core');
|
||||||
const { preHandleContent } = require("@omnivore/content-handler");
|
const { preHandleContent } = require("@omnivore/content-handler");
|
||||||
|
|
||||||
@ -29,11 +27,6 @@ const storage = new Storage();
|
|||||||
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
||||||
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
|
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
|
||||||
|
|
||||||
Sentry.GCPFunction.init({
|
|
||||||
dsn: process.env.SENTRY_DSN,
|
|
||||||
tracesSampleRate: 0,
|
|
||||||
});
|
|
||||||
|
|
||||||
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
|
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
|
||||||
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||||
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||||
@ -44,12 +37,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];
|
|||||||
const path = require("path");
|
const path = require("path");
|
||||||
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
|
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
|
||||||
|
|
||||||
const { parseHTML } = require('linkedom');
|
|
||||||
|
|
||||||
// Add stealth plugin to hide puppeteer usage
|
|
||||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
|
||||||
puppeteer.use(StealthPlugin());
|
|
||||||
|
|
||||||
const userAgentForUrl = (url) => {
|
const userAgentForUrl = (url) => {
|
||||||
try {
|
try {
|
||||||
const u = new URL(url);
|
const u = new URL(url);
|
||||||
@ -94,8 +81,7 @@ const enableJavascriptForUrl = (url) => {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// launch Puppeteer
|
// launch Puppeteer
|
||||||
const getBrowserPromise = (async (proxyUrl, chromiumPath) => {
|
const getBrowserPromise = (async () => {
|
||||||
console.log("starting with proxy url", proxyUrl)
|
|
||||||
return puppeteer.launch({
|
return puppeteer.launch({
|
||||||
args: [
|
args: [
|
||||||
'--allow-running-insecure-content',
|
'--allow-running-insecure-content',
|
||||||
@ -122,8 +108,8 @@ const getBrowserPromise = (async (proxyUrl, chromiumPath) => {
|
|||||||
'--window-size=1920,1080',
|
'--window-size=1920,1080',
|
||||||
].filter((item) => !!item),
|
].filter((item) => !!item),
|
||||||
defaultViewport: { height: 1080, width: 1920 },
|
defaultViewport: { height: 1080, width: 1920 },
|
||||||
executablePath: chromiumPath,
|
executablePath: process.env.CHROMIUM_PATH,
|
||||||
headless: true,
|
headless: !!process.env.HEADLESS,
|
||||||
timeout: 120000, // 2 minutes
|
timeout: 120000, // 2 minutes
|
||||||
});
|
});
|
||||||
})();
|
})();
|
||||||
@ -237,7 +223,7 @@ async function fetchContent(req, res) {
|
|||||||
},
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
console.log(`Article parsing request`, logRecord);
|
logger.info(`Article parsing request`, logRecord);
|
||||||
|
|
||||||
if (!url) {
|
if (!url) {
|
||||||
logRecord.urlIsInvalid = true;
|
logRecord.urlIsInvalid = true;
|
||||||
|
|||||||
@ -4,13 +4,10 @@
|
|||||||
"description": "Accepts URL of the article and parses its content",
|
"description": "Accepts URL of the article and parses its content",
|
||||||
"main": "index.js",
|
"main": "index.js",
|
||||||
"dependencies": {
|
"dependencies": {
|
||||||
"@google-cloud/functions-framework": "^3.1.2",
|
|
||||||
"@google-cloud/logging-winston": "^5.1.1",
|
"@google-cloud/logging-winston": "^5.1.1",
|
||||||
"@google-cloud/storage": "^5.18.1",
|
"@google-cloud/storage": "^5.18.1",
|
||||||
"@omnivore/content-handler": "1.0.0",
|
"@omnivore/content-handler": "1.0.0",
|
||||||
"@sentry/serverless": "^6.13.3",
|
|
||||||
"axios": "^0.27.2",
|
"axios": "^0.27.2",
|
||||||
"chrome-aws-lambda": "^10.1.0",
|
|
||||||
"dotenv": "^8.2.0",
|
"dotenv": "^8.2.0",
|
||||||
"jsonwebtoken": "^8.5.1",
|
"jsonwebtoken": "^8.5.1",
|
||||||
"linkedom": "^0.14.9",
|
"linkedom": "^0.14.9",
|
||||||
@ -18,8 +15,5 @@
|
|||||||
"puppeteer-core": "^16.1.0",
|
"puppeteer-core": "^16.1.0",
|
||||||
"underscore": "^1.13.4",
|
"underscore": "^1.13.4",
|
||||||
"winston": "^3.3.3"
|
"winston": "^3.3.3"
|
||||||
},
|
|
||||||
"scripts": {
|
|
||||||
"test": "yarn mocha"
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,3 +0,0 @@
|
|||||||
const register = require('@babel/register').default
|
|
||||||
|
|
||||||
register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] })
|
|
||||||
@ -1,13 +0,0 @@
|
|||||||
import 'mocha'
|
|
||||||
import * as chai from 'chai'
|
|
||||||
import { expect } from 'chai'
|
|
||||||
import 'chai/register-should'
|
|
||||||
import chaiString from 'chai-string'
|
|
||||||
|
|
||||||
chai.use(chaiString)
|
|
||||||
|
|
||||||
describe('Stub test', () => {
|
|
||||||
it('should pass', () => {
|
|
||||||
expect(true).to.be.true
|
|
||||||
})
|
|
||||||
})
|
|
||||||
Reference in New Issue
Block a user