Remove content-fetch-gcf and create a Dockerfile for the cloud function
This commit is contained in:
1
packages/content-fetch-gcf/.gitignore
vendored
1
packages/content-fetch-gcf/.gitignore
vendored
@ -1 +0,0 @@
|
||||
previewImage.*
|
||||
@ -1,112 +0,0 @@
|
||||
# FROM node:14-slim
|
||||
|
||||
# # Taken from pu
|
||||
|
||||
# # Install latest chrome dev package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
|
||||
# # Note: this installs the necessary libs to make the bundled version of Chromium that Puppeteer
|
||||
# # installs, work.
|
||||
# RUN apt-get update \
|
||||
# && apt-get install -y wget gnupg \
|
||||
# && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
# && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
|
||||
# && apt-get update \
|
||||
# && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \
|
||||
# --no-install-recommends \
|
||||
# && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
|
||||
# ENV CHROMIUM_PATH "/usr/bin/google-chrome-stable"
|
||||
|
||||
# ------------------------
|
||||
|
||||
# FROM --platform=linux/arm64 node:14.18
|
||||
|
||||
# RUN apt-get update \
|
||||
# && apt-get install -y chromium \
|
||||
# && apt-get install -y ca-certificates \
|
||||
# fonts-liberation \
|
||||
# libappindicator3-1 \
|
||||
# libasound2 \
|
||||
# libatk-bridge2.0-0 \
|
||||
# libatk1.0-0 \
|
||||
# libc6 \
|
||||
# libcairo2 \
|
||||
# libcups2 \
|
||||
# libdbus-1-3 \
|
||||
# libexpat1 \
|
||||
# libfontconfig1 \
|
||||
# libgbm1 \
|
||||
# libgcc1 \
|
||||
# libglib2.0-0 \
|
||||
# libgtk-3-0 \
|
||||
# libnspr4 \
|
||||
# libnss3 \
|
||||
# libpango-1.0-0 \
|
||||
# libpangocairo-1.0-0 \
|
||||
# libstdc++6 \
|
||||
# libx11-6 \
|
||||
# libx11-xcb1 \
|
||||
# libxcb1 \
|
||||
# libxcomposite1 \
|
||||
# libxcursor1 \
|
||||
# libxdamage1 \
|
||||
# libxext6 \
|
||||
# libxfixes3 \
|
||||
# libxi6 \
|
||||
# libxrandr2 \
|
||||
# libxrender1 \
|
||||
# libxss1 \
|
||||
# libxtst6 \
|
||||
# lsb-release \
|
||||
# wget \
|
||||
# xdg-utils
|
||||
|
||||
FROM node:14.18-alpine
|
||||
|
||||
# Installs latest Chromium (92) package.
|
||||
RUN apk add --no-cache \
|
||||
chromium \
|
||||
nss \
|
||||
freetype \
|
||||
harfbuzz \
|
||||
ca-certificates \
|
||||
ttf-freefont \
|
||||
nodejs \
|
||||
yarn
|
||||
|
||||
# Tell Puppeteer to skip installing Chrome. We'll be using the installed package.
|
||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \
|
||||
PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser
|
||||
|
||||
# Puppeteer v10.0.0 works with Chromium 92.
|
||||
RUN yarn add puppeteer@10.0.0
|
||||
|
||||
# Add user so we don't need --no-sandbox.
|
||||
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
|
||||
&& mkdir -p /home/pptruser/Downloads /app \
|
||||
&& chown -R pptruser:pptruser /home/pptruser \
|
||||
&& chown -R pptruser:pptruser /app
|
||||
|
||||
# Run everything after as non-privileged user.
|
||||
WORKDIR /app
|
||||
|
||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
|
||||
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
||||
ENV LAUNCH_HEADLESS=true
|
||||
|
||||
COPY package.json .
|
||||
COPY yarn.lock .
|
||||
COPY tsconfig.json .
|
||||
COPY .prettierrc .
|
||||
COPY .eslintrc .
|
||||
|
||||
COPY /packages/content-fetch-gcf/package.json ./packages/content-fetch-gcf/package.json
|
||||
|
||||
RUN yarn install --pure-lockfile
|
||||
|
||||
ADD /packages/content-fetch-gcf ./packages/content-fetch-gcf
|
||||
|
||||
EXPOSE 8080
|
||||
|
||||
# USER pptruser
|
||||
ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch-gcf", "start"]
|
||||
@ -1,25 +0,0 @@
|
||||
{
|
||||
"name": "@omnivore/content-fetch-gcf",
|
||||
"version": "1.0.0",
|
||||
"description": "Google Cloud Function that accepts URL of the article and parses its content",
|
||||
"main": "index.js",
|
||||
"dependencies": {
|
||||
"@google-cloud/logging-winston": "^4.1.2",
|
||||
"@google-cloud/storage": "^5.18.1",
|
||||
"@sentry/serverless": "^6.13.3",
|
||||
"axios": "^0.26.0",
|
||||
"dotenv": "^8.2.0",
|
||||
"jsonwebtoken": "^8.5.1",
|
||||
"linkedom": "^0.14.9",
|
||||
"luxon": "^2.3.1",
|
||||
"winston": "^3.3.3",
|
||||
"@omnivore/puppeteer-parse": "^1.0.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@google-cloud/functions-framework": "^3.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "npx functions-framework --port=9090 --target=puppeteer",
|
||||
"start_preview": "npx functions-framework --target=preview"
|
||||
}
|
||||
}
|
||||
@ -6,3 +6,13 @@ REST_BACKEND_ENDPOINT=http://localhost:4000/api
|
||||
|
||||
# set for local development
|
||||
IS_LOCAL=true
|
||||
|
||||
VERIFICATION_TOKEN='123456'
|
||||
|
||||
CHROMIUM_PATH=/opt/homebrew/bin/chromium
|
||||
PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true
|
||||
LAUNCH_HEADLESS=true
|
||||
|
||||
TWITTER_BEARER_TOKEN=token
|
||||
|
||||
PORT=9090
|
||||
@ -30,12 +30,15 @@ COPY .prettierrc .
|
||||
COPY .eslintrc .
|
||||
|
||||
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
|
||||
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
|
||||
|
||||
RUN yarn install --pure-lockfile
|
||||
|
||||
ADD /packages/content-fetch ./packages/content-fetch
|
||||
ADD /packages/content-handler ./packages/content-handler
|
||||
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
|
||||
RUN yarn workspace @omnivore/content-handler build
|
||||
RUN yarn workspace @omnivore/puppeteer-parse build
|
||||
|
||||
# After building, fetch the production dependencies
|
||||
RUN rm -rf /app/packages/content-fetch/node_modules
|
||||
|
||||
52
packages/content-fetch/Dockerfile-gcf
Normal file
52
packages/content-fetch/Dockerfile-gcf
Normal file
@ -0,0 +1,52 @@
|
||||
FROM node:14.18-alpine
|
||||
|
||||
# Installs latest Chromium (92) package.
|
||||
RUN apk add --no-cache \
|
||||
chromium \
|
||||
nss \
|
||||
freetype \
|
||||
harfbuzz \
|
||||
ca-certificates \
|
||||
ttf-freefont \
|
||||
nodejs \
|
||||
yarn
|
||||
|
||||
# Add user so we don't need --no-sandbox.
|
||||
RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \
|
||||
&& mkdir -p /home/pptruser/Downloads /app \
|
||||
&& chown -R pptruser:pptruser /home/pptruser \
|
||||
&& chown -R pptruser:pptruser /app
|
||||
|
||||
# Run everything after as non-privileged user.
|
||||
WORKDIR /app
|
||||
|
||||
ENV CHROMIUM_PATH /usr/bin/chromium-browser
|
||||
ENV LAUNCH_HEADLESS=true
|
||||
ENV PORT 9090
|
||||
|
||||
COPY package.json .
|
||||
COPY yarn.lock .
|
||||
COPY tsconfig.json .
|
||||
COPY .prettierrc .
|
||||
COPY .eslintrc .
|
||||
|
||||
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
|
||||
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
|
||||
|
||||
RUN yarn install --pure-lockfile
|
||||
|
||||
ADD /packages/content-handler ./packages/content-handler
|
||||
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
|
||||
ADD /packages/content-fetch ./packages/content-fetch
|
||||
RUN yarn workspace @omnivore/content-handler build
|
||||
RUN yarn workspace @omnivore/puppeteer-parse build
|
||||
|
||||
# After building, fetch the production dependencies
|
||||
RUN rm -rf /app/packages/content-fetch/node_modules
|
||||
RUN rm -rf /app/node_modules
|
||||
RUN yarn install --pure-lockfile --production
|
||||
|
||||
EXPOSE 9090
|
||||
|
||||
# USER pptruser
|
||||
ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch", "start_gcf"]
|
||||
@ -4,7 +4,6 @@
|
||||
"description": "Service that fetches page content from a URL",
|
||||
"main": "index.js",
|
||||
"dependencies": {
|
||||
"@omnivore/content-handler": "1.0.0",
|
||||
"axios": "^0.27.2",
|
||||
"dotenv": "^8.2.0",
|
||||
"express": "^4.17.1",
|
||||
@ -13,9 +12,18 @@
|
||||
"luxon": "^2.3.1",
|
||||
"puppeteer-core": "^16.1.0",
|
||||
"underscore": "^1.13.4",
|
||||
"@omnivore/puppeteer-parse": "^1.0.0"
|
||||
"@omnivore/puppeteer-parse": "^1.0.0",
|
||||
"@google-cloud/logging-winston": "^4.1.2",
|
||||
"@google-cloud/storage": "^5.18.1",
|
||||
"@sentry/serverless": "^6.13.3",
|
||||
"winston": "^3.3.3"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@google-cloud/functions-framework": "^3.0.0"
|
||||
},
|
||||
"scripts": {
|
||||
"start": "node app.js"
|
||||
"start": "node app.js",
|
||||
"start_gcf": "npx functions-framework --port=9090 --target=puppeteer",
|
||||
"start_preview": "npx functions-framework --target=preview"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,4 +0,0 @@
|
||||
node_modules
|
||||
.env*
|
||||
Dockerfile
|
||||
.dockerignore
|
||||
@ -5,7 +5,7 @@
|
||||
/* eslint-disable @typescript-eslint/no-require-imports */
|
||||
require('dotenv').config();
|
||||
const Url = require('url');
|
||||
const puppeteer = require('puppeteer-extra');
|
||||
// const puppeteer = require('puppeteer-extra');
|
||||
const axios = require('axios');
|
||||
const jwt = require('jsonwebtoken');
|
||||
const { promisify } = require('util');
|
||||
@ -14,10 +14,8 @@ const { config, format, loggers, transports } = require('winston');
|
||||
const { LoggingWinston } = require('@google-cloud/logging-winston');
|
||||
const { DateTime } = require('luxon');
|
||||
const os = require('os');
|
||||
const Sentry = require('@sentry/serverless');
|
||||
const { Storage } = require('@google-cloud/storage');
|
||||
|
||||
const chromium = require('chrome-aws-lambda');
|
||||
const { parseHTML } = require('linkedom');
|
||||
const puppeteer = require('puppeteer-core');
|
||||
const { preHandleContent } = require("@omnivore/content-handler");
|
||||
|
||||
@ -29,11 +27,6 @@ const storage = new Storage();
|
||||
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
||||
const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined;
|
||||
|
||||
Sentry.GCPFunction.init({
|
||||
dsn: process.env.SENTRY_DSN,
|
||||
tracesSampleRate: 0,
|
||||
});
|
||||
|
||||
const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
|
||||
const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36'
|
||||
@ -44,12 +37,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com'];
|
||||
const path = require("path");
|
||||
const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'];
|
||||
|
||||
const { parseHTML } = require('linkedom');
|
||||
|
||||
// Add stealth plugin to hide puppeteer usage
|
||||
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
|
||||
puppeteer.use(StealthPlugin());
|
||||
|
||||
const userAgentForUrl = (url) => {
|
||||
try {
|
||||
const u = new URL(url);
|
||||
@ -94,8 +81,7 @@ const enableJavascriptForUrl = (url) => {
|
||||
};
|
||||
|
||||
// launch Puppeteer
|
||||
const getBrowserPromise = (async (proxyUrl, chromiumPath) => {
|
||||
console.log("starting with proxy url", proxyUrl)
|
||||
const getBrowserPromise = (async () => {
|
||||
return puppeteer.launch({
|
||||
args: [
|
||||
'--allow-running-insecure-content',
|
||||
@ -122,8 +108,8 @@ const getBrowserPromise = (async (proxyUrl, chromiumPath) => {
|
||||
'--window-size=1920,1080',
|
||||
].filter((item) => !!item),
|
||||
defaultViewport: { height: 1080, width: 1920 },
|
||||
executablePath: chromiumPath,
|
||||
headless: true,
|
||||
executablePath: process.env.CHROMIUM_PATH,
|
||||
headless: !!process.env.HEADLESS,
|
||||
timeout: 120000, // 2 minutes
|
||||
});
|
||||
})();
|
||||
@ -237,7 +223,7 @@ async function fetchContent(req, res) {
|
||||
},
|
||||
};
|
||||
|
||||
console.log(`Article parsing request`, logRecord);
|
||||
logger.info(`Article parsing request`, logRecord);
|
||||
|
||||
if (!url) {
|
||||
logRecord.urlIsInvalid = true;
|
||||
|
||||
@ -4,13 +4,10 @@
|
||||
"description": "Accepts URL of the article and parses its content",
|
||||
"main": "index.js",
|
||||
"dependencies": {
|
||||
"@google-cloud/functions-framework": "^3.1.2",
|
||||
"@google-cloud/logging-winston": "^5.1.1",
|
||||
"@google-cloud/storage": "^5.18.1",
|
||||
"@omnivore/content-handler": "1.0.0",
|
||||
"@sentry/serverless": "^6.13.3",
|
||||
"axios": "^0.27.2",
|
||||
"chrome-aws-lambda": "^10.1.0",
|
||||
"dotenv": "^8.2.0",
|
||||
"jsonwebtoken": "^8.5.1",
|
||||
"linkedom": "^0.14.9",
|
||||
@ -18,8 +15,5 @@
|
||||
"puppeteer-core": "^16.1.0",
|
||||
"underscore": "^1.13.4",
|
||||
"winston": "^3.3.3"
|
||||
},
|
||||
"scripts": {
|
||||
"test": "yarn mocha"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1,3 +0,0 @@
|
||||
const register = require('@babel/register').default
|
||||
|
||||
register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] })
|
||||
@ -1,13 +0,0 @@
|
||||
import 'mocha'
|
||||
import * as chai from 'chai'
|
||||
import { expect } from 'chai'
|
||||
import 'chai/register-should'
|
||||
import chaiString from 'chai-string'
|
||||
|
||||
chai.use(chaiString)
|
||||
|
||||
describe('Stub test', () => {
|
||||
it('should pass', () => {
|
||||
expect(true).to.be.true
|
||||
})
|
||||
})
|
||||
Reference in New Issue
Block a user