diff --git a/packages/content-fetch-gcf/.gitignore b/packages/content-fetch-gcf/.gitignore deleted file mode 100644 index 9bfbc5e8b..000000000 --- a/packages/content-fetch-gcf/.gitignore +++ /dev/null @@ -1 +0,0 @@ -previewImage.* \ No newline at end of file diff --git a/packages/content-fetch-gcf/Dockerfile b/packages/content-fetch-gcf/Dockerfile deleted file mode 100644 index c0df04111..000000000 --- a/packages/content-fetch-gcf/Dockerfile +++ /dev/null @@ -1,112 +0,0 @@ -# FROM node:14-slim - -# # Taken from pu - -# # Install latest chrome dev package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others) -# # Note: this installs the necessary libs to make the bundled version of Chromium that Puppeteer -# # installs, work. -# RUN apt-get update \ -# && apt-get install -y wget gnupg \ -# && wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \ -# && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \ -# && apt-get update \ -# && apt-get install -y google-chrome-stable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-freefont-ttf libxss1 \ -# --no-install-recommends \ -# && rm -rf /var/lib/apt/lists/* - -# ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true -# ENV CHROMIUM_PATH "/usr/bin/google-chrome-stable" - -# ------------------------ - -# FROM --platform=linux/arm64 node:14.18 - -# RUN apt-get update \ -# && apt-get install -y chromium \ -# && apt-get install -y ca-certificates \ -# fonts-liberation \ -# libappindicator3-1 \ -# libasound2 \ -# libatk-bridge2.0-0 \ -# libatk1.0-0 \ -# libc6 \ -# libcairo2 \ -# libcups2 \ -# libdbus-1-3 \ -# libexpat1 \ -# libfontconfig1 \ -# libgbm1 \ -# libgcc1 \ -# libglib2.0-0 \ -# libgtk-3-0 \ -# libnspr4 \ -# libnss3 \ -# libpango-1.0-0 \ -# libpangocairo-1.0-0 \ -# libstdc++6 \ -# libx11-6 \ -# libx11-xcb1 \ -# libxcb1 \ -# libxcomposite1 \ -# libxcursor1 \ -# libxdamage1 \ -# libxext6 \ -# libxfixes3 \ -# libxi6 \ -# libxrandr2 \ -# libxrender1 \ -# libxss1 \ -# libxtst6 \ -# lsb-release \ -# wget \ -# xdg-utils - -FROM node:14.18-alpine - -# Installs latest Chromium (92) package. -RUN apk add --no-cache \ - chromium \ - nss \ - freetype \ - harfbuzz \ - ca-certificates \ - ttf-freefont \ - nodejs \ - yarn - -# Tell Puppeteer to skip installing Chrome. We'll be using the installed package. -ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true \ - PUPPETEER_EXECUTABLE_PATH=/usr/bin/chromium-browser - -# Puppeteer v10.0.0 works with Chromium 92. -RUN yarn add puppeteer@10.0.0 - -# Add user so we don't need --no-sandbox. -RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \ - && mkdir -p /home/pptruser/Downloads /app \ - && chown -R pptruser:pptruser /home/pptruser \ - && chown -R pptruser:pptruser /app - -# Run everything after as non-privileged user. -WORKDIR /app - -ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true -ENV CHROMIUM_PATH /usr/bin/chromium-browser -ENV LAUNCH_HEADLESS=true - -COPY package.json . -COPY yarn.lock . -COPY tsconfig.json . -COPY .prettierrc . -COPY .eslintrc . - -COPY /packages/content-fetch-gcf/package.json ./packages/content-fetch-gcf/package.json - -RUN yarn install --pure-lockfile - -ADD /packages/content-fetch-gcf ./packages/content-fetch-gcf - -EXPOSE 8080 - -# USER pptruser -ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch-gcf", "start"] diff --git a/packages/content-fetch-gcf/package.json b/packages/content-fetch-gcf/package.json deleted file mode 100644 index efec3ca5e..000000000 --- a/packages/content-fetch-gcf/package.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "name": "@omnivore/content-fetch-gcf", - "version": "1.0.0", - "description": "Google Cloud Function that accepts URL of the article and parses its content", - "main": "index.js", - "dependencies": { - "@google-cloud/logging-winston": "^4.1.2", - "@google-cloud/storage": "^5.18.1", - "@sentry/serverless": "^6.13.3", - "axios": "^0.26.0", - "dotenv": "^8.2.0", - "jsonwebtoken": "^8.5.1", - "linkedom": "^0.14.9", - "luxon": "^2.3.1", - "winston": "^3.3.3", - "@omnivore/puppeteer-parse": "^1.0.0" - }, - "devDependencies": { - "@google-cloud/functions-framework": "^3.0.0" - }, - "scripts": { - "start": "npx functions-framework --port=9090 --target=puppeteer", - "start_preview": "npx functions-framework --target=preview" - } -} diff --git a/packages/content-fetch-gcf/.env.example b/packages/content-fetch/.env.example similarity index 56% rename from packages/content-fetch-gcf/.env.example rename to packages/content-fetch/.env.example index 64242a22d..4b7dd61c0 100644 --- a/packages/content-fetch-gcf/.env.example +++ b/packages/content-fetch/.env.example @@ -6,3 +6,13 @@ REST_BACKEND_ENDPOINT=http://localhost:4000/api # set for local development IS_LOCAL=true + +VERIFICATION_TOKEN='123456' + +CHROMIUM_PATH=/opt/homebrew/bin/chromium +PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=true +LAUNCH_HEADLESS=true + +TWITTER_BEARER_TOKEN=token + +PORT=9090 diff --git a/packages/content-fetch-gcf/.gcloudignore b/packages/content-fetch/.gcloudignore similarity index 100% rename from packages/content-fetch-gcf/.gcloudignore rename to packages/content-fetch/.gcloudignore diff --git a/packages/content-fetch/Dockerfile b/packages/content-fetch/Dockerfile index 1201c497d..b9bf619cc 100644 --- a/packages/content-fetch/Dockerfile +++ b/packages/content-fetch/Dockerfile @@ -30,12 +30,15 @@ COPY .prettierrc . COPY .eslintrc . COPY /packages/content-handler/package.json ./packages/content-handler/package.json +COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json RUN yarn install --pure-lockfile ADD /packages/content-fetch ./packages/content-fetch ADD /packages/content-handler ./packages/content-handler +ADD /packages/puppeteer-parse ./packages/puppeteer-parse RUN yarn workspace @omnivore/content-handler build +RUN yarn workspace @omnivore/puppeteer-parse build # After building, fetch the production dependencies RUN rm -rf /app/packages/content-fetch/node_modules diff --git a/packages/content-fetch/Dockerfile-gcf b/packages/content-fetch/Dockerfile-gcf new file mode 100644 index 000000000..dec0fa93a --- /dev/null +++ b/packages/content-fetch/Dockerfile-gcf @@ -0,0 +1,52 @@ +FROM node:14.18-alpine + +# Installs latest Chromium (92) package. +RUN apk add --no-cache \ + chromium \ + nss \ + freetype \ + harfbuzz \ + ca-certificates \ + ttf-freefont \ + nodejs \ + yarn + +# Add user so we don't need --no-sandbox. +RUN addgroup -S pptruser && adduser -S -g pptruser pptruser \ + && mkdir -p /home/pptruser/Downloads /app \ + && chown -R pptruser:pptruser /home/pptruser \ + && chown -R pptruser:pptruser /app + +# Run everything after as non-privileged user. +WORKDIR /app + +ENV CHROMIUM_PATH /usr/bin/chromium-browser +ENV LAUNCH_HEADLESS=true +ENV PORT 9090 + +COPY package.json . +COPY yarn.lock . +COPY tsconfig.json . +COPY .prettierrc . +COPY .eslintrc . + +COPY /packages/content-handler/package.json ./packages/content-handler/package.json +COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json + +RUN yarn install --pure-lockfile + +ADD /packages/content-handler ./packages/content-handler +ADD /packages/puppeteer-parse ./packages/puppeteer-parse +ADD /packages/content-fetch ./packages/content-fetch +RUN yarn workspace @omnivore/content-handler build +RUN yarn workspace @omnivore/puppeteer-parse build + +# After building, fetch the production dependencies +RUN rm -rf /app/packages/content-fetch/node_modules +RUN rm -rf /app/node_modules +RUN yarn install --pure-lockfile --production + +EXPOSE 9090 + +# USER pptruser +ENTRYPOINT ["yarn", "workspace", "@omnivore/content-fetch", "start_gcf"] diff --git a/packages/content-fetch-gcf/README.md b/packages/content-fetch/README.md similarity index 100% rename from packages/content-fetch-gcf/README.md rename to packages/content-fetch/README.md diff --git a/packages/content-fetch-gcf/index.js b/packages/content-fetch/index.js similarity index 100% rename from packages/content-fetch-gcf/index.js rename to packages/content-fetch/index.js diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index da8862d24..45de11581 100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -4,7 +4,6 @@ "description": "Service that fetches page content from a URL", "main": "index.js", "dependencies": { - "@omnivore/content-handler": "1.0.0", "axios": "^0.27.2", "dotenv": "^8.2.0", "express": "^4.17.1", @@ -13,9 +12,18 @@ "luxon": "^2.3.1", "puppeteer-core": "^16.1.0", "underscore": "^1.13.4", - "@omnivore/puppeteer-parse": "^1.0.0" + "@omnivore/puppeteer-parse": "^1.0.0", + "@google-cloud/logging-winston": "^4.1.2", + "@google-cloud/storage": "^5.18.1", + "@sentry/serverless": "^6.13.3", + "winston": "^3.3.3" + }, + "devDependencies": { + "@google-cloud/functions-framework": "^3.0.0" }, "scripts": { - "start": "node app.js" + "start": "node app.js", + "start_gcf": "npx functions-framework --port=9090 --target=puppeteer", + "start_preview": "npx functions-framework --target=preview" } } diff --git a/packages/puppeteer-parse/.dockerignore b/packages/puppeteer-parse/.dockerignore deleted file mode 100644 index 2310bc768..000000000 --- a/packages/puppeteer-parse/.dockerignore +++ /dev/null @@ -1,4 +0,0 @@ -node_modules -.env* -Dockerfile -.dockerignore diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index f2f4cf02f..9dbe583c8 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -5,7 +5,7 @@ /* eslint-disable @typescript-eslint/no-require-imports */ require('dotenv').config(); const Url = require('url'); -const puppeteer = require('puppeteer-extra'); +// const puppeteer = require('puppeteer-extra'); const axios = require('axios'); const jwt = require('jsonwebtoken'); const { promisify } = require('util'); @@ -14,10 +14,8 @@ const { config, format, loggers, transports } = require('winston'); const { LoggingWinston } = require('@google-cloud/logging-winston'); const { DateTime } = require('luxon'); const os = require('os'); -const Sentry = require('@sentry/serverless'); const { Storage } = require('@google-cloud/storage'); - -const chromium = require('chrome-aws-lambda'); +const { parseHTML } = require('linkedom'); const puppeteer = require('puppeteer-core'); const { preHandleContent } = require("@omnivore/content-handler"); @@ -29,11 +27,6 @@ const storage = new Storage(); const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; const previewBucket = process.env.PREVIEW_IMAGE_BUCKET ? storage.bucket(process.env.PREVIEW_IMAGE_BUCKET) : undefined; -Sentry.GCPFunction.init({ - dsn: process.env.SENTRY_DSN, - tracesSampleRate: 0, -}); - const MOBILE_USER_AGENT = 'Mozilla/5.0 (Linux; Android 6.0.1; Nexus 5X Build/MMB29P) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.62 Mobile Safari/537.36 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)' const DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' const BOT_DESKTOP_USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_6_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4372.0 Safari/537.36' @@ -44,12 +37,6 @@ const NON_SCRIPT_HOSTS= ['medium.com', 'fastcompany.com']; const path = require("path"); const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf']; -const { parseHTML } = require('linkedom'); - -// Add stealth plugin to hide puppeteer usage -const StealthPlugin = require('puppeteer-extra-plugin-stealth'); -puppeteer.use(StealthPlugin()); - const userAgentForUrl = (url) => { try { const u = new URL(url); @@ -94,8 +81,7 @@ const enableJavascriptForUrl = (url) => { }; // launch Puppeteer -const getBrowserPromise = (async (proxyUrl, chromiumPath) => { - console.log("starting with proxy url", proxyUrl) +const getBrowserPromise = (async () => { return puppeteer.launch({ args: [ '--allow-running-insecure-content', @@ -122,8 +108,8 @@ const getBrowserPromise = (async (proxyUrl, chromiumPath) => { '--window-size=1920,1080', ].filter((item) => !!item), defaultViewport: { height: 1080, width: 1920 }, - executablePath: chromiumPath, - headless: true, + executablePath: process.env.CHROMIUM_PATH, + headless: !!process.env.HEADLESS, timeout: 120000, // 2 minutes }); })(); @@ -237,7 +223,7 @@ async function fetchContent(req, res) { }, }; - console.log(`Article parsing request`, logRecord); + logger.info(`Article parsing request`, logRecord); if (!url) { logRecord.urlIsInvalid = true; diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index e73914fa4..cc7755c0b 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -4,13 +4,10 @@ "description": "Accepts URL of the article and parses its content", "main": "index.js", "dependencies": { - "@google-cloud/functions-framework": "^3.1.2", "@google-cloud/logging-winston": "^5.1.1", "@google-cloud/storage": "^5.18.1", "@omnivore/content-handler": "1.0.0", - "@sentry/serverless": "^6.13.3", "axios": "^0.27.2", - "chrome-aws-lambda": "^10.1.0", "dotenv": "^8.2.0", "jsonwebtoken": "^8.5.1", "linkedom": "^0.14.9", @@ -18,8 +15,5 @@ "puppeteer-core": "^16.1.0", "underscore": "^1.13.4", "winston": "^3.3.3" - }, - "scripts": { - "test": "yarn mocha" } } diff --git a/packages/puppeteer-parse/test/babel-register.js b/packages/puppeteer-parse/test/babel-register.js deleted file mode 100644 index a6f65f60a..000000000 --- a/packages/puppeteer-parse/test/babel-register.js +++ /dev/null @@ -1,3 +0,0 @@ -const register = require('@babel/register').default - -register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] }) diff --git a/packages/puppeteer-parse/test/stub.test.ts b/packages/puppeteer-parse/test/stub.test.ts deleted file mode 100644 index 173ca4917..000000000 --- a/packages/puppeteer-parse/test/stub.test.ts +++ /dev/null @@ -1,13 +0,0 @@ -import 'mocha' -import * as chai from 'chai' -import { expect } from 'chai' -import 'chai/register-should' -import chaiString from 'chai-string' - -chai.use(chaiString) - -describe('Stub test', () => { - it('should pass', () => { - expect(true).to.be.true - }) -})