Merge pull request #1269 from omnivore-app/fix/content-handler-dependency

Import content-handler as local dependency
This commit is contained in:
Hongbo Wu
2022-10-04 16:05:14 +08:00
committed by GitHub
28 changed files with 235 additions and 101 deletions

View File

@ -0,0 +1,5 @@
node_modules
.env*
Dockerfile
.dockerignore
*.yaml

View File

@ -23,12 +23,26 @@ WORKDIR /app
ENV CHROMIUM_PATH /usr/bin/chromium-browser
ENV LAUNCH_HEADLESS=true
COPY . /app/
WORKDIR app
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
RUN yarn install --pure-lockfile
ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/content-handler ./packages/content-handler
RUN yarn workspace @omnivore/content-handler build
# After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
EXPOSE 8080
ENTRYPOINT ["yarn", "start"]
CMD ["yarn", "workspace", "@omnivore/content-fetch", "start"]

View File

@ -33,10 +33,19 @@ COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/content-fetch ./packages/content-fetch
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
RUN yarn install --pure-lockfile
ADD /packages/content-fetch ./packages/content-fetch
ADD /packages/content-handler ./packages/content-handler
RUN yarn workspace @omnivore/content-handler build
# After building, fetch the production dependencies
RUN rm -rf /app/packages/content-fetch/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
EXPOSE 8080
CMD ["yarn", "workspace", "@omnivore/content-fetch", "start"]

View File

@ -100,7 +100,7 @@ const getBrowserPromise = (async () => {
'--window-size=1920,1080',
].filter((item) => !!item),
defaultViewport: { height: 1080, width: 1920 },
executablePath: process.env.CHROMIUM_PATH ,
executablePath: process.env.CHROMIUM_PATH,
headless: !!process.env.LAUNCH_HEADLESS,
timeout: 120000, // 2 minutes
});

View File

@ -4,6 +4,7 @@
"description": "Service that fetches page content from a URL",
"main": "index.js",
"dependencies": {
"@omnivore/content-handler": "1.0.0",
"axios": "^0.27.2",
"dotenv": "^8.2.0",
"express": "^4.17.1",
@ -11,8 +12,7 @@
"linkedom": "^0.14.9",
"luxon": "^2.3.1",
"puppeteer-core": "^16.1.0",
"underscore": "^1.13.4",
"@omnivore/content-handler": "1.0.0"
"underscore": "^1.13.4"
},
"scripts": {
"start": "node app.js",

View File

@ -15,6 +15,11 @@
"build": "tsc"
},
"devDependencies": {
"@types/addressparser": "^1.0.1",
"@types/luxon": "^3.0.1",
"@types/rfc2047": "^2.0.1",
"@types/underscore": "^1.11.4",
"@types/uuid": "^8.3.4",
"chai": "^4.3.6",
"chai-as-promised": "^7.1.1",
"chai-string": "^1.5.0",

View File

@ -40,7 +40,7 @@ export class SubstackHandler extends ContentHandler {
findNewsletterHeaderHref(dom: Document): string | undefined {
// Substack header links
const postLink = dom.querySelector('h1 a ')
const postLink = dom.querySelector('h1 a')
if (postLink) {
return postLink.getAttribute('href') || undefined
}

View File

@ -21,6 +21,29 @@ const load = (path: string): string => {
}
describe('Newsletter email test', () => {
before(() => {
nock('https://email.mg2.substack.com')
.head(
'/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
)
.reply(302, undefined, {
Location:
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
})
.get('/p/companies-that-eat-people-217')
.reply(200, '')
nock('https://u23463625.ct.sendgrid.net')
.head(
'/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
)
.reply(302, undefined, {
Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
})
.get('/p/talked-guy-spent-30m-beeple')
.reply(200, '')
})
describe('#getNewsletterUrl()', () => {
it('returns url when email is from SubStack', async () => {
const rawUrl = '<https://hongbo130.substack.com/p/tldr>'
@ -140,16 +163,6 @@ describe('Newsletter email test', () => {
describe('findNewsletterUrl', async () => {
it('gets the URL from the header if it is a substack newsletter', async () => {
nock('https://email.mg2.substack.com')
.head(
'/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
)
.reply(302, undefined, {
Location:
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
})
.get('/p/companies-that-eat-people-217')
.reply(200, '')
const html = load('./test/data/substack-forwarded-newsletter.html')
const url = await new SubstackHandler().findNewsletterUrl(html)
// Not sure if the redirects from substack expire, this test could eventually fail
@ -158,21 +171,12 @@ describe('Newsletter email test', () => {
)
}).timeout(10000)
it('gets the URL from the header if it is a beehiiv newsletter', async () => {
nock('https://u23463625.ct.sendgrid.net')
.head(
'/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
)
.reply(302, undefined, {
Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
})
.get('/p/talked-guy-spent-30m-beeple')
.reply(200, '')
const html = load('./test/data/beehiiv-newsletter.html')
const url = await new BeehiivHandler().findNewsletterUrl(html)
expect(url).to.startWith(
'https://www.milkroad.com/p/talked-guy-spent-30m-beeple'
)
})
}).timeout(10000)
it('returns undefined if it is not a newsletter', async () => {
const html = load('./test/data/substack-forwarded-welcome-email.html')
const url = await new SubstackHandler().findNewsletterUrl(html)

View File

@ -0,0 +1,5 @@
node_modules
build
.env*
Dockerfile
.dockerignore

View File

@ -0,0 +1,30 @@
# Image for the @omnivore/inbound-email-handler workspace.
# The COPY/ADD sources below are paths such as /packages/...; this assumes the
# build context is the monorepo root (TODO confirm against the build command).
FROM node:14.18-alpine
# Run everything after as non-privileged user.
# NOTE(review): no USER instruction appears anywhere in this file, so the
# statement above is not enforced by this Dockerfile — confirm whether a USER
# line is missing or the comment is stale.
WORKDIR /app
# Root manifests and shared tool configs are copied before any package sources.
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
# Only the package.json of each workspace needed by this image is copied before
# the install step; presumably so the dependency-install layer stays cached
# when only source files change.
COPY /packages/inbound-email-handler/package.json ./packages/inbound-email-handler/package.json
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
# First install: no --production flag, so devDependencies are included for the
# build steps below. --pure-lockfile keeps yarn from writing a yarn.lock.
RUN yarn install --pure-lockfile
# Full sources of both workspaces are added after the install.
ADD /packages/inbound-email-handler ./packages/inbound-email-handler
ADD /packages/content-handler ./packages/content-handler
# content-handler is built before inbound-email-handler; presumably because the
# latter depends on the built output of the former — verify in its package.json.
RUN yarn workspace @omnivore/content-handler build
RUN yarn workspace @omnivore/inbound-email-handler build
# After building, fetch the production dependencies
# NOTE(review): only the root and inbound-email-handler node_modules are
# removed; /app/packages/content-handler/node_modules (if one exists) is left
# in place — confirm that is intended.
RUN rm -rf /app/packages/inbound-email-handler/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
# EXPOSE only declares the port as image metadata; the port the service really
# listens on is not visible in this file.
EXPOSE 8080
# Default command: run the "start" script of the inbound-email-handler workspace.
CMD ["yarn", "workspace", "@omnivore/inbound-email-handler", "start"]

View File

@ -24,14 +24,16 @@
"@types/json-bigint": "^1.0.1",
"@types/node": "^14.11.2",
"@types/rfc2047": "^2.0.1",
"eslint-plugin-prettier": "^4.0.0"
"chai": "^4.3.6",
"eslint-plugin-prettier": "^4.0.0",
"mocha": "^10.0.0"
},
"dependencies": {
"@google-cloud/functions-framework": "3.1.2",
"@google-cloud/pubsub": "^2.18.4",
"@omnivore/content-handler": "1.0.0",
"@sendgrid/client": "^7.6.0",
"@sentry/serverless": "^6.16.1",
"@omnivore/content-handler": "1.0.0",
"addressparser": "^1.0.1",
"axios": "^0.27.2",
"jsonwebtoken": "^8.5.1",

View File

@ -1,3 +1,4 @@
import 'mocha'
import { expect } from 'chai'
import {
getConfirmationCode,

View File

@ -1,13 +0,0 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import 'chai/register-should'
import chaiString from 'chai-string'
chai.use(chaiString)
describe('Stub test', () => {
it('should pass', () => {
expect(true).to.be.true
})
})

View File

@ -5,5 +5,5 @@
"rootDir": ".",
"lib": ["dom"]
},
"include": ["src", "test"]
"include": ["src"]
}

View File

@ -0,0 +1,5 @@
node_modules
build
.env*
Dockerfile
.dockerignore

View File

@ -0,0 +1,27 @@
# Image for the @omnivore/pdf-handler workspace.
# The COPY/ADD sources below are paths such as /packages/...; this assumes the
# build context is the monorepo root (TODO confirm against the build command).
FROM node:14.18-alpine
# Run everything after as non-privileged user.
# NOTE(review): no USER instruction appears anywhere in this file, so the
# statement above is not enforced by this Dockerfile — confirm whether a USER
# line is missing or the comment is stale.
WORKDIR /app
# Root manifests and shared tool configs are copied before any package sources.
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
# Only the workspace's package.json is copied before the install step;
# presumably so the dependency-install layer stays cached when only source
# files change.
COPY /packages/pdf-handler/package.json ./packages/pdf-handler/package.json
# First install: no --production flag, so devDependencies are included for the
# build step below. --pure-lockfile keeps yarn from writing a yarn.lock.
RUN yarn install --pure-lockfile
# Full workspace sources are added after the install, then built.
ADD /packages/pdf-handler ./packages/pdf-handler
RUN yarn workspace @omnivore/pdf-handler build
# After building, fetch the production dependencies
# Both node_modules trees are deleted and reinstalled with --production so the
# devDependencies from the first install are not kept.
RUN rm -rf /app/packages/pdf-handler/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
# EXPOSE only declares the port as image metadata; the port the service really
# listens on is not visible in this file.
EXPOSE 8080
# Default command: run the "start" script of the pdf-handler workspace.
CMD ["yarn", "workspace", "@omnivore/pdf-handler", "start"]

View File

@ -20,7 +20,10 @@
"deploy": "yarn build && yarn gcloud-deploy"
},
"devDependencies": {
"@types/node": "^14.11.2"
"@types/node": "^14.11.2",
"chai": "^4.3.6",
"chai-string": "^1.5.0",
"mocha": "^10.0.0"
},
"dependencies": {
"@google-cloud/functions-framework": "3.1.2",

View File

@ -1,7 +1,6 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import 'chai/register-should'
import chaiString from 'chai-string'
import {
getDocument,

View File

@ -5,5 +5,5 @@
"rootDir": ".",
"lib": ["dom"]
},
"include": ["src", "test"]
"include": ["src"]
}

View File

@ -0,0 +1,4 @@
node_modules
.env*
Dockerfile
.dockerignore

View File

@ -85,6 +85,7 @@ WORKDIR /app
ENV CHROMIUM_PATH /usr/bin/chromium-browser
ENV LAUNCH_HEADLESS=true
ENV PORT 9090
COPY package.json .
COPY yarn.lock .
@ -93,12 +94,20 @@ COPY .prettierrc .
COPY .eslintrc .
COPY /packages/puppeteer-parse/package.json ./packages/puppeteer-parse/package.json
COPY /packages/content-handler/package.json ./packages/content-handler/package.json
RUN yarn install --pure-lockfile
ADD /packages/puppeteer-parse ./packages/puppeteer-parse
ADD /packages/content-handler ./packages/content-handler
RUN yarn workspace @omnivore/content-handler build
EXPOSE 8080
# After building, fetch the production dependencies
RUN rm -rf /app/packages/puppeteer-parse/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
EXPOSE 9090
# USER pptruser
ENTRYPOINT ["yarn", "workspace", "@omnivore/puppeteer-parse", "start"]

View File

@ -118,43 +118,43 @@ const userAgentForUrl = (url) => {
// launch Puppeteer
const getBrowserPromise = (async () => {
return puppeteer.launch({
args: chromium.args,
defaultViewport: chromium.defaultViewport,
executablePath: await chromium.executablePath,
headless: chromium.headless,
ignoreHTTPSErrors: true,
});
// return puppeteer.launch({
// args: [
// '--allow-running-insecure-content',
// '--autoplay-policy=user-gesture-required',
// '--disable-component-update',
// '--disable-domain-reliability',
// '--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
// '--disable-print-preview',
// '--disable-setuid-sandbox',
// '--disable-site-isolation-trials',
// '--disable-speech-api',
// '--disable-web-security',
// '--disk-cache-size=33554432',
// '--enable-features=SharedArrayBuffer',
// '--hide-scrollbars',
// '--ignore-gpu-blocklist',
// '--in-process-gpu',
// '--mute-audio',
// '--no-default-browser-check',
// '--no-pings',
// '--no-sandbox',
// '--no-zygote',
// '--use-gl=swiftshader',
// '--window-size=1920,1080',
// ].filter((item) => !!item),
// defaultViewport: { height: 1080, width: 1920 },
// args: chromium.args,
// defaultViewport: chromium.defaultViewport,
// executablePath: process.env.CHROMIUM_PATH,
// headless: !!process.env.LAUNCH_HEADLESS,
// timeout: 0,
// headless: chromium.headless,
// ignoreHTTPSErrors: true,
// });
return puppeteer.launch({
args: [
'--allow-running-insecure-content',
'--autoplay-policy=user-gesture-required',
'--disable-component-update',
'--disable-domain-reliability',
'--disable-features=AudioServiceOutOfProcess,IsolateOrigins,site-per-process',
'--disable-print-preview',
'--disable-setuid-sandbox',
'--disable-site-isolation-trials',
'--disable-speech-api',
'--disable-web-security',
'--disk-cache-size=33554432',
'--enable-features=SharedArrayBuffer',
'--hide-scrollbars',
'--ignore-gpu-blocklist',
'--in-process-gpu',
'--mute-audio',
'--no-default-browser-check',
'--no-pings',
'--no-sandbox',
'--no-zygote',
'--use-gl=swiftshader',
'--window-size=1920,1080',
].filter((item) => !!item),
defaultViewport: { height: 1080, width: 1920 },
executablePath: process.env.CHROMIUM_PATH,
headless: !!process.env.LAUNCH_HEADLESS,
timeout: 120000, // 2 minutes
});
})();
let logRecord, functionStartTime;

View File

@ -4,8 +4,10 @@
"description": "Google Cloud Function that accepts URL of the article and parses its content",
"main": "index.js",
"dependencies": {
"@google-cloud/functions-framework": "^3.1.2",
"@google-cloud/logging-winston": "^5.1.1",
"@google-cloud/storage": "^5.18.1",
"@omnivore/content-handler": "1.0.0",
"@sentry/serverless": "^6.13.3",
"axios": "^0.27.2",
"chrome-aws-lambda": "^10.1.0",
@ -17,9 +19,6 @@
"underscore": "^1.13.4",
"winston": "^3.3.3"
},
"devDependencies": {
"@google-cloud/functions-framework": "^3.0.0"
},
"scripts": {
"start": "npx functions-framework --port=9090 --target=puppeteer",
"start_preview": "npx functions-framework --target=preview",

View File

@ -0,0 +1,5 @@
node_modules
build
.env*
Dockerfile
.dockerignore

View File

@ -0,0 +1,27 @@
# Image for the text-to-speech workspace. Note the naming difference: the
# directory is packages/text-to-speech, while the yarn workspace invoked below
# is @omnivore/text-to-speech-handler.
# The COPY/ADD sources below are paths such as /packages/...; this assumes the
# build context is the monorepo root (TODO confirm against the build command).
FROM node:14.18-alpine
# Run everything after as non-privileged user.
# NOTE(review): no USER instruction appears anywhere in this file, so the
# statement above is not enforced by this Dockerfile — confirm whether a USER
# line is missing or the comment is stale.
WORKDIR /app
# Root manifests and shared tool configs are copied before any package sources.
COPY package.json .
COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
# Only the workspace's package.json is copied before the install step;
# presumably so the dependency-install layer stays cached when only source
# files change.
COPY /packages/text-to-speech/package.json ./packages/text-to-speech/package.json
# First install: no --production flag, so devDependencies are included for the
# build step below. --pure-lockfile keeps yarn from writing a yarn.lock.
RUN yarn install --pure-lockfile
# Full workspace sources are added after the install, then built.
ADD /packages/text-to-speech ./packages/text-to-speech
RUN yarn workspace @omnivore/text-to-speech-handler build
# After building, fetch the production dependencies
# Both node_modules trees are deleted and reinstalled with --production so the
# devDependencies from the first install are not kept.
RUN rm -rf /app/packages/text-to-speech/node_modules
RUN rm -rf /app/node_modules
RUN yarn install --pure-lockfile --production
# EXPOSE only declares the port as image metadata; the port the service really
# listens on is not visible in this file.
EXPOSE 8080
# Default command: run the "start" script of the text-to-speech-handler workspace.
CMD ["yarn", "workspace", "@omnivore/text-to-speech-handler", "start"]

View File

@ -25,7 +25,9 @@
"@types/natural": "^5.1.1",
"@types/node": "^14.11.2",
"@types/underscore": "^1.11.4",
"eslint-plugin-prettier": "^4.0.0"
"chai": "^4.3.6",
"eslint-plugin-prettier": "^4.0.0",
"mocha": "^10.0.0"
},
"dependencies": {
"@google-cloud/functions-framework": "3.1.2",

View File

@ -1,13 +0,0 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import 'chai/register-should'
import chaiString from 'chai-string'
chai.use(chaiString)
describe('Stub test', () => {
it('should pass', () => {
expect(true).to.be.true
})
})

View File

@ -2390,7 +2390,7 @@
google-gax "^2.24.1"
protobufjs "^6.8.6"
"@google-cloud/functions-framework@3.1.2", "@google-cloud/functions-framework@^3.0.0":
"@google-cloud/functions-framework@3.1.2", "@google-cloud/functions-framework@^3.1.2":
version "3.1.2"
resolved "https://registry.yarnpkg.com/@google-cloud/functions-framework/-/functions-framework-3.1.2.tgz#2cd92ce4307bf7f32555d028dca22e398473b410"
integrity sha512-pYvEH65/Rqh1JNPdcBmorcV7Xoom2/iOSmbtYza8msro7Inl+qOYxbyMiQfySD2gwAyn38WyWPRqsDRcf/BFLg==
@ -7885,6 +7885,11 @@
resolved "https://registry.yarnpkg.com/@types/luxon/-/luxon-1.27.1.tgz#aceeb2d5be8fccf541237e184e37ecff5faa9096"
integrity sha512-cPiXpOvPFDr2edMnOXlz3UBDApwUfR+cpizvxCy0n3vp9bz/qe8BWzHPIEFcy+ogUOyjKuCISgyq77ELZPmkkg==
"@types/luxon@^3.0.1":
version "3.0.1"
resolved "https://registry.yarnpkg.com/@types/luxon/-/luxon-3.0.1.tgz#2b1657096473e24b049bdedf3710f99645f3a17f"
integrity sha512-/LAvk1cMOJt0ghzMFrZEvByUhsiEfeeT2IF53Le+Ki3A538yEL9pRZ7a6MuCxdrYK+YNqNIDmrKU/r2nnw04zQ==
"@types/mdast@^3.0.0":
version "3.0.10"
resolved "https://registry.yarnpkg.com/@types/mdast/-/mdast-3.0.10.tgz#4724244a82a4598884cbbe9bcfd73dff927ee8af"
@ -8249,7 +8254,7 @@
dependencies:
"@types/node" "*"
"@types/uuid@^8.3.0", "@types/uuid@^8.3.1":
"@types/uuid@^8.3.0", "@types/uuid@^8.3.1", "@types/uuid@^8.3.4":
version "8.3.4"
resolved "https://registry.yarnpkg.com/@types/uuid/-/uuid-8.3.4.tgz#bd86a43617df0594787d38b735f55c805becf1bc"
integrity sha512-c/I8ZRb51j+pYGAu5CrFMRxqZ2ke4y2grEBO5AUjgSkSk+qT2Ea+OdWElz/OiMf5MNpn2b17kuVBwZLQJXzihw==