diff --git a/packages/content-fetch/Dockerfile b/packages/content-fetch/Dockerfile index b05320b08..fd025629c 100644 --- a/packages/content-fetch/Dockerfile +++ b/packages/content-fetch/Dockerfile @@ -23,12 +23,18 @@ WORKDIR /app ENV CHROMIUM_PATH /usr/bin/chromium-browser ENV LAUNCH_HEADLESS=true -COPY . /app/ -WORKDIR app +COPY package.json . +COPY yarn.lock . +COPY tsconfig.json . +COPY .prettierrc . +COPY .eslintrc . + +COPY /packages/content-fetch ./packages/content-fetch +COPY /packages/content-handler ./packages/content-handler RUN yarn install --pure-lockfile EXPOSE 8080 -ENTRYPOINT ["yarn", "start"] +CMD ["yarn", "workspace", "@omnivore/content-fetch", "start"] diff --git a/packages/content-fetch/Dockerfile-local b/packages/content-fetch/Dockerfile-local index 383011f10..505e4c2da 100644 --- a/packages/content-fetch/Dockerfile-local +++ b/packages/content-fetch/Dockerfile-local @@ -34,6 +34,7 @@ COPY .prettierrc . COPY .eslintrc . COPY /packages/content-fetch ./packages/content-fetch +COPY /packages/content-handler ./packages/content-handler RUN yarn install --pure-lockfile diff --git a/packages/content-fetch/package.json b/packages/content-fetch/package.json index 3df85a237..70915c84c 100644 --- a/packages/content-fetch/package.json +++ b/packages/content-fetch/package.json @@ -4,6 +4,7 @@ "description": "Service that fetches page content from a URL", "main": "index.js", "dependencies": { + "@omnivore/content-handler": "file:./../content-handler", "axios": "^0.27.2", "dotenv": "^8.2.0", "express": "^4.17.1", @@ -11,8 +12,7 @@ "linkedom": "^0.14.9", "luxon": "^2.3.1", "puppeteer-core": "^16.1.0", - "underscore": "^1.13.4", - "@omnivore/content-handler": "1.0.0" + "underscore": "^1.13.4" }, "scripts": { "start": "node app.js", diff --git a/packages/inbound-email-handler/package.json b/packages/inbound-email-handler/package.json index ed22a170c..b11c89e31 100644 --- a/packages/inbound-email-handler/package.json +++ 
b/packages/inbound-email-handler/package.json @@ -29,9 +29,9 @@ "dependencies": { "@google-cloud/functions-framework": "3.1.2", "@google-cloud/pubsub": "^2.18.4", + "@omnivore/content-handler": "file:./../content-handler", "@sendgrid/client": "^7.6.0", "@sentry/serverless": "^6.16.1", - "@omnivore/content-handler": "1.0.0", "addressparser": "^1.0.1", "axios": "^0.27.2", "jsonwebtoken": "^8.5.1", diff --git a/packages/puppeteer-parse/content-handler/.eslintignore b/packages/puppeteer-parse/content-handler/.eslintignore new file mode 100644 index 000000000..c2658d7d1 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/.eslintignore @@ -0,0 +1 @@ +node_modules/ diff --git a/packages/puppeteer-parse/content-handler/.eslintrc b/packages/puppeteer-parse/content-handler/.eslintrc new file mode 100644 index 000000000..e006282a6 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/.eslintrc @@ -0,0 +1,6 @@ +{ + "extends": "../../.eslintrc", + "parserOptions": { + "project": "tsconfig.json" + } +} \ No newline at end of file diff --git a/packages/puppeteer-parse/content-handler/.gitignore b/packages/puppeteer-parse/content-handler/.gitignore new file mode 100644 index 000000000..0ae7e5c9e --- /dev/null +++ b/packages/puppeteer-parse/content-handler/.gitignore @@ -0,0 +1,2 @@ +node_modules +/lib diff --git a/packages/puppeteer-parse/content-handler/.npmignore b/packages/puppeteer-parse/content-handler/.npmignore new file mode 100644 index 000000000..b5e2b8569 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/.npmignore @@ -0,0 +1,7 @@ +/test/ +src +tsconfig.json +.eslintrc +.eslintignore +.gitignore +mocha-config.json diff --git a/packages/puppeteer-parse/content-handler/mocha-config.json b/packages/puppeteer-parse/content-handler/mocha-config.json new file mode 100644 index 000000000..44d1d24c1 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/mocha-config.json @@ -0,0 +1,5 @@ +{ + "extension": ["ts"], + "spec": "test/**/*.test.ts", + 
"require": "test/babel-register.js" + } \ No newline at end of file diff --git a/packages/puppeteer-parse/content-handler/package.json b/packages/puppeteer-parse/content-handler/package.json new file mode 100644 index 000000000..e4021b3e4 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/package.json @@ -0,0 +1,34 @@ +{ + "name": "@omnivore/content-handler", + "version": "1.0.0", + "description": "A standalone version of content handler to parse and format each type of content", + "main": "build/src/index.js", + "types": "build/src/index.d.ts", + "files": [ + "build/src" + ], + "license": "Apache-2.0", + "scripts": { + "test": "yarn mocha -r ts-node/register --config mocha-config.json", + "lint": "eslint src --ext ts,js,tsx,jsx", + "compile": "tsc", + "build": "tsc" + }, + "devDependencies": { + "chai": "^4.3.6", + "chai-as-promised": "^7.1.1", + "chai-string": "^1.5.0", + "eslint-plugin-prettier": "^4.0.0", + "mocha": "^10.0.0", + "nock": "^13.2.9" + }, + "dependencies": { + "addressparser": "^1.0.1", + "axios": "^0.27.2", + "linkedom": "^0.14.16", + "luxon": "^3.0.4", + "rfc2047": "^4.0.1", + "underscore": "^1.13.6", + "uuid": "^9.0.0" + } +} diff --git a/packages/puppeteer-parse/content-handler/src/content-handler.ts b/packages/puppeteer-parse/content-handler/src/content-handler.ts new file mode 100644 index 000000000..22216fabe --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/content-handler.ts @@ -0,0 +1,175 @@ +import addressparser from 'addressparser' +import rfc2047 from 'rfc2047' +import { v4 as uuid } from 'uuid' +import { parseHTML } from 'linkedom' +import axios from 'axios' + +interface Unsubscribe { + mailTo?: string + httpUrl?: string +} + +export interface NewsletterInput { + postHeader: string + from: string + unSubHeader: string + email: string + html: string + title: string +} + +export interface NewsletterResult { + email: string + content: string + url: string + title: string + author: string + unsubMailTo?: string + 
unsubHttpUrl?: string +} + +export interface PreHandleResult { + url?: string + title?: string + content?: string + contentType?: string + dom?: Document +} + +export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q=' +export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid() + +export abstract class ContentHandler { + protected senderRegex: RegExp + protected urlRegex: RegExp + name: string + + protected constructor() { + this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/) + this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/) + this.name = 'Handler name' + } + + shouldResolve(url: string): boolean { + return false + } + + async resolve(url: string): Promise { + return Promise.resolve(url) + } + + shouldPreHandle(url: string, dom?: Document): boolean { + return false + } + + async preHandle(url: string, dom?: Document): Promise { + return Promise.resolve({ url, dom }) + } + + async isNewsletter(input: { + postHeader: string + from: string + unSubHeader: string + html?: string + }): Promise { + const re = new RegExp(this.senderRegex) + return Promise.resolve( + re.test(input.from) && (!!input.postHeader || !!input.unSubHeader) + ) + } + + findNewsletterHeaderHref(dom: Document): string | undefined { + return undefined + } + + // Given an HTML blob tries to find a URL to use for + // a canonical URL. 
+ async findNewsletterUrl(html: string): Promise { + const dom = parseHTML(html).document + + // Check if this is a substack newsletter + const href = this.findNewsletterHeaderHref(dom) + if (href) { + // Try to make a HEAD request, so we get the redirected URL, since these + // will usually be behind tracking url redirects + try { + const response = await axios.head(href, { timeout: 5000 }) + return Promise.resolve( + // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access + response.request.res.responseUrl as string | undefined + ) + } catch (e) { + console.log('error making HEAD request', e) + return Promise.resolve(href) + } + } + + return Promise.resolve(undefined) + } + + async parseNewsletterUrl( + _postHeader: string, + html: string + ): Promise { + // get newsletter url from html + const matches = html.match(this.urlRegex) + if (matches) { + return Promise.resolve(matches[1]) + } + return Promise.resolve(undefined) + } + + parseAuthor(from: string): string { + // get author name from email + // e.g. 'Jackson Harper from Omnivore App ' + // or 'Mike Allen ' + const parsed = addressparser(from) + if (parsed.length > 0) { + return parsed[0].name + } + return from + } + + parseUnsubscribe(unSubHeader: string): Unsubscribe { + // parse list-unsubscribe header + // e.g. 
List-Unsubscribe: , + const decoded = rfc2047.decode(unSubHeader) + return { + mailTo: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1], + httpUrl: decoded.match(/]*)>/)?.[1], + } + } + + async handleNewsletter({ + email, + html, + postHeader, + title, + from, + unSubHeader, + }: NewsletterInput): Promise { + console.log('handleNewsletter', email, postHeader, title, from) + + if (!email || !html || !title || !from) { + console.log('invalid newsletter email') + throw new Error('invalid newsletter email') + } + + // fallback to default url if newsletter url does not exist + // assign a random uuid to the default url to avoid duplicate url + const url = + (await this.parseNewsletterUrl(postHeader, html)) || generateUniqueUrl() + const author = this.parseAuthor(from) + const unsubscribe = this.parseUnsubscribe(unSubHeader) + + return { + email, + content: html, + url, + title, + author, + unsubMailTo: unsubscribe.mailTo || '', + unsubHttpUrl: unsubscribe.httpUrl || '', + } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/index.ts b/packages/puppeteer-parse/content-handler/src/index.ts new file mode 100644 index 000000000..e41c811c4 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/index.ts @@ -0,0 +1,116 @@ +import { AppleNewsHandler } from './websites/apple-news-handler' +import { BloombergHandler } from './websites/bloomberg-handler' +import { DerstandardHandler } from './websites/derstandard-handler' +import { ImageHandler } from './websites/image-handler' +import { MediumHandler } from './websites/medium-handler' +import { PdfHandler } from './websites/pdf-handler' +import { ScrapingBeeHandler } from './websites/scrapingBee-handler' +import { TDotCoHandler } from './websites/t-dot-co-handler' +import { TwitterHandler } from './websites/twitter-handler' +import { YoutubeHandler } from './websites/youtube-handler' +import { WikipediaHandler } from './websites/wikipedia-handler' +import { + ContentHandler, + NewsletterInput, + 
NewsletterResult, + PreHandleResult, +} from './content-handler' +import { SubstackHandler } from './newsletters/substack-handler' +import { AxiosHandler } from './newsletters/axios-handler' +import { GolangHandler } from './newsletters/golang-handler' +import { MorningBrewHandler } from './newsletters/morning-brew-handler' +import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler' +import { BeehiivHandler } from './newsletters/beehiiv-handler' +import { ConvertkitHandler } from './newsletters/convertkit-handler' +import { RevueHandler } from './newsletters/revue-handler' + +const validateUrlString = (url: string) => { + const u = new URL(url) + // Make sure the URL is http or https + if (u.protocol !== 'http:' && u.protocol !== 'https:') { + throw new Error('Invalid URL protocol check failed') + } + // Make sure the domain is not localhost + if (u.hostname === 'localhost' || u.hostname === '0.0.0.0') { + throw new Error('Invalid URL is localhost') + } + // Make sure the domain is not a private IP + if (/^(10|172\.16|192\.168)\..*/.test(u.hostname)) { + throw new Error('Invalid URL is private ip') + } +} + +const contentHandlers: ContentHandler[] = [ + new AppleNewsHandler(), + new BloombergHandler(), + new DerstandardHandler(), + new ImageHandler(), + new MediumHandler(), + new PdfHandler(), + new ScrapingBeeHandler(), + new TDotCoHandler(), + new TwitterHandler(), + new YoutubeHandler(), + new WikipediaHandler(), +] + +const newsletterHandlers: ContentHandler[] = [ + new AxiosHandler(), + new BloombergNewsletterHandler(), + new GolangHandler(), + new SubstackHandler(), + new MorningBrewHandler(), + new SubstackHandler(), + new BeehiivHandler(), + new ConvertkitHandler(), + new RevueHandler(), +] + +export const preHandleContent = async ( + url: string, + dom?: Document +): Promise => { + // Before we run the regular handlers we check to see if we need tp + // pre-resolve the URL. 
TODO: This should probably happen recursively, + // so URLs can be pre-resolved, handled, pre-resolved, handled, etc. + for (const handler of contentHandlers) { + if (handler.shouldResolve(url)) { + try { + const resolvedUrl = await handler.resolve(url) + if (resolvedUrl && validateUrlString(resolvedUrl)) { + url = resolvedUrl + } + } catch (err) { + console.log('error resolving url with handler', handler.name, err) + } + break + } + } + // Before we fetch the page we check the handlers, to see if they want + // to perform a prefetch action that can modify our requests. + // enumerate the handlers and see if any of them want to handle the request + for (const handler of contentHandlers) { + if (handler.shouldPreHandle(url, dom)) { + console.log('preHandleContent', handler.name, url) + return handler.preHandle(url, dom) + } + } + return undefined +} + +export const handleNewsletter = async ( + input: NewsletterInput +): Promise => { + for (const handler of newsletterHandlers) { + if (await handler.isNewsletter(input)) { + return handler.handleNewsletter(input) + } + } + + return undefined +} + +module.exports = { + preHandleContent, + handleNewsletter, +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/axios-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/axios-handler.ts new file mode 100644 index 000000000..cd783c30e --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/axios-handler.ts @@ -0,0 +1,46 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class AxiosHandler extends ContentHandler { + constructor() { + super() + this.senderRegex = /<.+@axios.com>/ + this.urlRegex = /View in browser at (.*)<\/a>/ + this.name = 'axios' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const host = this.name + '.com' + // check if url ends with axios.com + return new URL(url).hostname.endsWith(host) + } + + async preHandle(url: string, dom: Document): Promise 
{ + const body = dom.querySelector('table') + + let isFooter = false + // this removes ads and replaces table with a div + body?.querySelectorAll('table').forEach((el) => { + // remove the footer and the ads + if (!el.textContent || el.textContent.length < 20 || isFooter) { + el.remove() + } else { + // removes the first few rows of the table (the header) + // remove the last two rows of the table (they are ads) + el.querySelectorAll('tr').forEach((tr, i) => { + if (i <= 7 || i >= el.querySelectorAll('tr').length - 2) { + console.log('removing', tr) + tr.remove() + } + }) + // replace the table with a div + const div = dom.createElement('div') + div.innerHTML = el.innerHTML + el.parentNode?.replaceChild(div, el) + // set the isFooter flag to true because the next table is the footer + isFooter = true + } + }) + + return Promise.resolve({ dom }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/beehiiv-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/beehiiv-handler.ts new file mode 100644 index 000000000..0a50c1920 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/beehiiv-handler.ts @@ -0,0 +1,43 @@ +import { ContentHandler } from '../content-handler' +import { parseHTML } from 'linkedom' + +export class BeehiivHandler extends ContentHandler { + constructor() { + super() + this.name = 'beehiiv' + } + + findNewsletterHeaderHref(dom: Document): string | undefined { + const readOnline = dom.querySelectorAll('table tr td div a[class*="link"]') + let res: string | undefined = undefined + readOnline.forEach((e) => { + if (e.textContent === 'Read Online') { + res = e.getAttribute('href') || undefined + } + }) + return res + } + + async isNewsletter(input: { + postHeader: string + from: string + unSubHeader: string + html: string + }): Promise { + const dom = parseHTML(input.html).document + if (dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0) { + const beehiivUrl = 
this.findNewsletterHeaderHref(dom) + if (beehiivUrl) { + return Promise.resolve(true) + } + } + return false + } + + async parseNewsletterUrl( + postHeader: string, + html: string + ): Promise { + return this.findNewsletterUrl(html) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/bloomberg-newsletter-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/bloomberg-newsletter-handler.ts new file mode 100644 index 000000000..a5f84f076 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/bloomberg-newsletter-handler.ts @@ -0,0 +1,37 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class BloombergNewsletterHandler extends ContentHandler { + constructor() { + super() + this.senderRegex = /<.+@mail.bloomberg.*.com>/ + this.urlRegex = / { + const body = dom.querySelector('.wrapper') + + // this removes header + body?.querySelector('.sailthru-variables')?.remove() + body?.querySelector('.preview-text')?.remove() + body?.querySelector('.logo-wrapper')?.remove() + body?.querySelector('.by-the-number-wrapper')?.remove() + // this removes footer + body?.querySelector('.quote-box-wrapper')?.remove() + body?.querySelector('.header-wrapper')?.remove() + body?.querySelector('.component-wrapper')?.remove() + body?.querySelector('.footer')?.remove() + + return Promise.resolve({ dom }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/convertkit-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/convertkit-handler.ts new file mode 100644 index 000000000..72e65f5da --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/convertkit-handler.ts @@ -0,0 +1,41 @@ +import { ContentHandler } from '../content-handler' +import { parseHTML } from 'linkedom' + +export class ConvertkitHandler extends ContentHandler { + constructor() { + super() + this.name = 'convertkit' + } + + findNewsletterHeaderHref(dom: Document): string | 
undefined { + const readOnline = dom.querySelectorAll('table tr td a') + let res: string | undefined = undefined + readOnline.forEach((e) => { + if (e.textContent === 'View this email in your browser') { + res = e.getAttribute('href') || undefined + } + }) + return res + } + + async isNewsletter(input: { + postHeader: string + from: string + unSubHeader: string + html: string + }): Promise { + const dom = parseHTML(input.html).document + return Promise.resolve( + dom.querySelectorAll( + 'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]' + ).length > 0 + ) + } + + async parseNewsletterUrl( + postHeader: string, + html: string + ): Promise { + return this.findNewsletterUrl(html) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/golang-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/golang-handler.ts new file mode 100644 index 000000000..7d4724004 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/golang-handler.ts @@ -0,0 +1,27 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class GolangHandler extends ContentHandler { + constructor() { + super() + this.senderRegex = /<.+@golangweekly.com>/ + this.urlRegex = /Read on the Web<\/a>/ + this.name = 'golangweekly' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const host = this.name + '.com' + // check if url ends with golangweekly.com + return new URL(url).hostname.endsWith(host) + } + + async preHandle(url: string, dom: Document): Promise { + const body = dom.querySelector('body') + + // this removes the "Subscribe" button + body?.querySelector('.el-splitbar')?.remove() + // this removes the title + body?.querySelector('.el-masthead')?.remove() + + return Promise.resolve({ dom }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/morning-brew-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/morning-brew-handler.ts new file mode 100644 
index 000000000..f187ac0dc --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/morning-brew-handler.ts @@ -0,0 +1,35 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class MorningBrewHandler extends ContentHandler { + constructor() { + super() + this.senderRegex = /Morning Brew / + this.urlRegex = /View Online<\/a>/ + this.name = 'morningbrew' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const host = this.name + '.com' + // check if url ends with morningbrew.com + return new URL(url).hostname.endsWith(host) + } + + async preHandle(url: string, dom: Document): Promise { + // retain the width of the cells in the table of market info + dom.querySelectorAll('.markets-arrow-cell').forEach((td) => { + const table = td.closest('table') + if (table) { + const bubbleTable = table.querySelector('.markets-bubble') + if (bubbleTable) { + // replace the nested table with the text + const e = bubbleTable.querySelector('.markets-table-text') + e && bubbleTable.parentNode?.replaceChild(e, bubbleTable) + } + // set custom class for the table + table.className = 'morning-brew-markets' + } + }) + + return Promise.resolve({ dom }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/revue-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/revue-handler.ts new file mode 100644 index 000000000..d8c8f911c --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/revue-handler.ts @@ -0,0 +1,46 @@ +import { ContentHandler } from '../content-handler' +import { parseHTML } from 'linkedom' + +export class RevueHandler extends ContentHandler { + constructor() { + super() + this.name = 'revue' + } + + findNewsletterHeaderHref(dom: Document): string | undefined { + const viewOnline = dom.querySelectorAll('table tr td a[target="_blank"]') + let res: string | undefined = undefined + viewOnline.forEach((e) => { + if (e.textContent === 'View online') { + res = 
e.getAttribute('href') || undefined + } + }) + return res + } + + async isNewsletter(input: { + postHeader: string + from: string + unSubHeader: string + html: string + }): Promise { + const dom = parseHTML(input.html).document + if ( + dom.querySelectorAll('img[src*="getrevue.co"], img[src*="revue.email"]') + .length > 0 + ) { + const getrevueUrl = this.findNewsletterHeaderHref(dom) + if (getrevueUrl) { + return Promise.resolve(true) + } + } + return false + } + + async parseNewsletterUrl( + postHeader: string, + html: string + ): Promise { + return this.findNewsletterUrl(html) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/newsletters/substack-handler.ts b/packages/puppeteer-parse/content-handler/src/newsletters/substack-handler.ts new file mode 100644 index 000000000..164068623 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/newsletters/substack-handler.ts @@ -0,0 +1,90 @@ +import addressparser from 'addressparser' +import { ContentHandler, PreHandleResult } from '../content-handler' +import { parseHTML } from 'linkedom' + +export class SubstackHandler extends ContentHandler { + constructor() { + super() + this.name = 'substack' + } + + shouldPreHandle(url: string, dom: Document): boolean { + const host = this.name + '.com' + // check if url ends with substack.com + // or has a profile image hosted at substack.com + return ( + new URL(url).hostname.endsWith(host) || + !!dom + .querySelector('.email-body img') + ?.getAttribute('src') + ?.includes(host) + ) + } + + async preHandle(url: string, dom: Document): Promise { + const body = dom.querySelector('.email-body-container') + + // this removes header and profile avatar + body?.querySelector('.header')?.remove() + body?.querySelector('.preamble')?.remove() + body?.querySelector('.meta-author-wrap')?.remove() + // this removes meta button + body?.querySelector('.post-meta')?.remove() + // this removes footer + body?.querySelector('.post-cta')?.remove() + 
body?.querySelector('.container-border')?.remove() + body?.querySelector('.footer')?.remove() + + return Promise.resolve(dom) + } + + findNewsletterHeaderHref(dom: Document): string | undefined { + // Substack header links + const postLink = dom.querySelector('h1 a ') + if (postLink) { + return postLink.getAttribute('href') || undefined + } + + return undefined + } + + async isNewsletter({ + postHeader, + html, + }: { + postHeader: string + from: string + unSubHeader: string + html: string + }): Promise { + if (postHeader) { + return Promise.resolve(true) + } + const dom = parseHTML(html).document + // substack newsletter emails have tables with a *post-meta class + if (dom.querySelector('table[class$="post-meta"]')) { + return true + } + // If the article has a header link, and substack icons its probably a newsletter + const href = this.findNewsletterHeaderHref(dom) + const heartIcon = dom.querySelector( + 'table tbody td span a img[src*="HeartIcon"]' + ) + const recommendIcon = dom.querySelector( + 'table tbody td span a img[src*="RecommendIconRounded"]' + ) + return Promise.resolve(!!(href && (heartIcon || recommendIcon))) + } + + async parseNewsletterUrl( + postHeader: string, + html: string + ): Promise { + // raw SubStack newsletter url is like + // we need to get the real url from the raw url + if (postHeader && addressparser(postHeader).length > 0) { + return Promise.resolve(addressparser(postHeader)[0].name) + } + return this.findNewsletterUrl(html) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/apple-news-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/apple-news-handler.ts new file mode 100644 index 000000000..0b4026fb6 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/apple-news-handler.ts @@ -0,0 +1,31 @@ +import axios from 'axios' +import { parseHTML } from 'linkedom' +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class AppleNewsHandler extends 
ContentHandler { + constructor() { + super() + this.name = 'Apple News' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const u = new URL(url) + return u.hostname === 'apple.news' + } + + async preHandle(url: string, document?: Document): Promise { + const MOBILE_USER_AGENT = + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36' + const response = await axios.get(url, { + headers: { 'User-Agent': MOBILE_USER_AGENT }, + }) + const data = response.data as string + const dom = parseHTML(data).document + // make sure it's a valid URL by wrapping in new URL + const href = dom + .querySelector('span.click-here') + ?.parentElement?.getAttribute('href') + const u = href ? new URL(href) : undefined + return { url: u?.href } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/bloomberg-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/bloomberg-handler.ts new file mode 100644 index 000000000..a867a3503 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/bloomberg-handler.ts @@ -0,0 +1,41 @@ +import axios from 'axios' +import { parseHTML } from 'linkedom' +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class BloombergHandler extends ContentHandler { + constructor() { + super() + this.name = 'Bloomberg' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const BLOOMBERG_URL_MATCH = + /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/ + return BLOOMBERG_URL_MATCH.test(url.toString()) + } + + async preHandle(url: string, document?: Document): Promise { + console.log('prehandling bloomberg url', url) + + try { + const response = await axios.get('https://app.scrapingbee.com/api/v1', { + params: { + api_key: process.env.SCRAPINGBEE_API_KEY, + url: url, + return_page_source: true, + block_ads: true, + block_resources: false, + }, + }) + const dom = 
parseHTML(response.data).document + return { + title: dom.title, + content: dom.querySelector('body')?.innerHTML, + url: url, + } + } catch (error) { + console.error('error prehandling bloomberg url', error) + throw error + } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/derstandard-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/derstandard-handler.ts new file mode 100644 index 000000000..28742a3e5 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/derstandard-handler.ts @@ -0,0 +1,34 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' +import axios from 'axios' +import { parseHTML } from 'linkedom' + +export class DerstandardHandler extends ContentHandler { + constructor() { + super() + this.name = 'Derstandard' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const u = new URL(url) + return u.hostname === 'www.derstandard.at' + } + + async preHandle(url: string, document?: Document): Promise { + const response = await axios.get(url, { + // set cookie to give consent to get the article + headers: { + cookie: `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`, + }, + }) + const content = response.data as string + + const dom = parseHTML(content).document + const titleElement = dom.querySelector('.article-title') + titleElement && titleElement.remove() + + return { + content: dom.body.outerHTML, + title: titleElement?.textContent || undefined, + } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/image-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/image-handler.ts new file mode 100644 index 000000000..068a1cc66 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/image-handler.ts @@ -0,0 +1,32 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class ImageHandler extends ContentHandler { + constructor() { + super() + this.name = 'Image' + } 
+ + shouldPreHandle(url: string, dom?: Document): boolean { + const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i + return IMAGE_URL_PATTERN.test(url.toString()) + } + + async preHandle(url: string, document?: Document): Promise { + const title = url.toString().split('/').pop() || 'Image' + const content = ` + + + ${title} + + + + +
+ ${title} +
+ + ` + + return Promise.resolve({ title, content }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/medium-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/medium-handler.ts new file mode 100644 index 000000000..211a30c37 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/medium-handler.ts @@ -0,0 +1,26 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class MediumHandler extends ContentHandler { + constructor() { + super() + this.name = 'Medium' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const u = new URL(url) + return u.hostname.endsWith('medium.com') + } + + async preHandle(url: string, document?: Document): Promise { + console.log('prehandling medium url', url) + + try { + const res = new URL(url) + res.searchParams.delete('source') + return Promise.resolve({ url: res.toString() }) + } catch (error) { + console.error('error prehandling medium url', error) + throw error + } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/pdf-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/pdf-handler.ts new file mode 100644 index 000000000..4c4ef748d --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/pdf-handler.ts @@ -0,0 +1,18 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class PdfHandler extends ContentHandler { + constructor() { + super() + this.name = 'PDF' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const u = new URL(url) + const path = u.pathname.replace(u.search, '') + return path.endsWith('.pdf') + } + + async preHandle(_url: string, document?: Document): Promise { + return Promise.resolve({ contentType: 'application/pdf' }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/scrapingBee-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/scrapingBee-handler.ts new file mode 100644 index 
000000000..4c04d00e8 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/scrapingBee-handler.ts @@ -0,0 +1,38 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' +import axios from 'axios' +import { parseHTML } from 'linkedom' + +export class ScrapingBeeHandler extends ContentHandler { + constructor() { + super() + this.name = 'ScrapingBee' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + const u = new URL(url) + const hostnames = ['nytimes.com', 'news.google.com'] + + return hostnames.some((h) => u.hostname.endsWith(h)) + } + + async preHandle(url: string, document?: Document): Promise { + console.log('prehandling url with scrapingbee', url) + + try { + const response = await axios.get('https://app.scrapingbee.com/api/v1', { + params: { + api_key: process.env.SCRAPINGBEE_API_KEY, + url: url, + return_page_source: true, + block_ads: true, + block_resources: false, + }, + }) + const dom = parseHTML(response.data).document + return { title: dom.title, content: response.data as string, url: url } + } catch (error) { + console.error('error prehandling url w/scrapingbee', error) + throw error + } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/t-dot-co-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/t-dot-co-handler.ts new file mode 100644 index 000000000..277a8c087 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/t-dot-co-handler.ts @@ -0,0 +1,26 @@ +import { ContentHandler } from '../content-handler' +import axios from 'axios' + +export class TDotCoHandler extends ContentHandler { + constructor() { + super() + this.name = 't.co' + } + + shouldResolve(url: string): boolean { + const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/ + return T_DOT_CO_URL_MATCH.test(url) + } + + async resolve(url: string) { + return axios + .get(url, { maxRedirects: 0, validateStatus: null }) + .then((res) => { + return new URL(res.headers.location).href 
+ }) + .catch((err) => { + console.log('err with t.co url', err) + return undefined + }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/twitter-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/twitter-handler.ts new file mode 100644 index 000000000..ddd37e45c --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/twitter-handler.ts @@ -0,0 +1,167 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' +import axios from 'axios' +import { DateTime } from 'luxon' +import _ from 'underscore' + +const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN +const TWITTER_URL_MATCH = + /twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/ + +const getTweetFields = () => { + const TWEET_FIELDS = + '&tweet.fields=attachments,author_id,conversation_id,created_at,' + + 'entities,geo,in_reply_to_user_id,lang,possibly_sensitive,public_metrics,referenced_tweets,' + + 'source,withheld' + const EXPANSIONS = '&expansions=author_id,attachments.media_keys' + const USER_FIELDS = + '&user.fields=created_at,description,entities,location,pinned_tweet_id,profile_image_url,protected,public_metrics,url,verified,withheld' + const MEDIA_FIELDS = + '&media.fields=duration_ms,height,preview_image_url,url,media_key,public_metrics,width' + + return `${TWEET_FIELDS}${EXPANSIONS}${USER_FIELDS}${MEDIA_FIELDS}` +} + +const getTweetById = async (id: string) => { + const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/' + const apiUrl = new URL(BASE_ENDPOINT + id + '?' 
+ getTweetFields()) + + if (!TWITTER_BEARER_TOKEN) { + throw new Error('No Twitter bearer token found') + } + + return axios.get(apiUrl.toString(), { + headers: { + Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`, + redirect: 'follow', + }, + }) +} + +const titleForAuthor = (author: { name: string }) => { + return `${author.name} on Twitter` +} + +const tweetIdFromStatusUrl = (url: string): string | undefined => { + const match = url.toString().match(TWITTER_URL_MATCH) + return match?.[2] +} + +const formatTimestamp = (timestamp: string) => { + return DateTime.fromJSDate(new Date(timestamp)).toLocaleString( + DateTime.DATETIME_FULL + ) +} + +export class TwitterHandler extends ContentHandler { + constructor() { + super() + this.name = 'Twitter' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString()) + } + + async preHandle(url: string, document?: Document): Promise { + console.log('prehandling twitter url', url) + + const tweetId = tweetIdFromStatusUrl(url) + if (!tweetId) { + throw new Error('could not find tweet id in url') + } + const tweetData = (await getTweetById(tweetId)).data as { + data: { + author_id: string + text: string + entities: { + urls: [ + { + url: string + expanded_url: string + display_url: string + } + ] + } + created_at: string + } + includes: { + users: [ + { + id: string + name: string + profile_image_url: string + username: string + } + ] + media: [ + { + preview_image_url: string + type: string + url: string + } + ] + } + } + const authorId = tweetData.data.author_id + const author = tweetData.includes.users.filter((u) => u.id === authorId)[0] + // escape html entities in title + const title = _.escape(titleForAuthor(author)) + const authorImage = author.profile_image_url.replace('_normal', '_400x400') + + let text = tweetData.data.text + if (tweetData.data.entities && tweetData.data.entities.urls) { + for (const urlObj of tweetData.data.entities.urls) { +
text = text.replace( + urlObj.url, + `
${urlObj.display_url}` + ) + } + } + + const front = ` +
+

${text}

+ ` + + let includesHtml = '' + if (tweetData.includes.media) { + includesHtml = tweetData.includes.media + .map((m) => { + const linkUrl = m.type == 'photo' ? m.url : url + const previewUrl = m.type == 'photo' ? m.url : m.preview_image_url + const mediaOpen = ` + + + + ` + return mediaOpen + }) + .join('\n') + } + + const back = ` + — ${ + author.username + } ${author.name} ${formatTimestamp( + tweetData.data.created_at + )} +
+ ` + const content = ` + + + + + + + + ${front} + ${includesHtml} + ${back} + ` + + return { content, url, title } + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/wikipedia-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/wikipedia-handler.ts new file mode 100644 index 000000000..8c3a176fd --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/wikipedia-handler.ts @@ -0,0 +1,20 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' + +export class WikipediaHandler extends ContentHandler { + constructor() { + super() + this.name = 'wikipedia' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + return new URL(url).hostname.endsWith('wikipedia.org') + } + + async preHandle(url: string, dom: Document): Promise { + // This removes the [edit] anchors from wikipedia pages + dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove()) + // this removes the sidebar + dom.querySelector('.infobox')?.remove() + return Promise.resolve({ dom }) + } +} diff --git a/packages/puppeteer-parse/content-handler/src/websites/youtube-handler.ts b/packages/puppeteer-parse/content-handler/src/websites/youtube-handler.ts new file mode 100644 index 000000000..4cdb7ee98 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/src/websites/youtube-handler.ts @@ -0,0 +1,76 @@ +import { ContentHandler, PreHandleResult } from '../content-handler' +import axios from 'axios' +import _ from 'underscore' + +const YOUTUBE_URL_MATCH = + /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu\.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/ + +export const getYoutubeVideoId = (url: string) => { + const u = new URL(url) + const videoId = u.searchParams.get('v') + if (!videoId) { + const match = url.toString().match(YOUTUBE_URL_MATCH) + if (match === null || match.length < 6 || !match[5]) { + return undefined + } + return match[5] + } + return videoId +} + +export class YoutubeHandler extends
ContentHandler { + constructor() { + super() + this.name = 'Youtube' + } + + shouldPreHandle(url: string, dom?: Document): boolean { + return YOUTUBE_URL_MATCH.test(url.toString()) + } + + async preHandle(url: string, document?: Document): Promise { + const videoId = getYoutubeVideoId(url) + if (!videoId) { + return {} + } + + const oembedUrl = + `https://www.youtube.com/oembed?format=json&url=` + + encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`) + const oembed = (await axios.get(oembedUrl.toString())).data as { + title: string + width: number + height: number + thumbnail_url: string + author_name: string + author_url: string + } + // escape html entities in title + const title = _.escape(oembed.title) + const ratio = oembed.width / oembed.height + const thumbnail = oembed.thumbnail_url + const height = 350 + const width = height * ratio + const authorName = _.escape(oembed.author_name) + + const content = ` + + ${title} + + + + + + + + +

${title}

+ + + ` + + console.log('got video id', videoId) + + return { content, title: 'Youtube Content' } + } +} diff --git a/packages/puppeteer-parse/content-handler/test/apple-news-handler.test.ts b/packages/puppeteer-parse/content-handler/test/apple-news-handler.test.ts new file mode 100644 index 000000000..1584f9e28 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/apple-news-handler.test.ts @@ -0,0 +1,10 @@ +import { AppleNewsHandler } from '../src/websites/apple-news-handler' + +describe('open a simple web page', () => { + it('should return a response', async () => { + const response = await new AppleNewsHandler().preHandle( + 'https://apple.news/AxjzaZaPvSn23b67LhXI5EQ' + ) + console.log('response', response) + }) +}) diff --git a/packages/puppeteer-parse/content-handler/test/babel-register.js b/packages/puppeteer-parse/content-handler/test/babel-register.js new file mode 100644 index 000000000..a6f65f60a --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/babel-register.js @@ -0,0 +1,3 @@ +const register = require('@babel/register').default + +register({ extensions: ['.ts', '.tsx', '.js', '.jsx'] }) diff --git a/packages/puppeteer-parse/content-handler/test/data/beehiiv-newsletter.html b/packages/puppeteer-parse/content-handler/test/data/beehiiv-newsletter.html new file mode 100644 index 000000000..369d42af0 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/data/beehiiv-newsletter.html @@ -0,0 +1,15 @@ +I talked to a guy that spent $30M on a Beeple + +
+
+ + + diff --git a/packages/puppeteer-parse/content-handler/test/data/substack-forwarded-newsletter.html b/packages/puppeteer-parse/content-handler/test/data/substack-forwarded-newsletter.html new file mode 100644 index 000000000..0f67fd04d --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/data/substack-forwarded-newsletter.html @@ -0,0 +1 @@ +


---------- Forwarded message ---------
From: Andrew Methven <slowchinese@substack.com>
Date: Fri, Feb 18, 2022 at 11:57 PM
Subject: Companies that eat people
To: <XXXXXXXXXX@gmail.com>


Slow Chinese 每周漫闻 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

Companies that eat people

Slow Chinese 每周漫闻

The phrase, ‘eating people’ (吃人 chī rén), is used to criticise companies in China that exploit their employees.

It’s originally from Lǔ Xùn’s (鲁迅), A Madman's Diary (狂人日记 kuángrén rìjì), published in 1918:

我翻开历史一查,这历史没有年代。歪歪斜斜的每页上都写着“仁义道德”几个字,我横竖睡不着,仔细看了半夜,才从字缝里看出来,满本上都写着两个字“吃人"!

As I look through the pages of history, I see there are no dates. On each page, written messily, are the characters, ‘benevolence and morality’. I can’t sleep. I read into the night. Finally, I find hidden between the characters across the page, the words, ‘eating people’.

The times have changed since Lu Xun made that observation more than 100 years ago, but the culture of ‘eating people’ has not, according to social media comments this week, such as this one:

吃人的事实,从来没有变过,历朝历代都是如此 - The reality of [companies] exploiting their employees is nothing new. It’s been the same throughout history.

Two of China’s biggest tech companies, Tencent and Bilibili, have recently been accused of ‘eating people’, abusing and exploiting their staff.

So that’s what we discuss this week.

  • Conversations worth consuming: interview with Zhāng Yìfēi 张义飞 a former employee of Tencent

  • Words of the week: coverage and social media commentary of a recent death of a Bilibili employee allegedly due to overwork.

The audio version of this newsletter is already live - become a member to access it in your podcast app!

Use this link to claim a one-month free trial of the membership to give the full experience a go:

One-month free trial


1. CONVERSATIONS WITH CONSUMING

腾讯带头“反内卷”:光子工作室拒绝996,保障双休_游戏

Interview with Zhang Yifei

Two weeks ago a 25-year-old programmer at Tencent, Zhāng Yìfēi 张义飞, became an Internet sensation after standing up to his bosses at the company. He announced in an internal group chat that he was quitting his job, which then went viral on social media.

If 20-hour days is what the company wants, he wrote, ‘I’ll resign tomorrow’

36Kr interviewed Zhang this week (in Chinese). He talks more about the overtime culture at Tencent, and why he dared to take on his company in such a public way - he already had another job lined up.

There are some excellent words in his description of life as a working person at Tencent.

Useful words

  • 卡 kǎ - stop, block

    什么时候离职的?有人卡你吗 - When did you leave your job? Did they try to stop you?

  • 剥削 bō xuē - exploit

    加班严重、996工作制、互联网巨头压榨剥削员工等话题再次被拿来讨论

    - Topics such as serious overtime, the 996 work system, and the exploitation of employees by Internet giants are being discussed again.

  • 底气 dǐ qì - confidence, back up

    自己已经提前拿到其他公司的offer,比较有底气 - I already had an offer from another company, so I was relatively confident about doing it.

  • 忌惮 jì dàn - fear, be afraid of

    如果一些互联网大厂因此忌惮、不录用我,我正好也不想去这种加班严重的地方 - If some big Internet companies are afraid to hire me, that’s fine by me. I also don't want to work in a company with such heavy overtime.

  • 手软 shǒu ruǎn - ‘soft hand’, forgiving

    不要特立独行,搞小团体,否则他不会手软 - Do not march to a different beat or form small cliques. He will come down hard on this kind of behaviour.

  • 打硬仗 dǎ yìng zhàng - fight a hard war

    张小龙管理下的企业微信,经常会强调用小而精的团队打硬仗 - The company Wechat, under Zhang Xiaolong’s management, would often emphasise using a small and efficient team to work on tight deadlines.

    • Note: a common phrase used in Chinese companies when a team is working intensely on a project or against a ridiculous deadline.

    • Related: 打胜仗 dǎ shèngzhàng - win a war

  • 喊口号 hǎn kǒuhào - shouting slogans

    但大家普遍的看法是,不想看到空洞地喊口号,只想看到具体行为 - The general view is they don’t want to see people shouting empty slogans. They want to see action.

Idioms

  • 初出茅庐 chūchū máolú - ‘just come out of the thatched cottage’; inexperienced, wet behind the ears

    但对于大众而言,互联网巨头和初出茅庐的应届生,相比较下毕竟力量悬殊 - There’s no comparison between the power of the big internet companies and graduate employees with no experience.

  • 昏昏沉沉 hūnhūn chénchén - feeling sleepy

    来这里入职两个月,感到昏昏沉沉,记忆力下降很多

    - I’ve been here for two months. I feel tired and my memory has declined a lot.

  • 热火朝天 rèhuǒ cháotiān - ‘hot fire face sky’; vigorously, with energy

    到点的时候,差不多一半人还没走,都在热火朝天地讨论工作

    - When it was time to finish at the end of the day, around half of the team stayed behind to talk energetically about their work.


2. WORDS OF THE WEEK

上海之旅-前往Bilibili总部! - 哔哩哔哩

Bilibili eats people

A man who headed a content moderation department at Chinese video-streaming site Bilibili died last week after suffering a cerebral hemorrhage while working a Chinese New Year holiday shift.

The company was heavily criticised (Sohu - in Chinese) of having a toxic work culture.

One of the top comments on social media adapted the line from Lu Xun’s A Madman’s Diary. But instead of looking through the pages of history, overworked netizens find the same message hidden in their payslips:

我翻开工资单一查,这工资单没有工资,歪歪斜斜的每条都写着“迟到扣款”四个字。我横竖睡不着,仔细看了半夜,才从字缝里看出来,满本上都写着两个字“吃人"!

I glance at my payslip. I don’t see any pay on it. All I see are the four characters scrawled across the page: ‘fined for being late’. I can’t sleep. I look at it deep into the night. I finally find hidden between the characters across the page, written the words: ‘eating people’.

The words shared below are from the Sohu article and also from social media comments.

Useful words

  • 猝死 cù sǐ - sudden death, die suddenly

    他2月5日凌晨脑出血猝死 - He died suddenly in the early hours of the morning on 5 February.

  • 嗝屁 gé pì - hiccup, to die

    对大企业嗝屁几个不算啥,对于各个家庭你就是唯一呀 - A few people dying means nothing to a big company, but for a family it’s their only child!

    • Related: 翘辫子 qiào biàn zi - make braids - kick the bucket (Qing Dynasty reference relating to when men had to remove their braids).

      他因为加班严重而翘辫子了 - He died because of too much overtime.

  • 企图 qǐ tú - try to, seek to do something (negative)

    通过各种企图将这件事压下来,我决定发声 - Bilibili attempted to suppress the situation through different means, so I decided to speak up.

    • Note: similar to 试图 shì tú, but more negative connotations

  • 腐朽 fǔ xiǔ - degenerate, rotten

    道出了资本家的腐朽和恶臭 - Reeks of the stench and rot of capitalism.

  • 压垮 yā kuǎ - crush

    就是无情的压榨现有劳动力,能压垮一个是一个,多招一个人算我输 - It’s the callous exploitation of the current employees. The company tries to squeeze as much as possible from each and every one of them rather than hiring one more employee.

Idioms

  • 血汗工厂 xuèhàn gōngchǎng - ‘blood sweat factory’ - sweat shop

    B站因员工猝死一事,被推进了“血汗工厂”的舆论漩涡 - Bilibili has been dragged into a public debate about the company being a ‘sweatshop’ due to the sudden death of an employee.

    • Note: The pronunciation of 血 is normally xiě in colloquial phrases, and xuè in technical terms. But the rule is vague and not very helpful. In this phrase it's always xuè. But in 血汗钱 xiěhàn qián, ‘hard earned money’, xiě is more common. So confusing!

  • 混淆视听 hùnxiáo shìtīng - to muddle or confuse an issue

    晚9到早9确实不属于加班,因为是大夜班的正常时间,大厂就这样混淆视听? - 9pm to 9am does not count as overtime. But that’s because the night shift is a normal working shift for these big tech companies. They are muddling up the matter.

  • 枯燥乏味 kūzào fáwèi - boring

    做审核的确工作强度很大,而且枯燥乏味 - Being a content moderator is a very intense job. It’s also extremely boring.

    • More: 枯燥无味 kūzào wúwèi - boring (same meaning)

  • 恬不知耻 tián bù zhī chǐ - shameless

    觉得正常吗?居然还能如此恬不知耻的说“没有让他加班” - Is this normal? How can they be so shameless in saying the company did not ‘ask him to work overtime’?

  • 难上加难 nán shàng jiā nán - very difficult

    只要企业做大了,普通职工想维权难上加难 - When the company gets big it’s almost impossible for employees to protect their rights.

    • Related: 雪上加霜 xuěshàng jiāshuāng - make matters worse

Colloquial phrases

  • 万变不离其宗 wàn biàn bùlí qízōng - make ten thousand changes but remain the same in essence

    好像这些大公司公关都是万变不离其宗,核心就是推卸责任!- It seems the PR of these big companies tells a nice story, but in essence they don’t change. They are merely avoiding their responsibility.

  • 不见棺材不落泪 bùjiàn guāncai bù luò lèi - won’t cry until they see the coffin

    这也说得出口啊!真是不见血不掉泪啊 - They actually say this? Do they really have to let somebody die before they accept they are in the wrong?

    • More: I wrote more about this colloquialism in SupChina’s phrase of the week.

    • Related: 不到黄河心不死 bù dào huánghé xīn bù sǐ - not to stop until one reaches the Yellow River; refuse to give up until all hope is gone


3. RECOMMENDATIONS

Become a member of the community

As a member of the community you get access to unique resources to help you master modern Mandarin, learn, use, and understand Chinese language the way people speak it today.

  • 📚 Resources: Pleco downloads, word lists, and example sentences print-outs and audio download for each issue.

  • 🔉 Audio: audio version of the newsletter delivered as a member-only podcast every Saturday morning (before the free newsletter is published)

  • 🤓 Archive: full database of all words and phrases in the archive (nearly 1,300!) searchable according to word-type, sector and topic with audio and example sentences for each entry, updated weekly.

Use this link to claim a one-month free trial of the membership to give the full experience a go.

One-month free trial

That’s it for this week.

I look forward to seeing you in your inbox same time next weekend.

Andrew

+++

ps - please do share this newsletter on your social channels and with your networks

Share Slow Chinese 每周漫闻

Like

© 2022 Andrew Methven Unsubscribe
548 Market Street PMB 72296, San Francisco, CA 94104

Publish on Substack

diff --git a/packages/puppeteer-parse/content-handler/test/data/substack-forwarded-welcome-email.html b/packages/puppeteer-parse/content-handler/test/data/substack-forwarded-welcome-email.html new file mode 100644 index 000000000..feb14eeb3 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/data/substack-forwarded-welcome-email.html @@ -0,0 +1 @@ +


---------- Forwarded message ---------
From: Andrew Methven <slowchinese@substack.com>
Date: Thu, Dec 9, 2021 at 11:27 PM
Subject: How can Slow Chinese 每周漫闻 help you?
To: <XXXXXXXXXX@gmail.com>


Thank you for subscribing to for Slow Chinese 每周漫闻 ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

Thanks so much for subscribing to Slow Chinese 每周漫闻 and welcome aboard!

I’m excited to help you improve and practice your Chinese language skills.

Here’s a quick way I can help:

Reply to this email and tell me about your story of learning Chinese and what challenges you currently have with the language.

I’ll reply with a specific suggestion to help you.

Also, to make sure the next issue of the newsletter doesn’t land in your spam folder, add my email address to your contacts.

Thanks!

Andrew

© 2021 Andrew Methven Unsubscribe
548 Market Street PMB 72296, San Francisco, CA 94104

Publish on Substack

diff --git a/packages/puppeteer-parse/content-handler/test/data/substack-private-forwarded-newsletter.html b/packages/puppeteer-parse/content-handler/test/data/substack-private-forwarded-newsletter.html new file mode 100644 index 000000000..bb240dbe8 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/data/substack-private-forwarded-newsletter.html @@ -0,0 +1,2 @@ +


---------- Forwarded message ---------
From: giggs <darkgiggsxx@gmail.com>
Date: Wed, Mar 2, 2022 at 5:29 PM
Subject: Fwd: The German Retreat From Nuclear Power
To: Radek <radoslaw.jurga@gmail.com>



---------- Forwarded message ---------
De : Bismarck Analysis <bismarck@substack.com>
Date: mer. 2 mars 2022 à 15:02
Subject: The German Retreat From Nuclear Power
To: <darkgiggsxx@gmail.com>


Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories. ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

The German Retreat From Nuclear Power

Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories.

Isar Nuclear Power Plant near Landshut, Germany in 2016. The Isar station is scheduled to be shut down by the end of 2022. Photo by Dennis Hansch. Source.
+
diff --git a/packages/puppeteer-parse/content-handler/test/newsletter.test.ts b/packages/puppeteer-parse/content-handler/test/newsletter.test.ts new file mode 100644 index 000000000..dd3b7941c --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/newsletter.test.ts @@ -0,0 +1,191 @@ +import 'mocha' +import * as chai from 'chai' +import { expect } from 'chai' +import chaiAsPromised from 'chai-as-promised' +import chaiString from 'chai-string' +import { SubstackHandler } from '../src/newsletters/substack-handler' +import { AxiosHandler } from '../src/newsletters/axios-handler' +import { BloombergNewsletterHandler } from '../src/newsletters/bloomberg-newsletter-handler' +import { GolangHandler } from '../src/newsletters/golang-handler' +import { MorningBrewHandler } from '../src/newsletters/morning-brew-handler' +import nock from 'nock' +import { generateUniqueUrl } from '../src/content-handler' +import fs from 'fs' +import { BeehiivHandler } from '../src/newsletters/beehiiv-handler' + +chai.use(chaiAsPromised) +chai.use(chaiString) + +const load = (path: string): string => { + return fs.readFileSync(path, 'utf8') +} + +describe('Newsletter email test', () => { + describe('#getNewsletterUrl()', () => { + it('returns url when email is from SubStack', async () => { + const rawUrl = '' + + await expect( + new SubstackHandler().parseNewsletterUrl(rawUrl, '') + ).to.eventually.equal('https://hongbo130.substack.com/p/tldr') + }) + + it('returns url when email is from Axios', async () => { + const url = 'https://axios.com/blog/the-best-way-to-build-a-web-app' + const html = `View in browser at ${url}` + + await expect( + new AxiosHandler().parseNewsletterUrl('', html) + ).to.eventually.equal(url) + }) + + it('returns url when email is from Bloomberg', async () => { + const url = 'https://www.bloomberg.com/news/google-is-now-a-partner' + const html = ` + + View in browser + + ` + + await expect( + new BloombergNewsletterHandler().parseNewsletterUrl('', html) + 
).to.eventually.equal(url) + }) + + it('returns url when email is from Golang Weekly', async () => { + const url = 'https://www.golangweekly.com/first' + const html = ` + Read on the Web + ` + + await expect( + new GolangHandler().parseNewsletterUrl('', html) + ).to.eventually.equal(url) + }) + + it('returns url when email is from Morning Brew', async () => { + const url = 'https://www.morningbrew.com/daily/issues/first' + const html = ` + View Online + ` + + await expect( + new MorningBrewHandler().parseNewsletterUrl('', html) + ).to.eventually.equal(url) + }) + }) + + describe('get author from email address', () => { + it('returns author when email is from Substack', () => { + const from = 'Jackson Harper from Omnivore App ' + expect(new AxiosHandler().parseAuthor(from)).to.equal( + 'Jackson Harper from Omnivore App' + ) + }) + + it('returns author when email is from Axios', () => { + const from = 'Mike Allen ' + expect(new AxiosHandler().parseAuthor(from)).to.equal('Mike Allen') + }) + }) + + describe('isProbablyNewsletter', () => { + it('returns true for substack newsletter', async () => { + const html = load('./test/data/substack-forwarded-newsletter.html') + await expect( + new SubstackHandler().isNewsletter({ + html, + postHeader: '', + from: '', + unSubHeader: '', + }) + ).to.eventually.be.true + }) + it('returns true for private forwarded substack newsletter', async () => { + const html = load( + './test/data/substack-private-forwarded-newsletter.html' + ) + await expect( + new SubstackHandler().isNewsletter({ + html, + postHeader: '', + from: '', + unSubHeader: '', + }) + ).to.eventually.be.true + }) + it('returns false for substack welcome email', async () => { + const html = load('./test/data/substack-forwarded-welcome-email.html') + await expect( + new SubstackHandler().isNewsletter({ + html, + postHeader: '', + from: '', + unSubHeader: '', + }) + ).to.eventually.be.false + }) + it('returns true for beehiiv.com newsletter', async () => { + const html = 
load('./test/data/beehiiv-newsletter.html') + await expect( + new BeehiivHandler().isNewsletter({ + html, + postHeader: '', + from: '', + unSubHeader: '', + }) + ).to.eventually.be.true + }) + }) + + describe('findNewsletterUrl', async () => { + it('gets the URL from the header if it is a substack newsletter', async () => { + nock('https://email.mg2.substack.com') + .head( + '/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY' + ) + .reply(302, undefined, { + Location: + 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217', + }) + .get('/p/companies-that-eat-people-217') + .reply(200, '') + const html = load('./test/data/substack-forwarded-newsletter.html') + const url = await new SubstackHandler().findNewsletterUrl(html) + // Not sure if the redirects from substack expire, this test could eventually fail + expect(url).to.startWith( + 'https://newsletter.slowchinese.net/p/companies-that-eat-people-217' + ) + }).timeout(10000) + it('gets the URL from the header if it is a beehiiv newsletter', async () => { + nock('https://u23463625.ct.sendgrid.net') + .head( + '/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno' + ) + .reply(302, undefined, { + Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple', + }) + .get('/p/talked-guy-spent-30m-beeple') + .reply(200, '') 
+ const html = load('./test/data/beehiiv-newsletter.html') + const url = await new BeehiivHandler().findNewsletterUrl(html) + expect(url).to.startWith( + 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple' + ) + }) + it('returns undefined if it is not a newsletter', async () => { + const html = load('./test/data/substack-forwarded-welcome-email.html') + const url = await new SubstackHandler().findNewsletterUrl(html) + expect(url).to.be.undefined + }) + }) + + describe('generateUniqueUrl', () => { + it('generates a unique URL', () => { + const url1 = generateUniqueUrl() + const url2 = generateUniqueUrl() + + expect(url1).to.not.eql(url2) + }) + }) +}) diff --git a/packages/puppeteer-parse/content-handler/test/youtube-handler.test.ts b/packages/puppeteer-parse/content-handler/test/youtube-handler.test.ts new file mode 100644 index 000000000..beb4d3a66 --- /dev/null +++ b/packages/puppeteer-parse/content-handler/test/youtube-handler.test.ts @@ -0,0 +1,25 @@ +import { expect } from 'chai' +import 'mocha' +import { getYoutubeVideoId } from '../src/websites/youtube-handler' + +describe('getYoutubeVideoId', () => { + it('should parse video id out of a URL', async () => { + expect('BnSUk0je6oo').to.eq( + getYoutubeVideoId('https://www.youtube.com/watch?v=BnSUk0je6oo&t=269s') + ) + expect('vFD2gu007dc').to.eq( + getYoutubeVideoId( + 'https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1' + ) + ) + expect('vFD2gu007dc').to.eq( + getYoutubeVideoId('https://youtu.be/vFD2gu007dc') + ) + expect('BMFVCnbRaV4').to.eq( + getYoutubeVideoId('https://youtube.com/watch?v=BMFVCnbRaV4&feature=share') + ) + expect('cg9b4RC87LI').to.eq( + getYoutubeVideoId('https://youtu.be/cg9b4RC87LI?t=116') + ) + }) +}) diff --git a/packages/puppeteer-parse/content-handler/tsconfig.json b/packages/puppeteer-parse/content-handler/tsconfig.json new file mode 100644 index 000000000..aeb8d2c3a --- /dev/null +++ b/packages/puppeteer-parse/content-handler/tsconfig.json @@ -0,0 
+1,10 @@ +{ + "extends": "@tsconfig/node14/tsconfig.json", + "compilerOptions": { + "rootDir": ".", + "declaration": true, + "outDir": "build", + "lib": ["dom"] + }, + "include": ["src"] +} diff --git a/packages/puppeteer-parse/package.json b/packages/puppeteer-parse/package.json index 11e60b00d..0d1ab90c3 100644 --- a/packages/puppeteer-parse/package.json +++ b/packages/puppeteer-parse/package.json @@ -6,6 +6,7 @@ "dependencies": { "@google-cloud/logging-winston": "^5.1.1", "@google-cloud/storage": "^5.18.1", + "@omnivore/content-handler": "file:./../content-handler", "@sentry/serverless": "^6.13.3", "axios": "^0.27.2", "chrome-aws-lambda": "^10.1.0", diff --git a/yarn.lock b/yarn.lock index b76eae9d8..1edbd4c7a 100644 --- a/yarn.lock +++ b/yarn.lock @@ -4312,6 +4312,17 @@ dependencies: "@octokit/openapi-types" "^9.5.0" +"@omnivore/content-handler@file:./packages/content-handler": + version "1.0.0" + dependencies: + addressparser "^1.0.1" + axios "^0.27.2" + linkedom "^0.14.16" + luxon "^3.0.4" + rfc2047 "^4.0.1" + underscore "^1.13.6" + uuid "^9.0.0" + "@opentelemetry/api-metrics@0.27.0": version "0.27.0" resolved "https://registry.yarnpkg.com/@opentelemetry/api-metrics/-/api-metrics-0.27.0.tgz#d8eca344ed1155f3ea8a8133ade827b4bb90efbf"