Import content-handler as local dependency

This commit is contained in:
Hongbo Wu
2022-10-03 11:11:24 +08:00
parent 33355cb208
commit a9607adfd3
42 changed files with 1503 additions and 6 deletions

View File

@ -23,12 +23,18 @@ WORKDIR /app
ENV CHROMIUM_PATH /usr/bin/chromium-browser ENV CHROMIUM_PATH /usr/bin/chromium-browser
ENV LAUNCH_HEADLESS=true ENV LAUNCH_HEADLESS=true
COPY . /app/ COPY package.json .
WORKDIR app COPY yarn.lock .
COPY tsconfig.json .
COPY .prettierrc .
COPY .eslintrc .
COPY /packages/content-fetch ./packages/content-fetch
COPY /packages/content-handler ./packages/content-handler
RUN yarn install --pure-lockfile RUN yarn install --pure-lockfile
EXPOSE 8080 EXPOSE 8080
ENTRYPOINT ["yarn", "start"] CMD ["yarn", "workspace", "@omnivore/content-fetch", "start"]

View File

@ -34,6 +34,7 @@ COPY .prettierrc .
COPY .eslintrc . COPY .eslintrc .
COPY /packages/content-fetch ./packages/content-fetch COPY /packages/content-fetch ./packages/content-fetch
COPY /packages/content-handler ./packages/content-handler
RUN yarn install --pure-lockfile RUN yarn install --pure-lockfile

View File

@ -4,6 +4,7 @@
"description": "Service that fetches page content from a URL", "description": "Service that fetches page content from a URL",
"main": "index.js", "main": "index.js",
"dependencies": { "dependencies": {
"@omnivore/content-handler": "file:./../content-handler",
"axios": "^0.27.2", "axios": "^0.27.2",
"dotenv": "^8.2.0", "dotenv": "^8.2.0",
"express": "^4.17.1", "express": "^4.17.1",
@ -11,8 +12,7 @@
"linkedom": "^0.14.9", "linkedom": "^0.14.9",
"luxon": "^2.3.1", "luxon": "^2.3.1",
"puppeteer-core": "^16.1.0", "puppeteer-core": "^16.1.0",
"underscore": "^1.13.4", "underscore": "^1.13.4"
"@omnivore/content-handler": "1.0.0"
}, },
"scripts": { "scripts": {
"start": "node app.js", "start": "node app.js",

View File

@ -29,9 +29,9 @@
"dependencies": { "dependencies": {
"@google-cloud/functions-framework": "3.1.2", "@google-cloud/functions-framework": "3.1.2",
"@google-cloud/pubsub": "^2.18.4", "@google-cloud/pubsub": "^2.18.4",
"@omnivore/content-handler": "file:./../content-handler",
"@sendgrid/client": "^7.6.0", "@sendgrid/client": "^7.6.0",
"@sentry/serverless": "^6.16.1", "@sentry/serverless": "^6.16.1",
"@omnivore/content-handler": "1.0.0",
"addressparser": "^1.0.1", "addressparser": "^1.0.1",
"axios": "^0.27.2", "axios": "^0.27.2",
"jsonwebtoken": "^8.5.1", "jsonwebtoken": "^8.5.1",

View File

@ -0,0 +1 @@
node_modules/

View File

@ -0,0 +1,6 @@
{
"extends": "../../.eslintrc",
"parserOptions": {
"project": "tsconfig.json"
}
}

View File

@ -0,0 +1,2 @@
node_modules
/lib

View File

@ -0,0 +1,7 @@
/test/
src
tsconfig.json
.eslintrc
.eslintignore
.gitignore
mocha-config.json

View File

@ -0,0 +1,5 @@
{
"extension": ["ts"],
"spec": "test/**/*.test.ts",
"require": "test/babel-register.js"
}

View File

@ -0,0 +1,34 @@
{
"name": "@omnivore/content-handler",
"version": "1.0.0",
"description": "A standalone version of content handler to parse and format each type of content",
"main": "build/src/index.js",
"types": "build/src/index.d.ts",
"files": [
"build/src"
],
"license": "Apache-2.0",
"scripts": {
"test": "yarn mocha -r ts-node/register --config mocha-config.json",
"lint": "eslint src --ext ts,js,tsx,jsx",
"compile": "tsc",
"build": "tsc"
},
"devDependencies": {
"chai": "^4.3.6",
"chai-as-promised": "^7.1.1",
"chai-string": "^1.5.0",
"eslint-plugin-prettier": "^4.0.0",
"mocha": "^10.0.0",
"nock": "^13.2.9"
},
"dependencies": {
"addressparser": "^1.0.1",
"axios": "^0.27.2",
"linkedom": "^0.14.16",
"luxon": "^3.0.4",
"rfc2047": "^4.0.1",
"underscore": "^1.13.6",
"uuid": "^9.0.0"
}
}

View File

@ -0,0 +1,175 @@
import addressparser from 'addressparser'
import rfc2047 from 'rfc2047'
import { v4 as uuid } from 'uuid'
import { parseHTML } from 'linkedom'
import axios from 'axios'
/** Unsubscribe targets produced by `ContentHandler.parseUnsubscribe`. */
interface Unsubscribe {
// mail address to unsubscribe through, when one was found
mailTo?: string
// http(s) URL to unsubscribe through, when one was found
httpUrl?: string
}
/**
 * Raw pieces of an incoming email that the newsletter handlers work from.
 * `handleNewsletter` rejects the input when `email`, `html`, `title` or
 * `from` is empty.
 */
export interface NewsletterInput {
// header value some handlers use to detect a newsletter and derive its URL
postHeader: string
// sender, e.g. 'Mike Allen <mike@axios.com>'
from: string
// raw List-Unsubscribe header value
unSubHeader: string
email: string
// HTML body of the email
html: string
title: string
}
/** Normalized newsletter returned by `ContentHandler.handleNewsletter`. */
export interface NewsletterResult {
email: string
// the email HTML, passed through unchanged
content: string
// newsletter URL, or a generated placeholder URL when none was found
url: string
title: string
author: string
unsubMailTo?: string
unsubHttpUrl?: string
}
/**
 * Result of a handler's `preHandle` step. Every field is optional; each
 * handler fills in only the parts it wants to override.
 */
export interface PreHandleResult {
url?: string
title?: string
content?: string
contentType?: string
dom?: Document
}
/** Prefix shared by every placeholder URL handed to content without a real URL. */
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

/** Returns a placeholder URL: the fake prefix followed by a fresh v4 UUID. */
export const generateUniqueUrl = (): string => `${FAKE_URL_PREFIX}${uuid()}`
/**
 * Base class for all website and newsletter handlers.
 *
 * Subclasses override the hooks they care about; the defaults here opt out
 * of everything (`shouldResolve` / `shouldPreHandle` return false) and
 * provide a regex-driven newsletter implementation based on `senderRegex`
 * and `urlRegex`.
 */
export abstract class ContentHandler {
  /** Tested against the `from` field to recognise the newsletter sender. */
  protected senderRegex: RegExp
  /** Applied to the email HTML; capture group 1 is taken as the newsletter URL. */
  protected urlRegex: RegExp
  /** Handler name, used in log output. */
  name: string

  protected constructor() {
    // Placeholder patterns; subclasses overwrite them with real ones.
    this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
    this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
    this.name = 'Handler name'
  }

  /** Whether `resolve` should be called for this URL. Default: never. */
  shouldResolve(url: string): boolean {
    return false
  }

  /** Resolves a URL to its final target. Default: returns it unchanged. */
  async resolve(url: string): Promise<string | undefined> {
    return url
  }

  /** Whether `preHandle` should be called for this URL/DOM. Default: never. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return false
  }

  /** Pre-processes the page before parsing. Default: passes url and dom through. */
  async preHandle(url: string, dom?: Document): Promise<PreHandleResult> {
    return { url, dom }
  }

  /**
   * Default newsletter detection: the sender matches `senderRegex` and at
   * least one of the post / unsubscribe headers is non-empty.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    // Copy the pattern so this test never shares regex state with the field.
    const re = new RegExp(this.senderRegex)
    return re.test(input.from) && (!!input.postHeader || !!input.unSubHeader)
  }

  /** Finds the "view online" link in the email DOM. Default: none. */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    return undefined
  }

  /**
   * Given an HTML blob, tries to find a URL to use as the canonical URL.
   *
   * @returns the redirect-resolved URL of the header link, the raw link when
   *   the HEAD request fails, or undefined when the email has no header link.
   */
  async findNewsletterUrl(html: string): Promise<string | undefined> {
    const dom = parseHTML(html).document
    const href = this.findNewsletterHeaderHref(dom)
    if (!href) {
      return undefined
    }

    // Try to make a HEAD request, so we get the redirected URL, since these
    // will usually be behind tracking url redirects
    try {
      const response = await axios.head(href, { timeout: 5000 })
      // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
      return response.request.res.responseUrl as string | undefined
    } catch (e) {
      console.log('error making HEAD request', e)
      return href
    }
  }

  /**
   * Extracts the newsletter URL from the email HTML using `urlRegex`.
   *
   * @returns capture group 1 of the first match, or undefined without a match.
   */
  async parseNewsletterUrl(
    _postHeader: string,
    html: string
  ): Promise<string | undefined> {
    const matches = html.match(this.urlRegex)
    return matches ? matches[1] : undefined
  }

  /**
   * Gets the author name out of a sender string,
   * e.g. 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
   * or 'Mike Allen <mike@axios.com>'.
   * Falls back to the raw string when nothing could be parsed.
   */
  parseAuthor(from: string): string {
    const parsed = addressparser(from)
    if (parsed.length > 0) {
      return parsed[0].name
    }
    return from
  }

  /**
   * Parses a List-Unsubscribe header,
   * e.g. `<https://omnivore.com/unsub>, <mailto:unsub@omnivore.com>`.
   *
   * @returns the `mailto:` address as `mailTo` and the http(s) URL as
   *   `httpUrl`; either is undefined when the header has no such entry.
   */
  parseUnsubscribe(unSubHeader: string): Unsubscribe {
    const decoded = rfc2047.decode(unSubHeader)
    return {
      // Fixed: these two captures were previously assigned to the wrong keys.
      mailTo: decoded.match(/<mailto:([^>]*)>/)?.[1],
      httpUrl: decoded.match(/<(https?:\/\/[^>]*)>/)?.[1],
    }
  }

  /**
   * Turns a raw newsletter email into a `NewsletterResult`.
   *
   * @throws Error when `email`, `html`, `title` or `from` is empty.
   */
  async handleNewsletter({
    email,
    html,
    postHeader,
    title,
    from,
    unSubHeader,
  }: NewsletterInput): Promise<NewsletterResult> {
    console.log('handleNewsletter', email, postHeader, title, from)

    if (!email || !html || !title || !from) {
      console.log('invalid newsletter email')
      throw new Error('invalid newsletter email')
    }

    // fallback to default url if newsletter url does not exist
    // assign a random uuid to the default url to avoid duplicate url
    const url =
      (await this.parseNewsletterUrl(postHeader, html)) || generateUniqueUrl()
    const author = this.parseAuthor(from)
    const unsubscribe = this.parseUnsubscribe(unSubHeader)

    return {
      email,
      content: html,
      url,
      title,
      author,
      unsubMailTo: unsubscribe.mailTo || '',
      unsubHttpUrl: unsubscribe.httpUrl || '',
    }
  }
}

View File

@ -0,0 +1,116 @@
import { AppleNewsHandler } from './websites/apple-news-handler'
import { BloombergHandler } from './websites/bloomberg-handler'
import { DerstandardHandler } from './websites/derstandard-handler'
import { ImageHandler } from './websites/image-handler'
import { MediumHandler } from './websites/medium-handler'
import { PdfHandler } from './websites/pdf-handler'
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
import { TDotCoHandler } from './websites/t-dot-co-handler'
import { TwitterHandler } from './websites/twitter-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import {
ContentHandler,
NewsletterInput,
NewsletterResult,
PreHandleResult,
} from './content-handler'
import { SubstackHandler } from './newsletters/substack-handler'
import { AxiosHandler } from './newsletters/axios-handler'
import { GolangHandler } from './newsletters/golang-handler'
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
import { BeehiivHandler } from './newsletters/beehiiv-handler'
import { ConvertkitHandler } from './newsletters/convertkit-handler'
import { RevueHandler } from './newsletters/revue-handler'
/**
 * Checks that a URL is safe to fetch.
 *
 * @param url absolute URL string to check
 * @returns `true` when the URL passes every check, so the result can be used
 *   directly in a condition
 * @throws TypeError when `url` cannot be parsed
 * @throws Error when the protocol is not http(s), or the host is localhost,
 *   0.0.0.0, or begins with a 10. / 172.16. / 192.168. prefix
 */
const validateUrlString = (url: string): boolean => {
  const u = new URL(url)
  // Make sure the URL is http or https
  if (u.protocol !== 'http:' && u.protocol !== 'https:') {
    throw new Error('Invalid URL protocol check failed')
  }
  // Make sure the domain is not localhost
  if (u.hostname === 'localhost' || u.hostname === '0.0.0.0') {
    throw new Error('Invalid URL is localhost')
  }
  // Make sure the domain is not a private IP
  if (/^(10|172\.16|192\.168)\..*/.test(u.hostname)) {
    throw new Error('Invalid URL is private ip')
  }
  // Callers test the result for truthiness; without an explicit `true` a
  // valid URL looked like a failed check.
  return true
}
// Website handlers consulted by preHandleContent. They are tried in array
// order and the first handler that accepts the URL is used, so order matters.
const contentHandlers: ContentHandler[] = [
new AppleNewsHandler(),
new BloombergHandler(),
new DerstandardHandler(),
new ImageHandler(),
new MediumHandler(),
new PdfHandler(),
new ScrapingBeeHandler(),
new TDotCoHandler(),
new TwitterHandler(),
new YoutubeHandler(),
new WikipediaHandler(),
]
// Newsletter handlers consulted by handleNewsletter. They are tried in array
// order and the first one whose isNewsletter() resolves to true handles the
// email. SubstackHandler used to be listed twice; the second entry could
// never match where the first had not, so it only added a redundant HTML parse.
const newsletterHandlers: ContentHandler[] = [
  new AxiosHandler(),
  new BloombergNewsletterHandler(),
  new GolangHandler(),
  new SubstackHandler(),
  new MorningBrewHandler(),
  new BeehiivHandler(),
  new ConvertkitHandler(),
  new RevueHandler(),
]
/**
 * Runs the website handlers against a URL before the page is fetched.
 *
 * First the URL is pre-resolved by the first handler that wants to resolve
 * it; then the first handler that wants to pre-handle the (possibly
 * resolved) URL is invoked.
 *
 * @returns that handler's result, or undefined when no handler applies.
 */
export const preHandleContent = async (
  url: string,
  dom?: Document
): Promise<PreHandleResult | undefined> => {
  // Before we run the regular handlers we check to see if we need to
  // pre-resolve the URL. TODO: This should probably happen recursively,
  // so URLs can be pre-resolved, handled, pre-resolved, handled, etc.
  const resolver = contentHandlers.find((candidate) =>
    candidate.shouldResolve(url)
  )
  if (resolver) {
    try {
      const resolvedUrl = await resolver.resolve(url)
      if (resolvedUrl && validateUrlString(resolvedUrl)) {
        url = resolvedUrl
      }
    } catch (err) {
      // A failed resolution is not fatal: carry on with the original URL.
      console.log('error resolving url with handler', resolver.name, err)
    }
  }

  // Before we fetch the page we check the handlers, to see if they want
  // to perform a prefetch action that can modify our requests.
  const preHandler = contentHandlers.find((candidate) =>
    candidate.shouldPreHandle(url, dom)
  )
  if (!preHandler) {
    return undefined
  }
  console.log('preHandleContent', preHandler.name, url)
  return preHandler.preHandle(url, dom)
}
/**
 * Hands an incoming email to the first newsletter handler that recognises it.
 *
 * Handlers are probed one at a time, in registration order.
 *
 * @returns the handler's result, or undefined when no handler claims the email.
 */
export const handleNewsletter = async (
  input: NewsletterInput
): Promise<NewsletterResult | undefined> => {
  for (const candidate of newsletterHandlers) {
    const recognised = await candidate.isNewsletter(input)
    if (!recognised) {
      continue
    }
    return candidate.handleNewsletter(input)
  }
  return undefined
}
// CommonJS export of the two entry points, in addition to the ES `export`
// declarations above.
// NOTE(review): presumably kept for plain `require()` consumers — confirm
// before removing.
module.exports = {
preHandleContent,
handleNewsletter,
}

View File

@ -0,0 +1,46 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler for Axios newsletters and pages hosted on axios.com. */
export class AxiosHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@axios.com>/
    this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
    this.name = 'axios'
  }

  /** True when the URL's hostname ends with axios.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith(this.name + '.com')
  }

  /**
   * Strips ads, header and footer from the email layout and replaces the
   * content table with a div.
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const outerTable = dom.querySelector('table')
    // Once the content table has been converted, every later table is
    // treated as footer and removed.
    let contentConverted = false

    outerTable?.querySelectorAll('table').forEach((table) => {
      const hasTooLittleText =
        !table.textContent || table.textContent.length < 20
      if (hasTooLittleText || contentConverted) {
        // footer or ad
        table.remove()
        return
      }

      // Drop the leading header rows and the trailing ad rows. The row count
      // is deliberately re-read on every iteration, exactly as before, so it
      // shrinks as rows are removed.
      table.querySelectorAll('tr').forEach((row, index) => {
        const shouldDrop =
          index <= 7 || index >= table.querySelectorAll('tr').length - 2
        if (shouldDrop) {
          console.log('removing', row)
          row.remove()
        }
      })

      // swap the table for a div carrying the same markup
      const replacement = dom.createElement('div')
      replacement.innerHTML = table.innerHTML
      table.parentNode?.replaceChild(replacement, table)

      contentConverted = true
    })

    return { dom }
  }
}

View File

@ -0,0 +1,43 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/** Handler for newsletters sent through beehiiv. */
export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  /**
   * Returns the href of the "Read Online" link. When several links carry
   * that text the last one decides the result.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = dom.querySelectorAll('table tr td div a[class*="link"]')
    let href: string | undefined = undefined
    for (const anchor of Array.from(candidates)) {
      if (anchor.textContent === 'Read Online') {
        href = anchor.getAttribute('href') || undefined
      }
    }
    return href
  }

  /**
   * An email counts as a beehiiv newsletter when it embeds an image whose
   * src contains beehiiv.net and has a "Read Online" link.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const { document } = parseHTML(input.html)
    const hasBeehiivImage =
      document.querySelectorAll('img[src*="beehiiv.net"]').length > 0
    return hasBeehiivImage && !!this.findNewsletterHeaderHref(document)
  }

  /** The URL always comes from the header link; the post header is ignored. */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -0,0 +1,37 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler for Bloomberg newsletter emails. */
export class BloombergNewsletterHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@mail.bloomberg.*.com>/
    this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
    this.name = 'bloomberg'
  }

  /**
   * True when the hostname ends with bloomberg.com, or the logo image's alt
   * text (case-insensitively) equals the handler name.
   */
  shouldPreHandle(url: string, dom: Document): boolean {
    if (new URL(url).hostname.endsWith(this.name + '.com')) {
      return true
    }
    const logoAlt = dom.querySelector('.logo-image')?.getAttribute('alt')
    return logoAlt?.toLowerCase() === this.name
  }

  /** Removes the first header and footer element of each kind from the wrapper. */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const wrapper = dom.querySelector('.wrapper')

    const headerSelectors = [
      '.sailthru-variables',
      '.preview-text',
      '.logo-wrapper',
      '.by-the-number-wrapper',
    ]
    const footerSelectors = [
      '.quote-box-wrapper',
      '.header-wrapper',
      '.component-wrapper',
      '.footer',
    ]

    for (const selector of [...headerSelectors, ...footerSelectors]) {
      wrapper?.querySelector(selector)?.remove()
    }

    return { dom }
  }
}

View File

@ -0,0 +1,41 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/** Handler for newsletters sent through ConvertKit. */
export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  /**
   * Returns the href of the "View this email in your browser" link. When
   * several links carry that text the last one decides the result.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = dom.querySelectorAll('table tr td a')
    let href: string | undefined = undefined
    for (const anchor of Array.from(candidates)) {
      if (anchor.textContent === 'View this email in your browser') {
        href = anchor.getAttribute('href') || undefined
      }
    }
    return href
  }

  /** An email counts as ConvertKit when it embeds an image served by ConvertKit. */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const { document } = parseHTML(input.html)
    const convertkitImages = document.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
    )
    return convertkitImages.length > 0
  }

  /** The URL always comes from the header link; the post header is ignored. */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -0,0 +1,27 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler for the Golang Weekly newsletter. */
export class GolangHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /<.+@golangweekly.com>/
    this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
    this.name = 'golangweekly'
  }

  /** True when the URL's hostname ends with golangweekly.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith(this.name + '.com')
  }

  /** Removes the "Subscribe" bar and the masthead title from the body. */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const pageBody = dom.querySelector('body')

    // "Subscribe" button first, then the title
    for (const selector of ['.el-splitbar', '.el-masthead']) {
      pageBody?.querySelector(selector)?.remove()
    }

    return { dom }
  }
}

View File

@ -0,0 +1,35 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler for the Morning Brew newsletter. */
export class MorningBrewHandler extends ContentHandler {
  constructor() {
    super()
    this.senderRegex = /Morning Brew <crew@morningbrew.com>/
    this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
    this.name = 'morningbrew'
  }

  /** True when the URL's hostname ends with morningbrew.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith(this.name + '.com')
  }

  /**
   * Flattens the market-info tables: the nested bubble table is replaced by
   * its text element and the surrounding table gets a custom class.
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const arrowCells = Array.from(dom.querySelectorAll('.markets-arrow-cell'))

    for (const cell of arrowCells) {
      const marketsTable = cell.closest('table')
      if (!marketsTable) {
        continue
      }

      const bubble = marketsTable.querySelector('.markets-bubble')
      const bubbleText = bubble?.querySelector('.markets-table-text')
      if (bubble && bubbleText) {
        // replace the nested table with the text
        bubble.parentNode?.replaceChild(bubbleText, bubble)
      }

      // set custom class for the table
      marketsTable.className = 'morning-brew-markets'
    }

    return { dom }
  }
}

View File

@ -0,0 +1,46 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/** Handler for newsletters sent through Revue. */
export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  /**
   * Returns the href of the "View online" link. When several links carry
   * that text the last one decides the result.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = dom.querySelectorAll('table tr td a[target="_blank"]')
    let href: string | undefined = undefined
    for (const anchor of Array.from(candidates)) {
      if (anchor.textContent === 'View online') {
        href = anchor.getAttribute('href') || undefined
      }
    }
    return href
  }

  /**
   * An email counts as a Revue newsletter when it embeds a Revue-hosted
   * image and has a "View online" link.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const { document } = parseHTML(input.html)
    const revueImages = document.querySelectorAll(
      'img[src*="getrevue.co"], img[src*="revue.email"]'
    )
    return revueImages.length > 0 && !!this.findNewsletterHeaderHref(document)
  }

  /** The URL always comes from the header link; the post header is ignored. */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -0,0 +1,90 @@
import addressparser from 'addressparser'
import { ContentHandler, PreHandleResult } from '../content-handler'
import { parseHTML } from 'linkedom'
/** Handler for Substack newsletters and pages hosted on substack.com. */
export class SubstackHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'substack'
  }

  /**
   * True when the URL's hostname ends with substack.com, or the first image
   * in the email body is hosted at substack.com.
   */
  shouldPreHandle(url: string, dom: Document): boolean {
    const host = this.name + '.com'
    return (
      new URL(url).hostname.endsWith(host) ||
      !!dom
        .querySelector('.email-body img')
        ?.getAttribute('src')
        ?.includes(host)
    )
  }

  /**
   * Removes header, avatar, meta button and footer elements from the email
   * body container.
   *
   * @returns the cleaned document wrapped as `{ dom }`. The Document itself
   *   used to be returned, which left `result.dom` undefined for callers.
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    const body = dom.querySelector('.email-body-container')

    // this removes header and profile avatar
    body?.querySelector('.header')?.remove()
    body?.querySelector('.preamble')?.remove()
    body?.querySelector('.meta-author-wrap')?.remove()
    // this removes meta button
    body?.querySelector('.post-meta')?.remove()
    // this removes footer
    body?.querySelector('.post-cta')?.remove()
    body?.querySelector('.container-border')?.remove()
    body?.querySelector('.footer')?.remove()

    return { dom }
  }

  /** Returns the href of the first link inside an h1 (the Substack header link). */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const postLink = dom.querySelector('h1 a ')
    if (postLink) {
      return postLink.getAttribute('href') || undefined
    }
    return undefined
  }

  /**
   * An email counts as a Substack newsletter when it has a post header, a
   * table whose class ends in "post-meta", or a header link together with
   * one of the Substack heart / recommend icons.
   */
  async isNewsletter({
    postHeader,
    html,
  }: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    if (postHeader) {
      return true
    }

    const dom = parseHTML(html).document
    // substack newsletter emails have tables with a *post-meta class
    if (dom.querySelector('table[class$="post-meta"]')) {
      return true
    }

    // If the article has a header link, and substack icons its probably a newsletter
    const href = this.findNewsletterHeaderHref(dom)
    const heartIcon = dom.querySelector(
      'table tbody td span a img[src*="HeartIcon"]'
    )
    const recommendIcon = dom.querySelector(
      'table tbody td span a img[src*="RecommendIconRounded"]'
    )
    return !!(href && (heartIcon || recommendIcon))
  }

  /**
   * Takes the URL from the post header when it parses, otherwise falls back
   * to the header link in the HTML.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const parsedHeader = postHeader ? addressparser(postHeader) : []
    if (parsedHeader.length > 0) {
      return parsedHeader[0].name
    }
    return this.findNewsletterUrl(html)
  }
}

View File

@ -0,0 +1,31 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler that maps apple.news share links to the publisher's article URL. */
export class AppleNewsHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Apple News'
  }

  /** True only for URLs whose hostname is exactly apple.news. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return new URL(url).hostname === 'apple.news'
  }

  /**
   * Fetches the apple.news page and reads the link wrapped around its
   * "click here" span.
   *
   * @returns `{ url }` with the link target, or `{ url: undefined }` when the
   *   page has no such link.
   * @throws when the request fails or the link is not a valid URL.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    const MOBILE_USER_AGENT =
      'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36'
    const requestOptions = { headers: { 'User-Agent': MOBILE_USER_AGENT } }

    const { data } = await axios.get(url, requestOptions)
    const page = parseHTML(data as string).document

    const clickHere = page.querySelector('span.click-here')
    const href = clickHere?.parentElement?.getAttribute('href')
    if (!href) {
      return { url: undefined }
    }
    // wrapping in new URL makes sure the href is a valid URL
    return { url: new URL(href).href }
  }
}

View File

@ -0,0 +1,41 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler that fetches Bloomberg pages through the ScrapingBee API. */
export class BloombergHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Bloomberg'
  }

  /** True when the URL matches the Bloomberg URL pattern. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const BLOOMBERG_URL_MATCH =
      /https?:\/\/(www\.)?bloomberg.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_+.~#?&/=]*)/
    const candidate = url.toString()
    return BLOOMBERG_URL_MATCH.test(candidate)
  }

  /**
   * Downloads the page source via ScrapingBee.
   *
   * @returns the document title, the body's inner HTML and the original URL.
   * @throws whatever the request or parsing throws, after logging it.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling bloomberg url', url)

    const scrapingBeeParams = {
      api_key: process.env.SCRAPINGBEE_API_KEY,
      url: url,
      return_page_source: true,
      block_ads: true,
      block_resources: false,
    }

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: scrapingBeeParams,
      })
      const page = parseHTML(response.data).document
      const bodyHtml = page.querySelector('body')?.innerHTML
      return { title: page.title, content: bodyHtml, url: url }
    } catch (error) {
      console.error('error prehandling bloomberg url', error)
      throw error
    }
  }
}

View File

@ -0,0 +1,34 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
/** Handler for www.derstandard.at articles, which sit behind a consent wall. */
export class DerstandardHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Derstandard'
  }

  /** True only for URLs whose hostname is exactly www.derstandard.at. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return new URL(url).hostname === 'www.derstandard.at'
  }

  /**
   * Fetches the article with a consent cookie, detaches the title element
   * and returns the remaining body HTML together with the title text.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    // set cookie to give consent to get the article
    const consentHeaders = {
      cookie: `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`,
    }
    const { data } = await axios.get(url, { headers: consentHeaders })
    const page = parseHTML(data as string).document

    // The title is taken out of the body so it is not repeated in the content.
    const titleElement = page.querySelector('.article-title')
    titleElement?.remove()

    const content = page.body.outerHTML
    const title = titleElement?.textContent || undefined
    return { content, title }
  }
}

View File

@ -0,0 +1,32 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/**
 * Escapes the characters that are significant in HTML text and in quoted
 * attribute values.
 */
const escapeHtml = (value: string): string =>
  value
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#x27;')

/** Handler that wraps a bare image URL in a minimal HTML page. */
export class ImageHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Image'
  }

  /** True when the URL contains an http(s) link to a jpg, jpeg, png or webp file. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const IMAGE_URL_PATTERN = /(https?:\/\/.*\.(?:jpg|jpeg|png|webp))/i
    return IMAGE_URL_PATTERN.test(url.toString())
  }

  /**
   * Builds an HTML page that shows the image.
   *
   * The URL is caller-supplied and only loosely validated by
   * `shouldPreHandle`, so it and the title derived from it are HTML-escaped
   * before being placed in the markup.
   *
   * @returns the title (last path segment of the URL, or 'Image') and the
   *   generated page. The returned `title` field is left unescaped, as before.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    const title = url.toString().split('/').pop() || 'Image'
    const safeTitle = escapeHtml(title)
    const safeUrl = escapeHtml(url.toString())
    const content = `
<html>
<head>
<title>${safeTitle}</title>
<meta property="og:image" content="${safeUrl}" />
<meta property="og:title" content="${safeTitle}" />
</head>
<body>
<div>
<img src="${safeUrl}" alt="${safeTitle}">
</div>
</body>
</html>`

    return { title, content }
  }
}

View File

@ -0,0 +1,26 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler that strips the `source` query parameter from medium.com links. */
export class MediumHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Medium'
  }

  /** True when the URL's hostname ends with medium.com. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname.endsWith('medium.com')
  }

  /**
   * @returns `{ url }` holding the URL without its `source` query parameter.
   * @throws when the URL cannot be parsed, after logging the error.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling medium url', url)

    try {
      const cleaned = new URL(url)
      cleaned.searchParams.delete('source')
      return { url: cleaned.toString() }
    } catch (error) {
      console.error('error prehandling medium url', error)
      throw error
    }
  }
}

View File

@ -0,0 +1,18 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/** Handler that flags URLs pointing at PDF files. */
export class PdfHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'PDF'
  }

  /** True when the URL path, with the query string stripped out, ends in .pdf. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { pathname, search } = new URL(url)
    const pathWithoutQuery = pathname.replace(search, '')
    return pathWithoutQuery.endsWith('.pdf')
  }

  /** Reports the PDF content type; the URL is neither fetched nor altered. */
  async preHandle(_url: string, document?: Document): Promise<PreHandleResult> {
    return { contentType: 'application/pdf' }
  }
}

View File

@ -0,0 +1,38 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { parseHTML } from 'linkedom'
/** Handler that fetches selected sites through the ScrapingBee API. */
export class ScrapingBeeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ScrapingBee'
  }

  /** True when the hostname ends with one of the listed domains. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    const hostnames = ['nytimes.com', 'news.google.com']
    for (const suffix of hostnames) {
      if (hostname.endsWith(suffix)) {
        return true
      }
    }
    return false
  }

  /**
   * Downloads the page source via ScrapingBee.
   *
   * @returns the document title, the raw page source and the original URL.
   * @throws whatever the request or parsing throws, after logging it.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling url with scrapingbee', url)

    const scrapingBeeParams = {
      api_key: process.env.SCRAPINGBEE_API_KEY,
      url: url,
      return_page_source: true,
      block_ads: true,
      block_resources: false,
    }

    try {
      const response = await axios.get('https://app.scrapingbee.com/api/v1', {
        params: scrapingBeeParams,
      })
      const page = parseHTML(response.data).document
      return { title: page.title, content: response.data as string, url: url }
    } catch (error) {
      console.error('error prehandling url w/scrapingbee', error)
      throw error
    }
  }
}

View File

@ -0,0 +1,26 @@
import { ContentHandler } from '../content-handler'
import axios from 'axios'
/** Handler that expands t.co short links to their target. */
export class TDotCoHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 't.co'
  }

  /** True for https://t.co/... links, with or without a www. prefix. */
  shouldResolve(url: string): boolean {
    const T_DOT_CO_URL_MATCH = /^https:\/\/(?:www\.)?t\.co\/.*$/
    return T_DOT_CO_URL_MATCH.test(url)
  }

  /**
   * Requests the short link without following redirects and reads the
   * `location` response header.
   *
   * @returns the normalized target URL, or undefined when the request fails
   *   or the header is not a valid URL.
   */
  async resolve(url: string) {
    try {
      // Any status is accepted: the redirect response itself is what we want.
      const res = await axios.get(url, {
        maxRedirects: 0,
        validateStatus: null,
      })
      return new URL(res.headers.location).href
    } catch (err) {
      console.log('err with t.co url', err)
      return undefined
    }
  }
}

View File

@ -0,0 +1,167 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import { DateTime } from 'luxon'
import _ from 'underscore'
// Bearer token for the Twitter API, read from the environment when the
// module is loaded.
const TWITTER_BEARER_TOKEN = process.env.TWITTER_BEARER_TOKEN
// Matches tweet status URLs. Capture group 1 is the path segment before
// /status (or /statuses); group 2 is the numeric tweet id.
const TWITTER_URL_MATCH =
/twitter\.com\/(?:#!\/)?(\w+)\/status(?:es)?\/(\d+)(?:\/.*)?/
/**
 * Builds the query-string fragment listing the tweet, expansion, user and
 * media fields requested from the Twitter API. Every parameter, including
 * the first, is prefixed with '&'.
 */
const getTweetFields = () => {
  const requested: [string, string[]][] = [
    [
      'tweet.fields',
      [
        'attachments',
        'author_id',
        'conversation_id',
        'created_at',
        'entities',
        'geo',
        'in_reply_to_user_id',
        'lang',
        'possibly_sensitive',
        'public_metrics',
        'referenced_tweets',
        'source',
        'withheld',
      ],
    ],
    ['expansions', ['author_id', 'attachments.media_keys']],
    [
      'user.fields',
      [
        'created_at',
        'description',
        'entities',
        'location',
        'pinned_tweet_id',
        'profile_image_url',
        'protected',
        'public_metrics',
        'url',
        'verified',
        'withheld',
      ],
    ],
    [
      'media.fields',
      [
        'duration_ms',
        'height',
        'preview_image_url',
        'url',
        'media_key',
        'public_metrics',
        'width',
      ],
    ],
  ]

  return requested
    .map(([param, fields]) => '&' + param + '=' + fields.join(','))
    .join('')
}
/**
 * Requests a single tweet, with all fields from getTweetFields, from the
 * Twitter v2 API.
 *
 * @throws Error when no bearer token is configured.
 */
const getTweetById = async (id: string) => {
  const BASE_ENDPOINT = 'https://api.twitter.com/2/tweets/'
  const endpoint = new URL(`${BASE_ENDPOINT}${id}?${getTweetFields()}`)

  if (!TWITTER_BEARER_TOKEN) {
    throw new Error('No Twitter bearer token found')
  }

  const headers = {
    Authorization: `Bearer ${TWITTER_BEARER_TOKEN}`,
    redirect: 'follow',
  }
  return axios.get(endpoint.toString(), { headers })
}
/** Page title for a tweet: the author's name followed by " on Twitter". */
const titleForAuthor = (author: { name: string }) => {
  const { name } = author
  return `${name} on Twitter`
}
/** Extracts the numeric tweet id from a status URL, or undefined when the URL does not match. */
const tweetIdFromStatusUrl = (url: string): string | undefined => {
  const groups = TWITTER_URL_MATCH.exec(url.toString())
  if (groups === null) {
    return undefined
  }
  return groups[2]
}
/** Formats a timestamp string with luxon's DATETIME_FULL preset. */
const formatTimestamp = (timestamp: string) => {
  const asDate = new Date(timestamp)
  const asDateTime = DateTime.fromJSDate(asDate)
  return asDateTime.toLocaleString(DateTime.DATETIME_FULL)
}
/** Handler that renders a single tweet as a small HTML document. */
export class TwitterHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Twitter'
  }

  /** True when a bearer token is configured and the URL is a tweet status URL. */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return !!TWITTER_BEARER_TOKEN && TWITTER_URL_MATCH.test(url.toString())
  }

  /**
   * Fetches the tweet behind a status URL and builds an HTML page from it:
   * the text with expanded links, any attached media, and an author line.
   *
   * @returns the generated content, the original URL and an HTML-escaped title.
   * @throws Error when the URL carries no tweet id, or when the response
   *   contains no user whose id matches the tweet's author; request errors
   *   propagate unchanged.
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    console.log('prehandling twitter url', url)

    const tweetId = tweetIdFromStatusUrl(url)
    if (!tweetId) {
      throw new Error('could not find tweet id in url')
    }

    const tweetData = (await getTweetById(tweetId)).data as {
      data: {
        author_id: string
        text: string
        entities?: {
          urls?: {
            url: string
            expanded_url: string
            display_url: string
          }[]
        }
        created_at: string
      }
      includes: {
        users: {
          id: string
          name: string
          profile_image_url: string
          username: string
        }[]
        media?: {
          preview_image_url: string
          type: string
          url: string
        }[]
      }
    }

    const authorId = tweetData.data.author_id
    // Compare ids. The previous `u.id = authorId` was an assignment, which
    // always selected the first user and overwrote every user's id.
    const author = tweetData.includes.users.find((u) => u.id === authorId)
    if (!author) {
      throw new Error('could not find tweet author')
    }

    // escape html entities in title
    const title = _.escape(titleForAuthor(author))
    const authorImage = author.profile_image_url.replace('_normal', '_400x400')

    // Replace each shortened link in the text with an anchor to its target.
    let text = tweetData.data.text
    for (const urlObj of tweetData.data.entities?.urls ?? []) {
      text = text.replace(
        urlObj.url,
        `<a href="${urlObj.expanded_url}">${urlObj.display_url}</a>`
      )
    }

    const front = `
<div>
<p>${text}</p>
`

    // Photos link to themselves; other media link back to the tweet and
    // show their preview image. Attribute values are quoted so URLs cannot
    // break the markup.
    const includesHtml = (tweetData.includes.media ?? [])
      .map((m) => {
        const isPhoto = m.type == 'photo'
        const linkUrl = isPhoto ? m.url : url
        const previewUrl = isPhoto ? m.url : m.preview_image_url
        return `<a class="media-link" href="${linkUrl}">
<picture>
<img class="tweet-img" src="${previewUrl}" />
</picture>
</a>`
      })
      .join('\n')

    const tweetedAt = formatTimestamp(tweetData.data.created_at)
    const back = `
— <a href="https://twitter.com/${author.username}">${author.username}</a> ${author.name} <a href="${url}">${tweetedAt}</a>
</div>
`

    const description = _.escape(tweetData.data.text)
    const content = `
<head>
<meta property="og:image" content="${authorImage}" />
<meta property="og:image:secure_url" content="${authorImage}" />
<meta property="og:title" content="${title}" />
<meta property="og:description" content="${description}" />
</head>
<body>
${front}
${includesHtml}
${back}
</body>`

    return { content, url, title }
  }
}

View File

@ -0,0 +1,20 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
/**
 * Content handler for Wikipedia pages.
 *
 * Before the page is processed further it strips the section "[edit]" links
 * and the first infobox from the DOM.
 */
export class WikipediaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'wikipedia'
  }

  /**
   * Reports whether `url` is hosted on wikipedia.org or one of its subdomains.
   *
   * The comparison is anchored on a domain-label boundary so that look-alike
   * hosts (for example `notwikipedia.org`) are not claimed by this handler.
   *
   * @param url absolute URL of the page; `new URL` throws if it cannot be parsed
   * @param dom unused by this handler
   */
  shouldPreHandle(url: string, dom?: Document): boolean {
    const { hostname } = new URL(url)
    return hostname === 'wikipedia.org' || hostname.endsWith('.wikipedia.org')
  }

  /**
   * Removes Wikipedia-specific chrome from `dom` in place and returns it.
   *
   * @param url unused by this handler
   * @param dom parsed page; mutated by this call
   * @returns a result carrying the same (now cleaned) document
   */
  async preHandle(url: string, dom: Document): Promise<PreHandleResult> {
    // This removes the [edit] anchors from wikipedia pages
    dom.querySelectorAll('.mw-editsection').forEach((e) => e.remove())
    // This removes the sidebar; only the first `.infobox` element is dropped
    dom.querySelector('.infobox')?.remove()
    return Promise.resolve({ dom })
  }
}

View File

@ -0,0 +1,76 @@
import { ContentHandler, PreHandleResult } from '../content-handler'
import axios from 'axios'
import _ from 'underscore'
/**
 * Matches YouTube page URLs (youtube.com and youtu.be, with an optional
 * scheme and an optional `www.`/`m.` prefix).
 *
 * Capture group 5 holds the path segment that is treated as the video id.
 * Both dots in the host names are escaped so that only the literal hosts
 * match (an unescaped `.` would accept any character in that position).
 */
const YOUTUBE_URL_MATCH =
  /^((?:https?:)?\/\/)?((?:www|m)\.)?((?:youtube\.com|youtu\.be))(\/(?:[\w-]+\?v=|embed\/|v\/)?)([\w-]+)(\S+)?$/

/**
 * Extracts the video id from a YouTube URL.
 *
 * The `v` query parameter wins when present; otherwise the id is taken from
 * the path via YOUTUBE_URL_MATCH (e.g. `https://youtu.be/<id>`).
 *
 * @param url the URL to inspect; scheme-less forms accepted by the pattern
 *            (such as `youtu.be/<id>`) are supported as well
 * @returns the video id, or `undefined` when none can be found
 */
export const getYoutubeVideoId = (url: string) => {
  let fromQuery: string | null = null
  try {
    fromQuery = new URL(url).searchParams.get('v')
  } catch {
    // `new URL` rejects input without a scheme, which the pattern explicitly
    // allows, so fall through to the regex instead of throwing.
  }
  if (fromQuery) {
    return fromQuery
  }

  const match = YOUTUBE_URL_MATCH.exec(url)
  return match?.[5] || undefined
}
/**
 * Content handler that replaces a YouTube page with a small HTML document
 * embedding the video, built from YouTube's oEmbed metadata.
 */
export class YoutubeHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'Youtube'
  }

  /**
   * Reports whether `url` looks like a YouTube URL (per YOUTUBE_URL_MATCH).
   *
   * @param url URL of the page
   * @param dom unused by this handler
   */
  shouldPreHandle(url: string, dom?: Document): boolean {
    return YOUTUBE_URL_MATCH.test(url)
  }

  /**
   * Fetches oEmbed metadata for the video behind `url` and renders an HTML
   * page containing an embedded player, a link back to the video and the
   * author.
   *
   * Every value interpolated into the markup is HTML-escaped (or URL-encoded
   * for the embed path) because the URL and the oEmbed payload are external
   * input.
   *
   * @param url URL of the YouTube page
   * @param document unused by this handler
   * @returns an empty result when no video id can be extracted, otherwise the
   *          generated content; rejects if the oEmbed request fails
   */
  async preHandle(url: string, document?: Document): Promise<PreHandleResult> {
    const videoId = getYoutubeVideoId(url)
    if (!videoId) {
      return {}
    }

    const oembedUrl =
      `https://www.youtube.com/oembed?format=json&url=` +
      encodeURIComponent(`https://www.youtube.com/watch?v=${videoId}`)
    const oembed = (await axios.get(oembedUrl)).data as {
      title: string
      width: number
      height: number
      thumbnail_url: string
      author_name: string
      author_url: string
    }

    // escape html entities in everything that ends up inside the markup
    const title = _.escape(oembed.title)
    const authorName = _.escape(oembed.author_name)
    const authorUrl = _.escape(oembed.author_url)
    const thumbnail = _.escape(oembed.thumbnail_url)
    const pageUrl = _.escape(url)
    // the id may come straight from the `v` query parameter, so encode it
    // before placing it in the embed path
    const embedId = encodeURIComponent(videoId)

    // keep the player at a fixed height and derive the width from the
    // aspect ratio reported by oEmbed
    const ratio = oembed.width / oembed.height
    const height = 350
    const width = height * ratio

    const content = `
    <html>
      <head><title>${title}</title>
      <meta property="og:image" content="${thumbnail}" />
      <meta property="og:image:secure_url" content="${thumbnail}" />
      <meta property="og:title" content="${title}" />
      <meta property="og:description" content="" />
      <meta property="og:article:author" content="${authorName}" />
      </head>
      <body>
      <iframe width="${width}" height="${height}" src="https://www.youtube.com/embed/${embedId}" title="${title}" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>
        <p><a href="${pageUrl}" target="_blank">${title}</a></p>
        <p itemscope="" itemprop="author" itemtype="http://schema.org/Person">By <a href="${authorUrl}" target="_blank">${authorName}</a></p>
      </body>
    </html>`

    console.log('got video id', videoId)

    return { content, title: 'Youtube Content' }
  }
}

View File

@ -0,0 +1,10 @@
import { AppleNewsHandler } from '../src/websites/apple-news-handler'
// Smoke test: runs the Apple News handler against a share link and logs the
// result. It makes no assertions about the returned value.
describe('open a simple web page', () => {
  it('should return a response', async () => {
    const handler = new AppleNewsHandler()
    const result = await handler.preHandle(
      'https://apple.news/AxjzaZaPvSn23b67LhXI5EQ'
    )

    console.log('response', result)
  })
})

View File

@ -0,0 +1,3 @@
// Invoke @babel/register's default export, passing the file extensions it
// should be applied to.
require('@babel/register').default({
  extensions: ['.ts', '.tsx', '.js', '.jsx'],
})

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,191 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import chaiAsPromised from 'chai-as-promised'
import chaiString from 'chai-string'
import { SubstackHandler } from '../src/newsletters/substack-handler'
import { AxiosHandler } from '../src/newsletters/axios-handler'
import { BloombergNewsletterHandler } from '../src/newsletters/bloomberg-newsletter-handler'
import { GolangHandler } from '../src/newsletters/golang-handler'
import { MorningBrewHandler } from '../src/newsletters/morning-brew-handler'
import nock from 'nock'
import { generateUniqueUrl } from '../src/content-handler'
import fs from 'fs'
import { BeehiivHandler } from '../src/newsletters/beehiiv-handler'
chai.use(chaiAsPromised)
chai.use(chaiString)
/** Reads the file at `path` synchronously and returns its contents decoded as UTF-8. */
const load = (path: string): string => fs.readFileSync(path, 'utf8')
describe('Newsletter email test', () => {
// Each case hands a newsletter handler a minimal header value or HTML snippet
// and checks the URL that parseNewsletterUrl resolves to.
describe('#getNewsletterUrl()', () => {
  it('returns url when email is from SubStack', async () => {
    const bracketedUrl = '<https://hongbo130.substack.com/p/tldr>'
    const parsed = new SubstackHandler().parseNewsletterUrl(bracketedUrl, '')

    await expect(parsed).to.eventually.equal(
      'https://hongbo130.substack.com/p/tldr'
    )
  })

  it('returns url when email is from Axios', async () => {
    const expectedUrl = 'https://axios.com/blog/the-best-way-to-build-a-web-app'
    const emailHtml = `View in browser at <a>${expectedUrl}</a>`
    const parsed = new AxiosHandler().parseNewsletterUrl('', emailHtml)

    await expect(parsed).to.eventually.equal(expectedUrl)
  })

  it('returns url when email is from Bloomberg', async () => {
    const expectedUrl = 'https://www.bloomberg.com/news/google-is-now-a-partner'
    const emailHtml = `
<a class="view-in-browser__url" href="${expectedUrl}">
View in browser
</a>
`
    const parsed = new BloombergNewsletterHandler().parseNewsletterUrl(
      '',
      emailHtml
    )

    await expect(parsed).to.eventually.equal(expectedUrl)
  })

  it('returns url when email is from Golang Weekly', async () => {
    const expectedUrl = 'https://www.golangweekly.com/first'
    const emailHtml = `
<a href="${expectedUrl}" style="text-decoration: none">Read on the Web</a>
`
    const parsed = new GolangHandler().parseNewsletterUrl('', emailHtml)

    await expect(parsed).to.eventually.equal(expectedUrl)
  })

  it('returns url when email is from Morning Brew', async () => {
    const expectedUrl = 'https://www.morningbrew.com/daily/issues/first'
    const emailHtml = `
<a style="color: #000000; text-decoration: none;" target="_blank" rel="noopener" href="${expectedUrl}">View Online</a>
`
    const parsed = new MorningBrewHandler().parseNewsletterUrl('', emailHtml)

    await expect(parsed).to.eventually.equal(expectedUrl)
  })
})
// Checks that parseAuthor returns the display-name part of a
// "Name <address>" From header.
describe('get author from email address', () => {
  it('returns author when email is from Substack', () => {
    const from = 'Jackson Harper from Omnivore App <jacksonh@substack.com>'
    // exercise the Substack handler here so the case tests what its title says
    expect(new SubstackHandler().parseAuthor(from)).to.equal(
      'Jackson Harper from Omnivore App'
    )
  })

  it('returns author when email is from Axios', () => {
    const from = 'Mike Allen <mike@axios.com>'
    expect(new AxiosHandler().parseAuthor(from)).to.equal('Mike Allen')
  })
})
// Runs isNewsletter against HTML fixtures loaded from ./test/data; only the
// `html` field is populated, the header fields are left empty.
describe('isProbablyNewsletter', () => {
  // Builds the isNewsletter input for a fixture file.
  const emailFromFixture = (fixturePath: string) => ({
    html: load(fixturePath),
    postHeader: '',
    from: '',
    unSubHeader: '',
  })

  it('returns true for substack newsletter', async () => {
    const email = emailFromFixture(
      './test/data/substack-forwarded-newsletter.html'
    )
    const verdict = new SubstackHandler().isNewsletter(email)

    await expect(verdict).to.eventually.be.true
  })

  it('returns true for private forwarded substack newsletter', async () => {
    const email = emailFromFixture(
      './test/data/substack-private-forwarded-newsletter.html'
    )
    const verdict = new SubstackHandler().isNewsletter(email)

    await expect(verdict).to.eventually.be.true
  })

  it('returns false for substack welcome email', async () => {
    const email = emailFromFixture(
      './test/data/substack-forwarded-welcome-email.html'
    )
    const verdict = new SubstackHandler().isNewsletter(email)

    await expect(verdict).to.eventually.be.false
  })

  it('returns true for beehiiv.com newsletter', async () => {
    const email = emailFromFixture('./test/data/beehiiv-newsletter.html')
    const verdict = new BeehiivHandler().isNewsletter(email)

    await expect(verdict).to.eventually.be.true
  })
})
// Exercises findNewsletterUrl against HTML fixtures. The tracking-link hosts
// are stubbed with nock: a HEAD on the tracking path answers with a 302 whose
// Location header is the expected article URL.
//
// The suite callback is deliberately synchronous: mocha registers the `it`
// blocks while the callback runs and ignores any promise it returns.
describe('findNewsletterUrl', () => {
  it('gets the URL from the header if it is a substack newsletter', async () => {
    nock('https://email.mg2.substack.com')
      .head(
        '/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
      )
      .reply(302, undefined, {
        Location:
          'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
      })
      .get('/p/companies-that-eat-people-217')
      .reply(200, '')
    const html = load('./test/data/substack-forwarded-newsletter.html')
    const url = await new SubstackHandler().findNewsletterUrl(html)
    // Not sure if the redirects from substack expire, this test could eventually fail
    expect(url).to.startWith(
      'https://newsletter.slowchinese.net/p/companies-that-eat-people-217'
    )
  }).timeout(10000)

  it('gets the URL from the header if it is a beehiiv newsletter', async () => {
    nock('https://u23463625.ct.sendgrid.net')
      .head(
        '/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
      )
      .reply(302, undefined, {
        Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
      })
      .get('/p/talked-guy-spent-30m-beeple')
      .reply(200, '')
    const html = load('./test/data/beehiiv-newsletter.html')
    const url = await new BeehiivHandler().findNewsletterUrl(html)
    expect(url).to.startWith(
      'https://www.milkroad.com/p/talked-guy-spent-30m-beeple'
    )
  })

  it('returns undefined if it is not a newsletter', async () => {
    const html = load('./test/data/substack-forwarded-welcome-email.html')
    const url = await new SubstackHandler().findNewsletterUrl(html)
    expect(url).to.be.undefined
  })
})
// Two consecutive calls to generateUniqueUrl must not produce equal values.
describe('generateUniqueUrl', () => {
  it('generates a unique URL', () => {
    const first = generateUniqueUrl()
    const second = generateUniqueUrl()

    expect(first).to.not.eql(second)
  })
})
})

View File

@ -0,0 +1,25 @@
import { expect } from 'chai'
import 'mocha'
import { getYoutubeVideoId } from '../src/websites/youtube-handler'
// Verifies that getYoutubeVideoId extracts the id from the supported URL
// shapes: `watch?v=` links (with extra query parameters) and youtu.be short
// links (with and without a query string).
describe('getYoutubeVideoId', () => {
  it('should parse video id out of a URL', () => {
    const cases: Array<[string, string]> = [
      ['https://www.youtube.com/watch?v=BnSUk0je6oo&t=269s', 'BnSUk0je6oo'],
      [
        'https://www.youtube.com/watch?v=vFD2gu007dc&list=RDvFD2gu007dc&start_radio=1',
        'vFD2gu007dc',
      ],
      ['https://youtu.be/vFD2gu007dc', 'vFD2gu007dc'],
      ['https://youtube.com/watch?v=BMFVCnbRaV4&feature=share', 'BMFVCnbRaV4'],
      ['https://youtu.be/cg9b4RC87LI?t=116', 'cg9b4RC87LI'],
    ]

    for (const [url, expectedId] of cases) {
      // chai reads expect(actual).to.eq(expected); keeping that order makes
      // failure messages report the parsed value as the thing under test
      expect(getYoutubeVideoId(url)).to.eq(expectedId)
    }
  })
})

View File

@ -0,0 +1,10 @@
{
"extends": "@tsconfig/node14/tsconfig.json",
"compilerOptions": {
"rootDir": ".",
"declaration": true,
"outDir": "build",
"lib": ["dom"]
},
"include": ["src"]
}

View File

@ -6,6 +6,7 @@
"dependencies": { "dependencies": {
"@google-cloud/logging-winston": "^5.1.1", "@google-cloud/logging-winston": "^5.1.1",
"@google-cloud/storage": "^5.18.1", "@google-cloud/storage": "^5.18.1",
"@omnivore/content-handler": "file:./../content-handler",
"@sentry/serverless": "^6.13.3", "@sentry/serverless": "^6.13.3",
"axios": "^0.27.2", "axios": "^0.27.2",
"chrome-aws-lambda": "^10.1.0", "chrome-aws-lambda": "^10.1.0",

View File

@ -4312,6 +4312,17 @@
dependencies: dependencies:
"@octokit/openapi-types" "^9.5.0" "@octokit/openapi-types" "^9.5.0"
"@omnivore/content-handler@file:./packages/content-handler":
version "1.0.0"
dependencies:
addressparser "^1.0.1"
axios "^0.27.2"
linkedom "^0.14.16"
luxon "^3.0.4"
rfc2047 "^4.0.1"
underscore "^1.13.6"
uuid "^9.0.0"
"@opentelemetry/api-metrics@0.27.0": "@opentelemetry/api-metrics@0.27.0":
version "0.27.0" version "0.27.0"
resolved "https://registry.yarnpkg.com/@opentelemetry/api-metrics/-/api-metrics-0.27.0.tgz#d8eca344ed1155f3ea8a8133ade827b4bb90efbf" resolved "https://registry.yarnpkg.com/@opentelemetry/api-metrics/-/api-metrics-0.27.0.tgz#d8eca344ed1155f3ea8a8133ade827b4bb90efbf"