Add other newsletter handlers

This commit is contained in:
Hongbo Wu
2022-09-30 12:42:41 +08:00
parent 9b209314a6
commit b00a516737
23 changed files with 488 additions and 227 deletions

View File

@ -15,7 +15,12 @@
"build": "tsc"
},
"devDependencies": {
"eslint-plugin-prettier": "^4.0.0"
"chai": "^4.3.6",
"chai-as-promised": "^7.1.1",
"chai-string": "^1.5.0",
"eslint-plugin-prettier": "^4.0.0",
"mocha": "^10.0.0",
"nock": "^13.2.9"
},
"dependencies": {
"addressparser": "^1.0.1",

View File

@ -1,6 +1,8 @@
import addressparser from 'addressparser'
import rfc2047 from 'rfc2047'
import { v4 as uuidv4 } from 'uuid'
import { v4 as uuid } from 'uuid'
import { parseHTML } from 'linkedom'
import axios from 'axios'
interface Unsubscribe {
mailTo?: string
@ -34,16 +36,17 @@ export interface PreHandleResult {
dom?: Document
}
/** Prefix shared by every placeholder URL produced by generateUniqueUrl. */
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

/**
 * Builds a placeholder URL by appending a freshly generated v4 UUID to
 * FAKE_URL_PREFIX. Used as the fallback when no newsletter URL can be parsed.
 */
export const generateUniqueUrl = (): string => `${FAKE_URL_PREFIX}${uuid()}`
export abstract class ContentHandler {
protected senderRegex: RegExp
protected urlRegex: RegExp
protected defaultUrl: string
public name: string
name: string
protected constructor() {
this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
this.defaultUrl = 'NEWSLETTER_DEFAULT_URL'
this.name = 'Handler name'
}
@ -63,17 +66,57 @@ export abstract class ContentHandler {
return Promise.resolve({ url, dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
return false
/**
 * Default newsletter detection: the sender must match senderRegex and at
 * least one of the post / unsubscribe headers must be non-empty.
 * The html field is accepted but not inspected by this implementation.
 */
async isNewsletter(input: {
  postHeader: string
  from: string
  unSubHeader: string
  html?: string
}): Promise<boolean> {
  // Test against a fresh copy of the pattern rather than the shared instance.
  const senderPattern = new RegExp(this.senderRegex)
  if (!senderPattern.test(input.from)) {
    return false
  }

  return Boolean(input.postHeader) || Boolean(input.unSubHeader)
}
parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
// Extension point for handlers that locate the newsletter's web link inside
// the email's DOM. The base implementation finds nothing and returns
// undefined, which findNewsletterUrl treats as "no URL available".
findNewsletterHeaderHref(dom: Document): string | undefined {
return undefined
}
/**
 * Given an HTML blob, tries to find a URL to use as the canonical URL.
 *
 * The candidate link comes from findNewsletterHeaderHref, which each handler
 * may override. Such links are usually tracking redirects, so a HEAD request
 * is made to obtain the final, redirected URL.
 *
 * @param html - raw HTML of the newsletter email
 * @returns the redirected URL when it can be read; the original href when the
 *   request fails or no redirected URL is reported; undefined when the
 *   handler finds no header link at all
 */
async findNewsletterUrl(html: string): Promise<string | undefined> {
  const dom = parseHTML(html).document

  // Ask the concrete handler for its "view online" style link
  const href = this.findNewsletterHeaderHref(dom)
  if (!href) {
    return undefined
  }

  // Try to make a HEAD request, so we get the redirected URL, since these
  // will usually be behind tracking url redirects
  try {
    const response = await axios.head(href, { timeout: 5000 })
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const redirectedUrl = response.request.res.responseUrl as
      | string
      | undefined
    // Keep the link we already have rather than dropping it when the
    // response does not report a final URL (same fallback as the catch path).
    return redirectedUrl || href
  } catch (e) {
    console.log('error making HEAD request', e)
    return href
  }
}
async parseNewsletterUrl(
_postHeader: string,
html: string
): Promise<string | undefined> {
// get newsletter url from html
const matches = html.match(this.urlRegex)
if (matches) {
return matches[1]
return Promise.resolve(matches[1])
}
return undefined
return Promise.resolve(undefined)
}
parseAuthor(from: string): string {
@ -97,14 +140,14 @@ export abstract class ContentHandler {
}
}
handleNewsletter({
async handleNewsletter({
email,
html,
postHeader,
title,
from,
unSubHeader,
}: NewsletterInput): NewsletterResult {
}: NewsletterInput): Promise<NewsletterResult> {
console.log('handleNewsletter', email, postHeader, title, from)
if (!email || !html || !title || !from) {
@ -115,8 +158,7 @@ export abstract class ContentHandler {
// fallback to default url if newsletter url does not exist
// assign a random uuid to the default url to avoid duplicate url
const url =
this.parseNewsletterUrl(postHeader, html) ||
`${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
(await this.parseNewsletterUrl(postHeader, html)) || generateUniqueUrl()
const author = this.parseAuthor(from)
const unsubscribe = this.parseUnsubscribe(unSubHeader)

View File

@ -1,13 +1,14 @@
import { AppleNewsHandler } from './content/apple-news-handler'
import { BloombergHandler } from './content/bloomberg-handler'
import { DerstandardHandler } from './content/derstandard-handler'
import { ImageHandler } from './content/image-handler'
import { MediumHandler } from './content/medium-handler'
import { PdfHandler } from './content/pdf-handler'
import { ScrapingBeeHandler } from './content/scrapingBee-handler'
import { TDotCoHandler } from './content/t-dot-co-handler'
import { TwitterHandler } from './content/twitter-handler'
import { YoutubeHandler } from './content/youtube-handler'
import { AppleNewsHandler } from './websites/apple-news-handler'
import { BloombergHandler } from './websites/bloomberg-handler'
import { DerstandardHandler } from './websites/derstandard-handler'
import { ImageHandler } from './websites/image-handler'
import { MediumHandler } from './websites/medium-handler'
import { PdfHandler } from './websites/pdf-handler'
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
import { TDotCoHandler } from './websites/t-dot-co-handler'
import { TwitterHandler } from './websites/twitter-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import {
ContentHandler,
NewsletterInput,
@ -19,7 +20,9 @@ import { AxiosHandler } from './newsletters/axios-handler'
import { GolangHandler } from './newsletters/golang-handler'
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
import { WikipediaHandler } from './content/wikipedia-handler'
import { BeehiivHandler } from './newsletters/beehiiv-handler'
import { ConvertkitHandler } from './newsletters/convertkit-handler'
import { RevueHandler } from './newsletters/revue-handler'
const validateUrlString = (url: string) => {
const u = new URL(url)
@ -57,6 +60,10 @@ const newsletterHandlers: ContentHandler[] = [
new GolangHandler(),
new SubstackHandler(),
new MorningBrewHandler(),
new SubstackHandler(),
new BeehiivHandler(),
new ConvertkitHandler(),
new RevueHandler(),
]
export const preHandleContent = async (
@ -91,11 +98,11 @@ export const preHandleContent = async (
return undefined
}
export const handleNewsletter = (
export const handleNewsletter = async (
input: NewsletterInput
): NewsletterResult | undefined => {
): Promise<NewsletterResult | undefined> => {
for (const handler of newsletterHandlers) {
if (handler.isNewsletter(input.postHeader, input.from, input.unSubHeader)) {
if (await handler.isNewsletter(input)) {
return handler.handleNewsletter(input)
}
}

View File

@ -5,7 +5,6 @@ export class AxiosHandler extends ContentHandler {
super()
this.senderRegex = /<.+@axios.com>/
this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
this.defaultUrl = 'https://axios.com'
this.name = 'axios'
}
@ -44,10 +43,4 @@ export class AxiosHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -0,0 +1,43 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/**
 * Newsletter handler named 'beehiiv'. An email is recognised when it contains
 * an image served from beehiiv.net together with a "Read Online" link.
 */
export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  /**
   * Looks for the "Read Online" anchor in the email's table layout.
   * When several anchors match, the last one decides the result.
   *
   * @returns the anchor's href, or undefined when it is absent or empty
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = Array.from(
      dom.querySelectorAll('table tr td div a[class*="link"]')
    )
    let href: string | undefined = undefined
    for (const anchor of candidates) {
      if (anchor.textContent === 'Read Online') {
        href = anchor.getAttribute('href') || undefined
      }
    }
    return href
  }

  /**
   * Resolves to true only when the HTML has a beehiiv.net image and a
   * "Read Online" link; only the html field of the input is inspected.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const dom = parseHTML(input.html).document
    const hasBeehiivImage =
      dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0
    if (!hasBeehiivImage) {
      return false
    }

    return Boolean(this.findNewsletterHeaderHref(dom))
  }

  /**
   * Derives the newsletter URL from the HTML alone; postHeader is not used.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -5,7 +5,6 @@ export class BloombergNewsletterHandler extends ContentHandler {
super()
this.senderRegex = /<.+@mail.bloomberg.*.com>/
this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
this.defaultUrl = 'https://www.bloomberg.com'
this.name = 'bloomberg'
}
@ -35,10 +34,4 @@ export class BloombergNewsletterHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -0,0 +1,41 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/**
 * Newsletter handler named 'convertkit'. An email is recognised when it
 * contains an image served from convertkit.com or convertkit-mail.com.
 */
export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  /**
   * Looks for the "View this email in your browser" anchor inside a table
   * cell. When several anchors match, the last one decides the result.
   *
   * @returns the anchor's href, or undefined when it is absent or empty
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = Array.from(dom.querySelectorAll('table tr td a'))
    let href: string | undefined = undefined
    for (const anchor of candidates) {
      if (anchor.textContent === 'View this email in your browser') {
        href = anchor.getAttribute('href') || undefined
      }
    }
    return href
  }

  /**
   * Resolves to true when the HTML contains at least one ConvertKit-hosted
   * image; only the html field of the input is inspected.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const dom = parseHTML(input.html).document
    const convertkitImages = dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
    )
    return convertkitImages.length > 0
  }

  /**
   * Derives the newsletter URL from the HTML alone; postHeader is not used.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -5,7 +5,6 @@ export class GolangHandler extends ContentHandler {
super()
this.senderRegex = /<.+@golangweekly.com>/
this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
this.defaultUrl = 'https://golangweekly.com'
this.name = 'golangweekly'
}
@ -25,10 +24,4 @@ export class GolangHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -5,7 +5,6 @@ export class MorningBrewHandler extends ContentHandler {
super()
this.senderRegex = /Morning Brew <crew@morningbrew.com>/
this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
this.defaultUrl = 'https://www.morningbrew.com'
this.name = 'morningbrew'
}
@ -33,10 +32,4 @@ export class MorningBrewHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -0,0 +1,46 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/**
 * Newsletter handler named 'revue'. An email is recognised when it contains
 * an image served from getrevue.co or revue.email together with a
 * "View online" link.
 */
export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  /**
   * Looks for the "View online" anchor (target="_blank") inside a table cell.
   * When several anchors match, the last one decides the result.
   *
   * @returns the anchor's href, or undefined when it is absent or empty
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const candidates = Array.from(
      dom.querySelectorAll('table tr td a[target="_blank"]')
    )
    let href: string | undefined = undefined
    for (const anchor of candidates) {
      if (anchor.textContent === 'View online') {
        href = anchor.getAttribute('href') || undefined
      }
    }
    return href
  }

  /**
   * Resolves to true only when the HTML has a Revue-hosted image and a
   * "View online" link; only the html field of the input is inspected.
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html: string
  }): Promise<boolean> {
    const dom = parseHTML(input.html).document
    const revueImages = dom.querySelectorAll(
      'img[src*="getrevue.co"], img[src*="revue.email"]'
    )
    if (revueImages.length === 0) {
      return false
    }

    return Boolean(this.findNewsletterHeaderHref(dom))
  }

  /**
   * Derives the newsletter URL from the HTML alone; postHeader is not used.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -1,10 +1,10 @@
import addressparser from 'addressparser'
import { ContentHandler, PreHandleResult } from '../content-handler'
import { parseHTML } from 'linkedom'
export class SubstackHandler extends ContentHandler {
constructor() {
super()
this.defaultUrl = 'https://www.substack.com'
this.name = 'substack'
}
@ -38,15 +38,53 @@ export class SubstackHandler extends ContentHandler {
return Promise.resolve(dom)
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
return !!postHeader
/**
 * Returns the href of the first anchor nested in an h1 (the Substack post
 * title link), or undefined when there is no such anchor or its href is empty.
 */
findNewsletterHeaderHref(dom: Document): string | undefined {
  // Substack header links
  const titleAnchor = dom.querySelector('h1 a ')
  if (!titleAnchor) {
    return undefined
  }
  return titleAnchor.getAttribute('href') || undefined
}
parseNewsletterUrl(postHeader: string, html: string): string | undefined {
/**
 * Decides whether an email is a Substack newsletter. Checks, in order:
 *  1. a non-empty postHeader;
 *  2. a table whose class ends in "post-meta";
 *  3. a header link plus a Substack heart or recommend icon.
 * Only postHeader and html are read from the input.
 */
async isNewsletter({
  postHeader,
  html,
}: {
  postHeader: string
  from: string
  unSubHeader: string
  html: string
}): Promise<boolean> {
  if (postHeader) {
    return true
  }

  const dom = parseHTML(html).document

  // substack newsletter emails have tables with a *post-meta class
  const postMetaTable = dom.querySelector('table[class$="post-meta"]')
  if (postMetaTable) {
    return true
  }

  // If the article has a header link, and substack icons it's probably a newsletter
  const headerHref = this.findNewsletterHeaderHref(dom)
  const substackIcons = [
    'table tbody td span a img[src*="HeartIcon"]',
    'table tbody td span a img[src*="RecommendIconRounded"]',
  ].map((selector) => dom.querySelector(selector))
  const hasSubstackIcon = substackIcons.some((icon) => Boolean(icon))

  return Boolean(headerHref) && hasSubstackIcon
}
async parseNewsletterUrl(
postHeader: string,
html: string
): Promise<string | undefined> {
// raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
// we need to get the real url from the raw url
return addressparser(postHeader).length > 0
? addressparser(postHeader)[0].name
: undefined
if (postHeader && addressparser(postHeader).length > 0) {
return Promise.resolve(addressparser(postHeader)[0].name)
}
return this.findNewsletterUrl(html)
}
}

View File

@ -1,4 +1,4 @@
import { AppleNewsHandler } from '../src/content/apple-news-handler'
import { AppleNewsHandler } from '../src/websites/apple-news-handler'
describe('open a simple web page', () => {
it('should return a response', async () => {

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -1,28 +1,45 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import chaiAsPromised from 'chai-as-promised'
import chaiString from 'chai-string'
import { SubstackHandler } from '../src/newsletters/substack-handler'
import { AxiosHandler } from '../src/newsletters/axios-handler'
import { BloombergNewsletterHandler } from '../src/newsletters/bloomberg-newsletter-handler'
import { GolangHandler } from '../src/newsletters/golang-handler'
import { MorningBrewHandler } from '../src/newsletters/morning-brew-handler'
import nock from 'nock'
import { generateUniqueUrl } from '../src/content-handler'
import fs from 'fs'
import { BeehiivHandler } from '../src/newsletters/beehiiv-handler'
chai.use(chaiAsPromised)
chai.use(chaiString)
/** Reads the file at `path` synchronously and returns its contents decoded as UTF-8. */
const load = (path: string): string => fs.readFileSync(path, 'utf8')
describe('Newsletter email test', () => {
describe('#getNewsletterUrl()', () => {
it('returns url when email is from SubStack', () => {
it('returns url when email is from SubStack', async () => {
const rawUrl = '<https://hongbo130.substack.com/p/tldr>'
expect(new SubstackHandler().parseNewsletterUrl(rawUrl, '')).to.equal(
'https://hongbo130.substack.com/p/tldr'
)
await expect(
new SubstackHandler().parseNewsletterUrl(rawUrl, '')
).to.eventually.equal('https://hongbo130.substack.com/p/tldr')
})
it('returns url when email is from Axios', () => {
it('returns url when email is from Axios', async () => {
const url = 'https://axios.com/blog/the-best-way-to-build-a-web-app'
const html = `View in browser at <a>${url}</a>`
expect(new AxiosHandler().parseNewsletterUrl('', html)).to.equal(url)
await expect(
new AxiosHandler().parseNewsletterUrl('', html)
).to.eventually.equal(url)
})
it('returns url when email is from Bloomberg', () => {
it('returns url when email is from Bloomberg', async () => {
const url = 'https://www.bloomberg.com/news/google-is-now-a-partner'
const html = `
<a class="view-in-browser__url" href="${url}">
@ -30,29 +47,31 @@ describe('Newsletter email test', () => {
</a>
`
expect(
await expect(
new BloombergNewsletterHandler().parseNewsletterUrl('', html)
).to.equal(url)
).to.eventually.equal(url)
})
it('returns url when email is from Golang Weekly', () => {
it('returns url when email is from Golang Weekly', async () => {
const url = 'https://www.golangweekly.com/first'
const html = `
<a href="${url}" style="text-decoration: none">Read on the Web</a>
`
expect(new GolangHandler().parseNewsletterUrl('', html)).to.equal(url)
await expect(
new GolangHandler().parseNewsletterUrl('', html)
).to.eventually.equal(url)
})
it('returns url when email is from Morning Brew', () => {
it('returns url when email is from Morning Brew', async () => {
const url = 'https://www.morningbrew.com/daily/issues/first'
const html = `
<a style="color: #000000; text-decoration: none;" target="_blank" rel="noopener" href="${url}">View Online</a>
`
expect(new MorningBrewHandler().parseNewsletterUrl('', html)).to.equal(
url
)
await expect(
new MorningBrewHandler().parseNewsletterUrl('', html)
).to.eventually.equal(url)
})
})
@ -69,4 +88,104 @@ describe('Newsletter email test', () => {
expect(new AxiosHandler().parseAuthor(from)).to.equal('Mike Allen')
})
})
describe('isProbablyNewsletter', () => {
it('returns true for substack newsletter', async () => {
const html = load('./test/data/substack-forwarded-newsletter.html')
await expect(
new SubstackHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.true
})
it('returns true for private forwarded substack newsletter', async () => {
const html = load(
'./test/data/substack-private-forwarded-newsletter.html'
)
await expect(
new SubstackHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.true
})
it('returns false for substack welcome email', async () => {
const html = load('./test/data/substack-forwarded-welcome-email.html')
await expect(
new SubstackHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.false
})
it('returns true for beehiiv.com newsletter', async () => {
const html = load('./test/data/beehiiv-newsletter.html')
await expect(
new BeehiivHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.true
})
})
describe('findNewsletterUrl', async () => {
it('gets the URL from the header if it is a substack newsletter', async () => {
nock('https://email.mg2.substack.com')
.head(
'/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
)
.reply(302, undefined, {
Location:
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
})
.get('/p/companies-that-eat-people-217')
.reply(200, '')
const html = load('./test/data/substack-forwarded-newsletter.html')
const url = await new SubstackHandler().findNewsletterUrl(html)
// Not sure if the redirects from substack expire, this test could eventually fail
expect(url).to.startWith(
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217'
)
})
it('gets the URL from the header if it is a beehiiv newsletter', async () => {
nock('https://u23463625.ct.sendgrid.net')
.head(
'/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
)
.reply(302, undefined, {
Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
})
.get('/p/talked-guy-spent-30m-beeple')
.reply(200, '')
const html = load('./test/data/beehiiv-newsletter.html')
const url = await new BeehiivHandler().findNewsletterUrl(html)
expect(url).to.startWith(
'https://www.milkroad.com/p/talked-guy-spent-30m-beeple'
)
})
it('returns undefined if it is not a newsletter', async () => {
const html = load('./test/data/substack-forwarded-welcome-email.html')
const url = await new SubstackHandler().findNewsletterUrl(html)
expect(url).to.be.undefined
})
})
describe('generateUniqueUrl', () => {
it('generates a unique URL', () => {
const url1 = generateUniqueUrl()
const url2 = generateUniqueUrl()
expect(url1).to.not.eql(url2)
})
})
})

View File

@ -1,6 +1,6 @@
import { expect } from 'chai'
import 'mocha'
import { getYoutubeVideoId } from '../src/content/youtube-handler'
import { getYoutubeVideoId } from '../src/websites/youtube-handler'
describe('getYoutubeVideoId', () => {
it('should parse video id out of a URL', async () => {