Add other newsletter handlers

This commit is contained in:
Hongbo Wu
2022-09-30 12:42:41 +08:00
parent 9b209314a6
commit b00a516737
23 changed files with 488 additions and 227 deletions

View File

@ -8,14 +8,11 @@ import { analytics } from '../../utils/analytics'
import { getNewsletterEmail } from '../../services/newsletters'
import { env } from '../../env'
import {
findNewsletterUrl,
generateUniqueUrl,
getTitleFromEmailSubject,
isProbablyArticle,
isProbablyNewsletter,
parseEmailAddress,
} from '../../utils/parser'
import { saveNewsletterEmail } from '../../services/save_newsletter_email'
import { saveEmail } from '../../services/save_email'
import { buildLogger } from '../../utils/logger'
@ -80,25 +77,6 @@ export function emailsServiceRouter() {
const ctx = { pubsub: createPubSubClient(), uid: user.id }
const parsedFrom = parseEmailAddress(data.from)
if (await isProbablyNewsletter(data.html)) {
logger.info('handling as newsletter', data)
await saveNewsletterEmail(
{
email: data.to,
title: data.subject,
content: data.html,
author: parsedFrom.name,
url: (await findNewsletterUrl(data.html)) || generateUniqueUrl(),
unsubMailTo: data.unsubMailTo,
unsubHttpUrl: data.unsubHttpUrl,
newsletterEmail,
},
ctx
)
res.status(200).send('Newsletter')
return
}
if (
await isProbablyArticle(
data.forwardedFrom || parsedFrom.address,

View File

@ -52,35 +52,8 @@ describe('Emails Router', () => {
sinon.restore()
})
context('when email is a newsletter', () => {
before(() => {
sinon.replace(parser, 'isProbablyNewsletter', sinon.fake.resolves(true))
})
it('saves the email as a newsletter', async () => {
const data = {
message: {
data: Buffer.from(
JSON.stringify({ from, to, subject, html })
).toString('base64'),
publishTime: new Date().toISOString(),
},
}
const res = await request
.post(`/svc/pubsub/emails/forward?token=${token}`)
.send(data)
.expect(200)
expect(res.text).to.eql('Newsletter')
})
})
context('when email is an article', () => {
before(() => {
sinon.replace(
parser,
'isProbablyNewsletter',
sinon.fake.resolves(false)
)
sinon.replace(parser, 'isProbablyArticle', sinon.fake.resolves(true))
})
@ -103,11 +76,6 @@ describe('Emails Router', () => {
context('when email is a regular email', () => {
before(() => {
sinon.replace(
parser,
'isProbablyNewsletter',
sinon.fake.resolves(false)
)
sinon.replace(parser, 'isProbablyArticle', sinon.fake.resolves(false))
})

View File

@ -4,11 +4,8 @@ import { expect } from 'chai'
import 'chai/register-should'
import fs from 'fs'
import {
findNewsletterUrl,
generateUniqueUrl,
getTitleFromEmailSubject,
isProbablyArticle,
isProbablyNewsletter,
parseEmailAddress,
parsePageMetadata,
parsePreparedContent,
@ -24,69 +21,6 @@ const load = (path: string): string => {
return fs.readFileSync(path, 'utf8')
}
describe('isProbablyNewsletter', () => {
it('returns true for substack newsletter', async () => {
const html = load('./test/utils/data/substack-forwarded-newsletter.html')
await expect(isProbablyNewsletter(html)).to.eventually.be.true
})
it('returns true for private forwarded substack newsletter', async () => {
const html = load(
'./test/utils/data/substack-private-forwarded-newsletter.html'
)
await expect(isProbablyNewsletter(html)).to.eventually.be.true
})
it('returns false for substack welcome email', async () => {
const html = load('./test/utils/data/substack-forwarded-welcome-email.html')
await expect(isProbablyNewsletter(html)).to.eventually.be.false
})
it('returns true for beehiiv.com newsletter', async () => {
const html = load('./test/utils/data/beehiiv-newsletter.html')
await expect(isProbablyNewsletter(html)).to.eventually.be.true
})
})
describe('findNewsletterUrl', async () => {
it('gets the URL from the header if it is a substack newsletter', async () => {
nock('https://email.mg2.substack.com')
.head(
'/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
)
.reply(302, undefined, {
Location:
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
})
.get('/p/companies-that-eat-people-217')
.reply(200, '')
const html = load('./test/utils/data/substack-forwarded-newsletter.html')
const url = await findNewsletterUrl(html)
// Not sure if the redirects from substack expire, this test could eventually fail
expect(url).to.startWith(
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217'
)
})
it('gets the URL from the header if it is a beehiiv newsletter', async () => {
nock('https://u23463625.ct.sendgrid.net')
.head(
'/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
)
.reply(302, undefined, {
Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
})
.get('/p/talked-guy-spent-30m-beeple')
.reply(200, '')
const html = load('./test/utils/data/beehiiv-newsletter.html')
const url = await findNewsletterUrl(html)
expect(url).to.startWith(
'https://www.milkroad.com/p/talked-guy-spent-30m-beeple'
)
})
it('returns undefined if it is not a newsletter', async () => {
const html = load('./test/utils/data/substack-forwarded-welcome-email.html')
const url = await findNewsletterUrl(html)
expect(url).to.be.undefined
})
})
describe('parseMetadata', async () => {
it('gets author, title, image, description', async () => {
const html = load('./test/utils/data/substack-post.html')
@ -164,15 +98,6 @@ describe('isProbablyArticle', () => {
})
})
describe('generateUniqueUrl', () => {
it('generates a unique URL', () => {
const url1 = generateUniqueUrl()
const url2 = generateUniqueUrl()
expect(url1).to.not.eql(url2)
})
})
describe('getTitleFromEmailSubject', () => {
it('returns the title from the email subject', () => {
const title = 'test subject'

View File

@ -15,7 +15,12 @@
"build": "tsc"
},
"devDependencies": {
"eslint-plugin-prettier": "^4.0.0"
"chai": "^4.3.6",
"chai-as-promised": "^7.1.1",
"chai-string": "^1.5.0",
"eslint-plugin-prettier": "^4.0.0",
"mocha": "^10.0.0",
"nock": "^13.2.9"
},
"dependencies": {
"addressparser": "^1.0.1",

View File

@ -1,6 +1,8 @@
import addressparser from 'addressparser'
import rfc2047 from 'rfc2047'
import { v4 as uuidv4 } from 'uuid'
import { v4 as uuid } from 'uuid'
import { parseHTML } from 'linkedom'
import axios from 'axios'
interface Unsubscribe {
mailTo?: string
@ -34,16 +36,17 @@ export interface PreHandleResult {
dom?: Document
}
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()
export abstract class ContentHandler {
protected senderRegex: RegExp
protected urlRegex: RegExp
protected defaultUrl: string
public name: string
name: string
protected constructor() {
this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
this.defaultUrl = 'NEWSLETTER_DEFAULT_URL'
this.name = 'Handler name'
}
@ -63,17 +66,57 @@ export abstract class ContentHandler {
return Promise.resolve({ url, dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
return false
async isNewsletter(input: {
postHeader: string
from: string
unSubHeader: string
html?: string
}): Promise<boolean> {
const re = new RegExp(this.senderRegex)
return Promise.resolve(
re.test(input.from) && (!!input.postHeader || !!input.unSubHeader)
)
}
parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
findNewsletterHeaderHref(dom: Document): string | undefined {
return undefined
}
/**
 * Given an HTML blob, tries to find a URL to use as the canonical URL.
 *
 * The handler-specific header link is looked up through
 * `findNewsletterHeaderHref`. When one is found, a HEAD request is made so the
 * redirected URL can be used, since these links will usually be behind
 * tracking url redirects.
 *
 * @param html - raw HTML body of the email
 * @returns the redirected URL when it is available, the header link itself
 *   when the HEAD request fails or reports no redirected URL, or `undefined`
 *   when the email has no header link
 */
async findNewsletterUrl(html: string): Promise<string | undefined> {
  const dom = parseHTML(html).document
  // Look for the "read online" style link of the concrete handler
  const href = this.findNewsletterHeaderHref(dom)
  if (!href) {
    return undefined
  }

  try {
    const response = await axios.head(href, { timeout: 5000 })
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const redirected = response.request?.res?.responseUrl as string | undefined
    // A successful request without a redirected URL must not discard the
    // link we already have; fall back to it just like the error path does.
    return redirected || href
  } catch (e) {
    console.log('error making HEAD request', e)
    return href
  }
}
async parseNewsletterUrl(
_postHeader: string,
html: string
): Promise<string | undefined> {
// get newsletter url from html
const matches = html.match(this.urlRegex)
if (matches) {
return matches[1]
return Promise.resolve(matches[1])
}
return undefined
return Promise.resolve(undefined)
}
parseAuthor(from: string): string {
@ -97,14 +140,14 @@ export abstract class ContentHandler {
}
}
handleNewsletter({
async handleNewsletter({
email,
html,
postHeader,
title,
from,
unSubHeader,
}: NewsletterInput): NewsletterResult {
}: NewsletterInput): Promise<NewsletterResult> {
console.log('handleNewsletter', email, postHeader, title, from)
if (!email || !html || !title || !from) {
@ -115,8 +158,7 @@ export abstract class ContentHandler {
// fallback to default url if newsletter url does not exist
// assign a random uuid to the default url to avoid duplicate url
const url =
this.parseNewsletterUrl(postHeader, html) ||
`${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
(await this.parseNewsletterUrl(postHeader, html)) || generateUniqueUrl()
const author = this.parseAuthor(from)
const unsubscribe = this.parseUnsubscribe(unSubHeader)

View File

@ -1,13 +1,14 @@
import { AppleNewsHandler } from './content/apple-news-handler'
import { BloombergHandler } from './content/bloomberg-handler'
import { DerstandardHandler } from './content/derstandard-handler'
import { ImageHandler } from './content/image-handler'
import { MediumHandler } from './content/medium-handler'
import { PdfHandler } from './content/pdf-handler'
import { ScrapingBeeHandler } from './content/scrapingBee-handler'
import { TDotCoHandler } from './content/t-dot-co-handler'
import { TwitterHandler } from './content/twitter-handler'
import { YoutubeHandler } from './content/youtube-handler'
import { AppleNewsHandler } from './websites/apple-news-handler'
import { BloombergHandler } from './websites/bloomberg-handler'
import { DerstandardHandler } from './websites/derstandard-handler'
import { ImageHandler } from './websites/image-handler'
import { MediumHandler } from './websites/medium-handler'
import { PdfHandler } from './websites/pdf-handler'
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
import { TDotCoHandler } from './websites/t-dot-co-handler'
import { TwitterHandler } from './websites/twitter-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import {
ContentHandler,
NewsletterInput,
@ -19,7 +20,9 @@ import { AxiosHandler } from './newsletters/axios-handler'
import { GolangHandler } from './newsletters/golang-handler'
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
import { WikipediaHandler } from './content/wikipedia-handler'
import { BeehiivHandler } from './newsletters/beehiiv-handler'
import { ConvertkitHandler } from './newsletters/convertkit-handler'
import { RevueHandler } from './newsletters/revue-handler'
const validateUrlString = (url: string) => {
const u = new URL(url)
@ -57,6 +60,10 @@ const newsletterHandlers: ContentHandler[] = [
new GolangHandler(),
new SubstackHandler(),
new MorningBrewHandler(),
new SubstackHandler(),
new BeehiivHandler(),
new ConvertkitHandler(),
new RevueHandler(),
]
export const preHandleContent = async (
@ -91,11 +98,11 @@ export const preHandleContent = async (
return undefined
}
export const handleNewsletter = (
export const handleNewsletter = async (
input: NewsletterInput
): NewsletterResult | undefined => {
): Promise<NewsletterResult | undefined> => {
for (const handler of newsletterHandlers) {
if (handler.isNewsletter(input.postHeader, input.from, input.unSubHeader)) {
if (await handler.isNewsletter(input)) {
return handler.handleNewsletter(input)
}
}

View File

@ -5,7 +5,6 @@ export class AxiosHandler extends ContentHandler {
super()
this.senderRegex = /<.+@axios.com>/
this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
this.defaultUrl = 'https://axios.com'
this.name = 'axios'
}
@ -44,10 +43,4 @@ export class AxiosHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -0,0 +1,43 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/**
 * Newsletter handler for beehiiv emails.
 *
 * Detection is based on the email content only: an image whose `src` contains
 * "beehiiv.net" plus a "Read Online" link in the header.
 */
export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  /**
   * Finds the href of the "Read Online" link.
   *
   * @param dom - parsed email document
   * @returns the href of the last matching anchor that has one, or
   *   `undefined` when there is no such anchor
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('table tr td div a[class*="link"]')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'Read Online') {
        // An anchor without an href must not wipe out a link found earlier
        res = e.getAttribute('href') || res
      }
    })
    return res
  }

  /**
   * Tells whether the email looks like a beehiiv newsletter.
   *
   * @param input - email fields; only `html` is inspected. It is optional to
   *   match the base class signature, and a missing body is never a newsletter.
   * @returns true when the body has a beehiiv.net image and a "Read Online" link
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    if (!input.html) {
      return Promise.resolve(false)
    }
    const dom = parseHTML(input.html).document
    const hasBeehiivImage =
      dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0
    return Promise.resolve(
      hasBeehiivImage && !!this.findNewsletterHeaderHref(dom)
    )
  }

  /**
   * Resolves the newsletter URL from the email body; the post header is not
   * used by this handler.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -5,7 +5,6 @@ export class BloombergNewsletterHandler extends ContentHandler {
super()
this.senderRegex = /<.+@mail.bloomberg.*.com>/
this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
this.defaultUrl = 'https://www.bloomberg.com'
this.name = 'bloomberg'
}
@ -35,10 +34,4 @@ export class BloombergNewsletterHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -0,0 +1,41 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/**
 * Newsletter handler for ConvertKit emails.
 *
 * Detection is based on the email content only: an image whose `src` contains
 * "convertkit.com" or "convertkit-mail.com".
 */
export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  /**
   * Finds the href of the "View this email in your browser" link.
   *
   * @param dom - parsed email document
   * @returns the href of the last matching anchor that has one, or
   *   `undefined` when there is no such anchor
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('table tr td a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      if (e.textContent === 'View this email in your browser') {
        // An anchor without an href must not wipe out a link found earlier
        res = e.getAttribute('href') || res
      }
    })
    return res
  }

  /**
   * Tells whether the email looks like a ConvertKit newsletter.
   *
   * @param input - email fields; only `html` is inspected. It is optional to
   *   match the base class signature, and a missing body is never a newsletter.
   * @returns true when the body contains a ConvertKit image
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    if (!input.html) {
      return Promise.resolve(false)
    }
    const dom = parseHTML(input.html).document
    return Promise.resolve(
      dom.querySelectorAll(
        'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
      ).length > 0
    )
  }

  /**
   * Resolves the newsletter URL from the email body; the post header is not
   * used by this handler.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -5,7 +5,6 @@ export class GolangHandler extends ContentHandler {
super()
this.senderRegex = /<.+@golangweekly.com>/
this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
this.defaultUrl = 'https://golangweekly.com'
this.name = 'golangweekly'
}
@ -25,10 +24,4 @@ export class GolangHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -5,7 +5,6 @@ export class MorningBrewHandler extends ContentHandler {
super()
this.senderRegex = /Morning Brew <crew@morningbrew.com>/
this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
this.defaultUrl = 'https://www.morningbrew.com'
this.name = 'morningbrew'
}
@ -33,10 +32,4 @@ export class MorningBrewHandler extends ContentHandler {
return Promise.resolve({ dom })
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
// Axios newsletter is from <xx@axios.com>
const re = new RegExp(this.senderRegex)
return re.test(from) && (!!postHeader || !!unSubHeader)
}
}

View File

@ -0,0 +1,46 @@
import { ContentHandler } from '../content-handler'
import { parseHTML } from 'linkedom'
/**
 * Newsletter handler for Revue emails.
 *
 * Detection is based on the email content only: an image whose `src` contains
 * "getrevue.co" or "revue.email" plus a "View online" link in the header.
 */
export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  /**
   * Finds the href of the "View online" link.
   *
   * @param dom - parsed email document
   * @returns the href of the last matching anchor that has one, or
   *   `undefined` when there is no such anchor
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const viewOnline = dom.querySelectorAll('table tr td a[target="_blank"]')
    let res: string | undefined = undefined
    viewOnline.forEach((e) => {
      if (e.textContent === 'View online') {
        // An anchor without an href must not wipe out a link found earlier
        res = e.getAttribute('href') || res
      }
    })
    return res
  }

  /**
   * Tells whether the email looks like a Revue newsletter.
   *
   * @param input - email fields; only `html` is inspected. It is optional to
   *   match the base class signature, and a missing body is never a newsletter.
   * @returns true when the body has a Revue image and a "View online" link
   */
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    if (!input.html) {
      return Promise.resolve(false)
    }
    const dom = parseHTML(input.html).document
    const hasRevueImage =
      dom.querySelectorAll('img[src*="getrevue.co"], img[src*="revue.email"]')
        .length > 0
    return Promise.resolve(
      hasRevueImage && !!this.findNewsletterHeaderHref(dom)
    )
  }

  /**
   * Resolves the newsletter URL from the email body; the post header is not
   * used by this handler.
   */
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}

View File

@ -1,10 +1,10 @@
import addressparser from 'addressparser'
import { ContentHandler, PreHandleResult } from '../content-handler'
import { parseHTML } from 'linkedom'
export class SubstackHandler extends ContentHandler {
constructor() {
super()
this.defaultUrl = 'https://www.substack.com'
this.name = 'substack'
}
@ -38,15 +38,53 @@ export class SubstackHandler extends ContentHandler {
return Promise.resolve(dom)
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
return !!postHeader
/**
 * Returns the href of the first link found inside an `h1` element (the
 * Substack header link), or `undefined` when there is no such link or it has
 * no href.
 */
findNewsletterHeaderHref(dom: Document): string | undefined {
  const headerAnchor = dom.querySelector('h1 a ')
  if (!headerAnchor) {
    return undefined
  }
  const target = headerAnchor.getAttribute('href')
  return target ? target : undefined
}
parseNewsletterUrl(postHeader: string, html: string): string | undefined {
/**
 * Tells whether the email looks like a Substack newsletter.
 *
 * A non-empty post header is sufficient on its own. Otherwise the HTML body
 * is inspected: a table whose class ends with "post-meta", or a header link
 * together with one of the Substack heart / recommend icons.
 *
 * `html` is optional to match the base class signature; without a post header
 * and without a body the email is not treated as a newsletter.
 */
async isNewsletter({
  postHeader,
  html,
}: {
  postHeader: string
  from: string
  unSubHeader: string
  html?: string
}): Promise<boolean> {
  if (postHeader) {
    return Promise.resolve(true)
  }
  if (!html) {
    // Nothing to inspect; avoid handing an undefined body to the parser
    return Promise.resolve(false)
  }
  const dom = parseHTML(html).document
  // substack newsletter emails have tables with a *post-meta class
  if (dom.querySelector('table[class$="post-meta"]')) {
    return Promise.resolve(true)
  }
  // If the article has a header link and substack icons, it's probably a newsletter
  const href = this.findNewsletterHeaderHref(dom)
  const heartIcon = dom.querySelector(
    'table tbody td span a img[src*="HeartIcon"]'
  )
  const recommendIcon = dom.querySelector(
    'table tbody td span a img[src*="RecommendIconRounded"]'
  )
  return Promise.resolve(!!(href && (heartIcon || recommendIcon)))
}
async parseNewsletterUrl(
postHeader: string,
html: string
): Promise<string | undefined> {
// raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
// we need to get the real url from the raw url
return addressparser(postHeader).length > 0
? addressparser(postHeader)[0].name
: undefined
if (postHeader && addressparser(postHeader).length > 0) {
return Promise.resolve(addressparser(postHeader)[0].name)
}
return this.findNewsletterUrl(html)
}
}

View File

@ -1,4 +1,4 @@
import { AppleNewsHandler } from '../src/content/apple-news-handler'
import { AppleNewsHandler } from '../src/websites/apple-news-handler'
describe('open a simple web page', () => {
it('should return a response', async () => {

View File

@ -1,28 +1,45 @@
import 'mocha'
import * as chai from 'chai'
import { expect } from 'chai'
import chaiAsPromised from 'chai-as-promised'
import chaiString from 'chai-string'
import { SubstackHandler } from '../src/newsletters/substack-handler'
import { AxiosHandler } from '../src/newsletters/axios-handler'
import { BloombergNewsletterHandler } from '../src/newsletters/bloomberg-newsletter-handler'
import { GolangHandler } from '../src/newsletters/golang-handler'
import { MorningBrewHandler } from '../src/newsletters/morning-brew-handler'
import nock from 'nock'
import { generateUniqueUrl } from '../src/content-handler'
import fs from 'fs'
import { BeehiivHandler } from '../src/newsletters/beehiiv-handler'
chai.use(chaiAsPromised)
chai.use(chaiString)
const load = (path: string): string => {
return fs.readFileSync(path, 'utf8')
}
describe('Newsletter email test', () => {
describe('#getNewsletterUrl()', () => {
it('returns url when email is from SubStack', () => {
it('returns url when email is from SubStack', async () => {
const rawUrl = '<https://hongbo130.substack.com/p/tldr>'
expect(new SubstackHandler().parseNewsletterUrl(rawUrl, '')).to.equal(
'https://hongbo130.substack.com/p/tldr'
)
await expect(
new SubstackHandler().parseNewsletterUrl(rawUrl, '')
).to.eventually.equal('https://hongbo130.substack.com/p/tldr')
})
it('returns url when email is from Axios', () => {
it('returns url when email is from Axios', async () => {
const url = 'https://axios.com/blog/the-best-way-to-build-a-web-app'
const html = `View in browser at <a>${url}</a>`
expect(new AxiosHandler().parseNewsletterUrl('', html)).to.equal(url)
await expect(
new AxiosHandler().parseNewsletterUrl('', html)
).to.eventually.equal(url)
})
it('returns url when email is from Bloomberg', () => {
it('returns url when email is from Bloomberg', async () => {
const url = 'https://www.bloomberg.com/news/google-is-now-a-partner'
const html = `
<a class="view-in-browser__url" href="${url}">
@ -30,29 +47,31 @@ describe('Newsletter email test', () => {
</a>
`
expect(
await expect(
new BloombergNewsletterHandler().parseNewsletterUrl('', html)
).to.equal(url)
).to.eventually.equal(url)
})
it('returns url when email is from Golang Weekly', () => {
it('returns url when email is from Golang Weekly', async () => {
const url = 'https://www.golangweekly.com/first'
const html = `
<a href="${url}" style="text-decoration: none">Read on the Web</a>
`
expect(new GolangHandler().parseNewsletterUrl('', html)).to.equal(url)
await expect(
new GolangHandler().parseNewsletterUrl('', html)
).to.eventually.equal(url)
})
it('returns url when email is from Morning Brew', () => {
it('returns url when email is from Morning Brew', async () => {
const url = 'https://www.morningbrew.com/daily/issues/first'
const html = `
<a style="color: #000000; text-decoration: none;" target="_blank" rel="noopener" href="${url}">View Online</a>
`
expect(new MorningBrewHandler().parseNewsletterUrl('', html)).to.equal(
url
)
await expect(
new MorningBrewHandler().parseNewsletterUrl('', html)
).to.eventually.equal(url)
})
})
@ -69,4 +88,104 @@ describe('Newsletter email test', () => {
expect(new AxiosHandler().parseAuthor(from)).to.equal('Mike Allen')
})
})
describe('isProbablyNewsletter', () => {
it('returns true for substack newsletter', async () => {
const html = load('./test/data/substack-forwarded-newsletter.html')
await expect(
new SubstackHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.true
})
it('returns true for private forwarded substack newsletter', async () => {
const html = load(
'./test/data/substack-private-forwarded-newsletter.html'
)
await expect(
new SubstackHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.true
})
it('returns false for substack welcome email', async () => {
const html = load('./test/data/substack-forwarded-welcome-email.html')
await expect(
new SubstackHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.false
})
it('returns true for beehiiv.com newsletter', async () => {
const html = load('./test/data/beehiiv-newsletter.html')
await expect(
new BeehiivHandler().isNewsletter({
html,
postHeader: '',
from: '',
unSubHeader: '',
})
).to.eventually.be.true
})
})
describe('findNewsletterUrl', async () => {
it('gets the URL from the header if it is a substack newsletter', async () => {
nock('https://email.mg2.substack.com')
.head(
'/c/eJxNkk2TojAQhn-N3KTyQfg4cGDGchdnYcsZx9K5UCE0EMVAkTiKv36iHnarupNUd7rfVJ4W3EDTj1M89No496Uw0wCxgovuwBgYnbOGsZBVjDHzKPWYU8VehUMWOlIX9Qhw4rKLzXgGZziXnRTcyF7dK0iIGMVOG_OS1aTmKPRDilgVhTQUPCQIcE0x-MFTmJ8rCUpA3KtuenR2urg1ZtAzmszI0tq_Z7m66y-ilQo0uAqMTQ7WRX8auJKg56blZg7WB-iHDuYEBzO6NP0R1IwuYFphQbbTjnTH9NBfs80nym4Zyj8uUvyKbtUyGr5eUz9fNDQ7JCxfJDo9dW1lY9lmj_JNivPbGmf2Pt_lN9tDit9b-WeTetni85Z9pDpVOd7L1E_Vy7egayNO23ZP34eSeLJeux1b0rer_xaZ7ykS78nuSjMY-nL98rparNZNcv07JCjN06_EkTFBxBqOUMACErnELUNMSxTUjLDQZwzcqa4bRjCfeejUEFefS224OLr2S5wxPtij7lVrs80d2CNseRV2P52VNFMBipcdVE-U5jkRD7hFAwpGOylVwU2Mfc9qBh7DoR89yVnWXhgQFHnIsbpVb6tU_B-hH_2yzWY'
)
.reply(302, undefined, {
Location:
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217',
})
.get('/p/companies-that-eat-people-217')
.reply(200, '')
const html = load('./test/data/substack-forwarded-newsletter.html')
const url = await new SubstackHandler().findNewsletterUrl(html)
// Not sure if the redirects from substack expire, this test could eventually fail
expect(url).to.startWith(
'https://newsletter.slowchinese.net/p/companies-that-eat-people-217'
)
})
it('gets the URL from the header if it is a beehiiv newsletter', async () => {
nock('https://u23463625.ct.sendgrid.net')
.head(
'/ss/c/AX1lEgEQaxtvFxLaVo0GBo_geajNrlI1TGeIcmMViR3pL3fEDZnbbkoeKcaY62QZk0KPFudUiUXc_uMLerV4nA/3k5/3TFZmreTR0qKSCgowABnVg/h30/zzLik7UXd1H_n4oyd5W8Xu639AYQQB2UXz-CsssSnno'
)
.reply(302, undefined, {
Location: 'https://www.milkroad.com/p/talked-guy-spent-30m-beeple',
})
.get('/p/talked-guy-spent-30m-beeple')
.reply(200, '')
const html = load('./test/data/beehiiv-newsletter.html')
const url = await new BeehiivHandler().findNewsletterUrl(html)
expect(url).to.startWith(
'https://www.milkroad.com/p/talked-guy-spent-30m-beeple'
)
})
it('returns undefined if it is not a newsletter', async () => {
const html = load('./test/data/substack-forwarded-welcome-email.html')
const url = await new SubstackHandler().findNewsletterUrl(html)
expect(url).to.be.undefined
})
})
describe('generateUniqueUrl', () => {
it('generates a unique URL', () => {
const url1 = generateUniqueUrl()
const url2 = generateUniqueUrl()
expect(url1).to.not.eql(url2)
})
})
})

View File

@ -1,6 +1,6 @@
import { expect } from 'chai'
import 'mocha'
import { getYoutubeVideoId } from '../src/content/youtube-handler'
import { getYoutubeVideoId } from '../src/websites/youtube-handler'
describe('getYoutubeVideoId', () => {
it('should parse video id out of a URL', async () => {

View File

@ -78,7 +78,7 @@ export const inboundEmailHandler = Sentry.GCPFunction.wrapHttpFunction(
try {
// check if it is a confirmation email or forwarding newsletter
const newsletterMessage = handleNewsletter({
const newsletterMessage = await handleNewsletter({
from,
html,
postHeader,

111
yarn.lock
View File

@ -10104,6 +10104,13 @@ brace-expansion@^1.1.7:
balanced-match "^1.0.0"
concat-map "0.0.1"
brace-expansion@^2.0.1:
version "2.0.1"
resolved "https://registry.yarnpkg.com/brace-expansion/-/brace-expansion-2.0.1.tgz#1edc459e0f0c548486ecf9fc99f2221364b9a0ae"
integrity sha512-XnAIvQ8eM+kC6aULx6wuQiwVsnzsi9d3WxzV3FpWTGA19F621kwdbsAcFKXgKUHZWsy+mY6iL1sHTxWEFCytDA==
dependencies:
balanced-match "^1.0.0"
braces@^2.3.1, braces@^2.3.2:
version "2.3.2"
resolved "https://registry.yarnpkg.com/braces/-/braces-2.3.2.tgz#5979fd3f14cd531565e5fa2df1abfff1dfaee729"
@ -10579,6 +10586,19 @@ chai@^4.3.4:
pathval "^1.1.1"
type-detect "^4.0.5"
chai@^4.3.6:
version "4.3.6"
resolved "https://registry.yarnpkg.com/chai/-/chai-4.3.6.tgz#ffe4ba2d9fa9d6680cc0b370adae709ec9011e9c"
integrity sha512-bbcp3YfHCUzMOvKqsztczerVgBKSsEijCySNlHHbX3VG1nskvqjz5Rfso1gGwD6w6oOV3eI60pKuMOV5MV7p3Q==
dependencies:
assertion-error "^1.1.0"
check-error "^1.0.2"
deep-eql "^3.0.1"
get-func-name "^2.0.0"
loupe "^2.3.1"
pathval "^1.1.1"
type-detect "^4.0.5"
chalk@^1.0.0, chalk@^1.1.3:
version "1.1.3"
resolved "https://registry.yarnpkg.com/chalk/-/chalk-1.1.3.tgz#a8115c55e4a702fe4d150abd3872822a7e09fc98"
@ -10748,6 +10768,21 @@ chokidar@3.5.2:
optionalDependencies:
fsevents "~2.3.2"
chokidar@3.5.3, chokidar@^3.4.1, chokidar@^3.4.2, chokidar@^3.5.1, chokidar@^3.5.2, chokidar@^3.5.3:
version "3.5.3"
resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd"
integrity sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==
dependencies:
anymatch "~3.1.2"
braces "~3.0.2"
glob-parent "~5.1.2"
is-binary-path "~2.1.0"
is-glob "~4.0.1"
normalize-path "~3.0.0"
readdirp "~3.6.0"
optionalDependencies:
fsevents "~2.3.2"
chokidar@^2.1.8:
version "2.1.8"
resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-2.1.8.tgz#804b3a7b6a99358c3c5c61e71d8728f041cff917"
@ -10767,21 +10802,6 @@ chokidar@^2.1.8:
optionalDependencies:
fsevents "^1.2.7"
chokidar@^3.4.1, chokidar@^3.4.2, chokidar@^3.5.1, chokidar@^3.5.2, chokidar@^3.5.3:
version "3.5.3"
resolved "https://registry.yarnpkg.com/chokidar/-/chokidar-3.5.3.tgz#1cf37c8707b932bd1af1ae22c0432e2acd1903bd"
integrity sha512-Dr3sfKRP6oTcjf2JmUmFJfeVMvXBdegxB0iVQ5eb2V10uFJUCAS8OByZdVAyVb8xXNz3GjjTgj9kLWsZTqE6kw==
dependencies:
anymatch "~3.1.2"
braces "~3.0.2"
glob-parent "~5.1.2"
is-binary-path "~2.1.0"
is-glob "~4.0.1"
normalize-path "~3.0.0"
readdirp "~3.6.0"
optionalDependencies:
fsevents "~2.3.2"
chownr@^1.1.1, chownr@^1.1.4:
version "1.1.4"
resolved "https://registry.yarnpkg.com/chownr/-/chownr-1.1.4.tgz#6fc9d7b42d32a583596337666e7d08084da2cc6b"
@ -14489,7 +14509,7 @@ glob@7.1.7:
once "^1.3.0"
path-is-absolute "^1.0.0"
glob@^7.1.1, glob@^7.1.2, glob@^7.1.3, glob@^7.1.4, glob@^7.1.6, glob@^7.2.0:
glob@7.2.0, glob@^7.1.1, glob@^7.1.2, glob@^7.1.3, glob@^7.1.4, glob@^7.1.6, glob@^7.2.0:
version "7.2.0"
resolved "https://registry.yarnpkg.com/glob/-/glob-7.2.0.tgz#d15535af7732e02e948f4c41628bd910293f6023"
integrity sha512-lmLf6gtyrPq8tTjSmrO94wBeQbFR3HbLHbuyD69wuyQkImp2hWqMGB47OX65FBkPffO641IP9jWa1z4ivqG26Q==
@ -18065,6 +18085,13 @@ loose-envify@^1.0.0, loose-envify@^1.1.0, loose-envify@^1.4.0:
dependencies:
js-tokens "^3.0.0 || ^4.0.0"
loupe@^2.3.1:
version "2.3.4"
resolved "https://registry.yarnpkg.com/loupe/-/loupe-2.3.4.tgz#7e0b9bffc76f148f9be769cb1321d3dcf3cb25f3"
integrity sha512-OvKfgCC2Ndby6aSTREl5aCCPTNIzlDfQZvZxNUrBrihDhL3xcrYegTblhmEiCrg2kKQz4XsFIaemE5BF4ybSaQ==
dependencies:
get-func-name "^2.0.0"
lower-case-first@^1.0.0:
version "1.0.2"
resolved "https://registry.yarnpkg.com/lower-case-first/-/lower-case-first-1.0.2.tgz#e5da7c26f29a7073be02d52bac9980e5922adfa1"
@ -18621,6 +18648,13 @@ minimatch@3.0.4:
dependencies:
brace-expansion "^1.1.7"
minimatch@5.0.1:
version "5.0.1"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-5.0.1.tgz#fb9022f7528125187c92bd9e9b6366be1cf3415b"
integrity sha512-nLDxIFRyhDblz3qMuq+SoRZED4+miJ/G+tdDrjkkkRnjAsBexeGpgjLEQ0blJy7rHhR2b93rhQY4SvyWu9v03g==
dependencies:
brace-expansion "^2.0.1"
minimatch@^3.0.2, minimatch@^3.0.4:
version "3.1.2"
resolved "https://registry.yarnpkg.com/minimatch/-/minimatch-3.1.2.tgz#19cd194bfd3e428f049a70817c038d89ab4be35b"
@ -18803,6 +18837,34 @@ mocha-unfunk-reporter@^0.4.0:
miniwrite "~0.1.3"
unfunk-diff "~0.0.1"
mocha@^10.0.0:
version "10.0.0"
resolved "https://registry.yarnpkg.com/mocha/-/mocha-10.0.0.tgz#205447d8993ec755335c4b13deba3d3a13c4def9"
integrity sha512-0Wl+elVUD43Y0BqPZBzZt8Tnkw9CMUdNYnUsTfOM1vuhJVZL+kiesFYsqwBkEEuEixaiPe5ZQdqDgX2jddhmoA==
dependencies:
"@ungap/promise-all-settled" "1.1.2"
ansi-colors "4.1.1"
browser-stdout "1.3.1"
chokidar "3.5.3"
debug "4.3.4"
diff "5.0.0"
escape-string-regexp "4.0.0"
find-up "5.0.0"
glob "7.2.0"
he "1.2.0"
js-yaml "4.1.0"
log-symbols "4.1.0"
minimatch "5.0.1"
ms "2.1.3"
nanoid "3.3.3"
serialize-javascript "6.0.0"
strip-json-comments "3.1.1"
supports-color "8.1.1"
workerpool "6.2.1"
yargs "16.2.0"
yargs-parser "20.2.4"
yargs-unparser "2.0.0"
mocha@^8.2.0:
version "8.4.0"
resolved "https://registry.yarnpkg.com/mocha/-/mocha-8.4.0.tgz#677be88bf15980a3cae03a73e10a0fc3997f0cff"
@ -18965,7 +19027,7 @@ nan@^2.12.1:
resolved "https://registry.yarnpkg.com/nan/-/nan-2.15.0.tgz#3f34a473ff18e15c1b5626b62903b5ad6e665fee"
integrity sha512-8ZtvEnA2c5aYCZYd1cvgdnU6cqwixRoYg70xPLWUws5ORTa/lnw+u4amixRS/Ac5U5mQVgp9pnlSUnbNWFaWZQ==
nanoid@*, nanoid@^3.1.23, nanoid@^3.1.25, nanoid@^3.1.29, nanoid@^3.1.30, nanoid@^3.3.1:
nanoid@*, nanoid@3.3.3, nanoid@^3.1.23, nanoid@^3.1.25, nanoid@^3.1.29, nanoid@^3.1.30, nanoid@^3.3.1:
version "3.3.3"
resolved "https://registry.yarnpkg.com/nanoid/-/nanoid-3.3.3.tgz#fd8e8b7aa761fe807dba2d1b98fb7241bb724a25"
integrity sha512-p1sjXuopFs0xg+fPASzQ28agW1oHD7xDsd9Xkf3T15H3c/cifrFHVwrh74PdoklAPi+i7MdRsE47vm2r6JoB+w==
@ -19136,6 +19198,16 @@ nock@^13.2.4:
lodash.set "^4.3.2"
propagate "^2.0.0"
nock@^13.2.9:
version "13.2.9"
resolved "https://registry.yarnpkg.com/nock/-/nock-13.2.9.tgz#4faf6c28175d36044da4cfa68e33e5a15086ad4c"
integrity sha512-1+XfJNYF1cjGB+TKMWi29eZ0b82QOvQs2YoLNzbpWGqFMtRQHTa57osqdGj4FrFPgkO4D4AZinzUJR9VvW3QUA==
dependencies:
debug "^4.1.0"
json-stringify-safe "^5.0.1"
lodash "^4.17.21"
propagate "^2.0.0"
node-addon-api@^1.2.0:
version "1.7.2"
resolved "https://registry.yarnpkg.com/node-addon-api/-/node-addon-api-1.7.2.tgz#3df30b95720b53c24e59948b49532b662444f54d"
@ -25608,6 +25680,11 @@ workerpool@6.1.5:
resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.1.5.tgz#0f7cf076b6215fd7e1da903ff6f22ddd1886b581"
integrity sha512-XdKkCK0Zqc6w3iTxLckiuJ81tiD/o5rBE/m+nXpRCB+/Sq4DqkfXZ/x0jW02DG1tGsfUGXbTJyZDP+eu67haSw==
workerpool@6.2.1:
version "6.2.1"
resolved "https://registry.yarnpkg.com/workerpool/-/workerpool-6.2.1.tgz#46fc150c17d826b86a008e5a4508656777e9c343"
integrity sha512-ILEIE97kDZvF9Wb9f6h5aXK4swSlKGUcOEGiIYb2OOu/IrDU9iwj0fD//SsA6E5ibwJxpEvhullJY4Sl4GcpAw==
wrap-ansi@^3.0.1:
version "3.0.1"
resolved "https://registry.yarnpkg.com/wrap-ansi/-/wrap-ansi-3.0.1.tgz#288a04d87eda5c286e060dfe8f135ce8d007f8ba"