Add other newsletter handlers
This commit is contained in:
@ -1,6 +1,8 @@
|
||||
import addressparser from 'addressparser'
|
||||
import rfc2047 from 'rfc2047'
|
||||
import { v4 as uuidv4 } from 'uuid'
|
||||
import { v4 as uuid } from 'uuid'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import axios from 'axios'
|
||||
|
||||
interface Unsubscribe {
|
||||
mailTo?: string
|
||||
@ -34,16 +36,17 @@ export interface PreHandleResult {
|
||||
dom?: Document
|
||||
}
|
||||
|
||||
// Prefix shared by every placeholder URL produced by generateUniqueUrl().
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='

// Builds a placeholder URL by appending a freshly generated UUID to the prefix.
export const generateUniqueUrl = () => `${FAKE_URL_PREFIX}${uuid()}`
|
||||
|
||||
export abstract class ContentHandler {
|
||||
protected senderRegex: RegExp
|
||||
protected urlRegex: RegExp
|
||||
protected defaultUrl: string
|
||||
public name: string
|
||||
name: string
|
||||
|
||||
// Seeds every field with a placeholder value. The concrete handlers call
// super() first and then overwrite the fields they care about.
protected constructor() {
  this.name = 'Handler name'
  this.defaultUrl = 'NEWSLETTER_DEFAULT_URL'
  this.urlRegex = /NEWSLETTER_URL_REGEX/
  this.senderRegex = /NEWSLETTER_SENDER_REGEX/
}
|
||||
|
||||
@ -63,17 +66,57 @@ export abstract class ContentHandler {
|
||||
return Promise.resolve({ url, dom })
|
||||
}
|
||||
|
||||
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
|
||||
return false
|
||||
// Default newsletter check: the sender has to match `senderRegex` and at least
// one of the post / unsubscribe headers has to be non-empty.
// `html` is part of the input shape but is not read by this implementation.
async isNewsletter(input: {
  postHeader: string
  from: string
  unSubHeader: string
  html?: string
}): Promise<boolean> {
  const { from, postHeader, unSubHeader } = input
  const senderPattern = new RegExp(this.senderRegex)
  if (!senderPattern.test(from)) {
    return false
  }
  return Boolean(postHeader) || Boolean(unSubHeader)
}
|
||||
|
||||
parseNewsletterUrl(_postHeader: string, html: string): string | undefined {
|
||||
// Looks up the href of the newsletter's header / "view online" link in `dom`.
// This base implementation never finds one; handlers that can locate such a
// link override it.
findNewsletterHeaderHref(dom: Document): string | undefined {
|
||||
return undefined
|
||||
}
|
||||
|
||||
// Given an HTML blob, tries to find a URL to use as the canonical URL.
//
// The candidate link comes from findNewsletterHeaderHref(); when the handler
// does not find one this resolves to undefined. Otherwise it resolves to the
// URL the link redirects to, or to the link itself when the redirect target
// cannot be determined.
async findNewsletterUrl(html: string): Promise<string | undefined> {
  const dom = parseHTML(html).document

  // Ask the handler for its header link
  const href = this.findNewsletterHeaderHref(dom)
  if (!href) {
    return undefined
  }

  // Try to make a HEAD request, so we get the redirected URL, since these
  // will usually be behind tracking url redirects
  // NOTE(review): href comes from the email body, so this issues a request to
  // an address chosen by the sender - confirm that is acceptable where this runs.
  try {
    const response = await axios.head(href, { timeout: 5000 })
    // eslint-disable-next-line @typescript-eslint/no-unsafe-member-access
    const redirectedUrl = response.request.res.responseUrl as
      | string
      | undefined
    // Keep the link we already have when the response reports no final URL,
    // instead of throwing the known href away
    return redirectedUrl || href
  } catch (e) {
    console.log('error making HEAD request', e)
    return href
  }
}
|
||||
|
||||
async parseNewsletterUrl(
|
||||
_postHeader: string,
|
||||
html: string
|
||||
): Promise<string | undefined> {
|
||||
// get newsletter url from html
|
||||
const matches = html.match(this.urlRegex)
|
||||
if (matches) {
|
||||
return matches[1]
|
||||
return Promise.resolve(matches[1])
|
||||
}
|
||||
return undefined
|
||||
return Promise.resolve(undefined)
|
||||
}
|
||||
|
||||
parseAuthor(from: string): string {
|
||||
@ -97,14 +140,14 @@ export abstract class ContentHandler {
|
||||
}
|
||||
}
|
||||
|
||||
handleNewsletter({
|
||||
async handleNewsletter({
|
||||
email,
|
||||
html,
|
||||
postHeader,
|
||||
title,
|
||||
from,
|
||||
unSubHeader,
|
||||
}: NewsletterInput): NewsletterResult {
|
||||
}: NewsletterInput): Promise<NewsletterResult> {
|
||||
console.log('handleNewsletter', email, postHeader, title, from)
|
||||
|
||||
if (!email || !html || !title || !from) {
|
||||
@ -115,8 +158,7 @@ export abstract class ContentHandler {
|
||||
// fallback to default url if newsletter url does not exist
|
||||
// assign a random uuid to the default url to avoid duplicate url
|
||||
const url =
|
||||
this.parseNewsletterUrl(postHeader, html) ||
|
||||
`${this.defaultUrl}?source=newsletters&id=${uuidv4()}`
|
||||
(await this.parseNewsletterUrl(postHeader, html)) || generateUniqueUrl()
|
||||
const author = this.parseAuthor(from)
|
||||
const unsubscribe = this.parseUnsubscribe(unSubHeader)
|
||||
|
||||
|
||||
@ -1,13 +1,14 @@
|
||||
import { AppleNewsHandler } from './content/apple-news-handler'
|
||||
import { BloombergHandler } from './content/bloomberg-handler'
|
||||
import { DerstandardHandler } from './content/derstandard-handler'
|
||||
import { ImageHandler } from './content/image-handler'
|
||||
import { MediumHandler } from './content/medium-handler'
|
||||
import { PdfHandler } from './content/pdf-handler'
|
||||
import { ScrapingBeeHandler } from './content/scrapingBee-handler'
|
||||
import { TDotCoHandler } from './content/t-dot-co-handler'
|
||||
import { TwitterHandler } from './content/twitter-handler'
|
||||
import { YoutubeHandler } from './content/youtube-handler'
|
||||
import { AppleNewsHandler } from './websites/apple-news-handler'
|
||||
import { BloombergHandler } from './websites/bloomberg-handler'
|
||||
import { DerstandardHandler } from './websites/derstandard-handler'
|
||||
import { ImageHandler } from './websites/image-handler'
|
||||
import { MediumHandler } from './websites/medium-handler'
|
||||
import { PdfHandler } from './websites/pdf-handler'
|
||||
import { ScrapingBeeHandler } from './websites/scrapingBee-handler'
|
||||
import { TDotCoHandler } from './websites/t-dot-co-handler'
|
||||
import { TwitterHandler } from './websites/twitter-handler'
|
||||
import { YoutubeHandler } from './websites/youtube-handler'
|
||||
import { WikipediaHandler } from './websites/wikipedia-handler'
|
||||
import {
|
||||
ContentHandler,
|
||||
NewsletterInput,
|
||||
@ -19,7 +20,9 @@ import { AxiosHandler } from './newsletters/axios-handler'
|
||||
import { GolangHandler } from './newsletters/golang-handler'
|
||||
import { MorningBrewHandler } from './newsletters/morning-brew-handler'
|
||||
import { BloombergNewsletterHandler } from './newsletters/bloomberg-newsletter-handler'
|
||||
import { WikipediaHandler } from './content/wikipedia-handler'
|
||||
import { BeehiivHandler } from './newsletters/beehiiv-handler'
|
||||
import { ConvertkitHandler } from './newsletters/convertkit-handler'
|
||||
import { RevueHandler } from './newsletters/revue-handler'
|
||||
|
||||
const validateUrlString = (url: string) => {
|
||||
const u = new URL(url)
|
||||
@ -57,6 +60,10 @@ const newsletterHandlers: ContentHandler[] = [
|
||||
new GolangHandler(),
|
||||
new SubstackHandler(),
|
||||
new MorningBrewHandler(),
|
||||
new SubstackHandler(),
|
||||
new BeehiivHandler(),
|
||||
new ConvertkitHandler(),
|
||||
new RevueHandler(),
|
||||
]
|
||||
|
||||
export const preHandleContent = async (
|
||||
@ -91,11 +98,11 @@ export const preHandleContent = async (
|
||||
return undefined
|
||||
}
|
||||
|
||||
export const handleNewsletter = (
|
||||
export const handleNewsletter = async (
|
||||
input: NewsletterInput
|
||||
): NewsletterResult | undefined => {
|
||||
): Promise<NewsletterResult | undefined> => {
|
||||
for (const handler of newsletterHandlers) {
|
||||
if (handler.isNewsletter(input.postHeader, input.from, input.unSubHeader)) {
|
||||
if (await handler.isNewsletter(input)) {
|
||||
return handler.handleNewsletter(input)
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@ export class AxiosHandler extends ContentHandler {
|
||||
super()
|
||||
this.senderRegex = /<.+@axios.com>/
|
||||
this.urlRegex = /View in browser at <a.*>(.*)<\/a>/
|
||||
this.defaultUrl = 'https://axios.com'
|
||||
this.name = 'axios'
|
||||
}
|
||||
|
||||
@ -44,10 +43,4 @@ export class AxiosHandler extends ContentHandler {
|
||||
|
||||
return Promise.resolve({ dom })
|
||||
}
|
||||
|
||||
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
|
||||
// Axios newsletter is from <xx@axios.com>
|
||||
const re = new RegExp(this.senderRegex)
|
||||
return re.test(from) && (!!postHeader || !!unSubHeader)
|
||||
}
|
||||
}
|
||||
|
||||
43
packages/content-handler/src/newsletters/beehiiv-handler.ts
Normal file
43
packages/content-handler/src/newsletters/beehiiv-handler.ts
Normal file
@ -0,0 +1,43 @@
|
||||
import { ContentHandler } from '../content-handler'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
// Newsletter handler for beehiiv emails.
//
// An email is treated as a beehiiv newsletter when it embeds an image whose
// src contains "beehiiv.net" and also carries a "Read Online" link.
export class BeehiivHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'beehiiv'
  }

  // Returns the href of the "Read Online" link, or undefined when there is
  // none. When several links match, the last one decides the result.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('table tr td div a[class*="link"]')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      // Trim so that whitespace around the link text in the markup does not
      // hide the link
      if ((e.textContent || '').trim() === 'Read Online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  // Detects beehiiv emails purely from the HTML body; the headers in `input`
  // are not consulted.
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    // The base class declares html as optional, so never hand a missing body
    // to the parser
    if (!input.html) {
      return false
    }
    const dom = parseHTML(input.html).document
    if (dom.querySelectorAll('img[src*="beehiiv.net"]').length === 0) {
      return false
    }
    return Boolean(this.findNewsletterHeaderHref(dom))
  }

  // The post header is ignored; the URL is resolved from the "Read Online"
  // link via findNewsletterUrl().
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
|
||||
@ -5,7 +5,6 @@ export class BloombergNewsletterHandler extends ContentHandler {
|
||||
super()
|
||||
this.senderRegex = /<.+@mail.bloomberg.*.com>/
|
||||
this.urlRegex = /<a class="view-in-browser__url" href=["']([^"']*)["']/
|
||||
this.defaultUrl = 'https://www.bloomberg.com'
|
||||
this.name = 'bloomberg'
|
||||
}
|
||||
|
||||
@ -35,10 +34,4 @@ export class BloombergNewsletterHandler extends ContentHandler {
|
||||
|
||||
return Promise.resolve({ dom })
|
||||
}
|
||||
|
||||
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
|
||||
// Axios newsletter is from <xx@axios.com>
|
||||
const re = new RegExp(this.senderRegex)
|
||||
return re.test(from) && (!!postHeader || !!unSubHeader)
|
||||
}
|
||||
}
|
||||
|
||||
@ -0,0 +1,41 @@
|
||||
import { ContentHandler } from '../content-handler'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
// Newsletter handler for ConvertKit emails.
//
// An email is treated as a ConvertKit newsletter when it embeds an image whose
// src contains "convertkit.com" or "convertkit-mail.com".
export class ConvertkitHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'convertkit'
  }

  // Returns the href of the "View this email in your browser" link, or
  // undefined when there is none. When several links match, the last one
  // decides the result.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('table tr td a')
    let res: string | undefined = undefined
    readOnline.forEach((e) => {
      // Trim so that whitespace around the link text in the markup does not
      // hide the link
      if ((e.textContent || '').trim() === 'View this email in your browser') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  // Detects ConvertKit emails purely from the HTML body; the headers in
  // `input` are not consulted.
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    // The base class declares html as optional, so never hand a missing body
    // to the parser
    if (!input.html) {
      return false
    }
    const dom = parseHTML(input.html).document
    const trackingImages = dom.querySelectorAll(
      'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]'
    )
    return trackingImages.length > 0
  }

  // The post header is ignored; the URL is resolved from the "view in
  // browser" link via findNewsletterUrl().
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
|
||||
@ -5,7 +5,6 @@ export class GolangHandler extends ContentHandler {
|
||||
super()
|
||||
this.senderRegex = /<.+@golangweekly.com>/
|
||||
this.urlRegex = /<a href=["']([^"']*)["'].*>Read on the Web<\/a>/
|
||||
this.defaultUrl = 'https://golangweekly.com'
|
||||
this.name = 'golangweekly'
|
||||
}
|
||||
|
||||
@ -25,10 +24,4 @@ export class GolangHandler extends ContentHandler {
|
||||
|
||||
return Promise.resolve({ dom })
|
||||
}
|
||||
|
||||
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
|
||||
// Axios newsletter is from <xx@axios.com>
|
||||
const re = new RegExp(this.senderRegex)
|
||||
return re.test(from) && (!!postHeader || !!unSubHeader)
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,7 +5,6 @@ export class MorningBrewHandler extends ContentHandler {
|
||||
super()
|
||||
this.senderRegex = /Morning Brew <crew@morningbrew.com>/
|
||||
this.urlRegex = /<a.* href=["']([^"']*)["'].*>View Online<\/a>/
|
||||
this.defaultUrl = 'https://www.morningbrew.com'
|
||||
this.name = 'morningbrew'
|
||||
}
|
||||
|
||||
@ -33,10 +32,4 @@ export class MorningBrewHandler extends ContentHandler {
|
||||
|
||||
return Promise.resolve({ dom })
|
||||
}
|
||||
|
||||
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
|
||||
// Axios newsletter is from <xx@axios.com>
|
||||
const re = new RegExp(this.senderRegex)
|
||||
return re.test(from) && (!!postHeader || !!unSubHeader)
|
||||
}
|
||||
}
|
||||
|
||||
46
packages/content-handler/src/newsletters/revue-handler.ts
Normal file
46
packages/content-handler/src/newsletters/revue-handler.ts
Normal file
@ -0,0 +1,46 @@
|
||||
import { ContentHandler } from '../content-handler'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
// Newsletter handler for Revue emails.
//
// An email is treated as a Revue newsletter when it embeds an image whose src
// contains "getrevue.co" or "revue.email" and also carries a "View online"
// link.
export class RevueHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'revue'
  }

  // Returns the href of the "View online" link, or undefined when there is
  // none. When several links match, the last one decides the result.
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const viewOnline = dom.querySelectorAll('table tr td a[target="_blank"]')
    let res: string | undefined = undefined
    viewOnline.forEach((e) => {
      // Trim so that whitespace around the link text in the markup does not
      // hide the link
      if ((e.textContent || '').trim() === 'View online') {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  // Detects Revue emails purely from the HTML body; the headers in `input`
  // are not consulted.
  async isNewsletter(input: {
    postHeader: string
    from: string
    unSubHeader: string
    html?: string
  }): Promise<boolean> {
    // The base class declares html as optional, so never hand a missing body
    // to the parser
    if (!input.html) {
      return false
    }
    const dom = parseHTML(input.html).document
    const revueImages = dom.querySelectorAll(
      'img[src*="getrevue.co"], img[src*="revue.email"]'
    )
    if (revueImages.length === 0) {
      return false
    }
    return Boolean(this.findNewsletterHeaderHref(dom))
  }

  // The post header is ignored; the URL is resolved from the "View online"
  // link via findNewsletterUrl().
  async parseNewsletterUrl(
    postHeader: string,
    html: string
  ): Promise<string | undefined> {
    return this.findNewsletterUrl(html)
  }
}
|
||||
@ -1,10 +1,10 @@
|
||||
import addressparser from 'addressparser'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
export class SubstackHandler extends ContentHandler {
|
||||
constructor() {
|
||||
super()
|
||||
this.defaultUrl = 'https://www.substack.com'
|
||||
this.name = 'substack'
|
||||
}
|
||||
|
||||
@ -38,15 +38,53 @@ export class SubstackHandler extends ContentHandler {
|
||||
return Promise.resolve(dom)
|
||||
}
|
||||
|
||||
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
|
||||
return !!postHeader
|
||||
// Returns the href of the first link nested inside an <h1>, or undefined when
// there is no such link or it has no (non-empty) href.
findNewsletterHeaderHref(dom: Document): string | undefined {
  // Substack header links
  const headerLink = dom.querySelector('h1 a ')
  const href = headerLink ? headerLink.getAttribute('href') : null
  return href || undefined
}
|
||||
|
||||
parseNewsletterUrl(postHeader: string, html: string): string | undefined {
|
||||
// Decides whether an email is a Substack newsletter.
//
// A non-empty post header is accepted straight away. Otherwise the HTML body
// is inspected: either a table whose class ends in "post-meta" is present, or
// the body has both a header link and one of the Substack action icons.
async isNewsletter({
  postHeader,
  html,
}: {
  postHeader: string
  from: string
  unSubHeader: string
  html?: string
}): Promise<boolean> {
  if (postHeader) {
    return true
  }
  // The base class declares html as optional, so never hand a missing body to
  // the parser
  if (!html) {
    return false
  }
  const dom = parseHTML(html).document
  // substack newsletter emails have tables with a *post-meta class
  if (dom.querySelector('table[class$="post-meta"]')) {
    return true
  }
  // If the article has a header link, and substack icons its probably a newsletter
  const href = this.findNewsletterHeaderHref(dom)
  if (!href) {
    return false
  }
  const heartIcon = dom.querySelector(
    'table tbody td span a img[src*="HeartIcon"]'
  )
  const recommendIcon = dom.querySelector(
    'table tbody td span a img[src*="RecommendIconRounded"]'
  )
  return Boolean(heartIcon || recommendIcon)
}
|
||||
|
||||
async parseNewsletterUrl(
|
||||
postHeader: string,
|
||||
html: string
|
||||
): Promise<string | undefined> {
|
||||
// raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
|
||||
// we need to get the real url from the raw url
|
||||
return addressparser(postHeader).length > 0
|
||||
? addressparser(postHeader)[0].name
|
||||
: undefined
|
||||
if (postHeader && addressparser(postHeader).length > 0) {
|
||||
return Promise.resolve(addressparser(postHeader)[0].name)
|
||||
}
|
||||
return this.findNewsletterUrl(html)
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user