Add generic newsletter handler

This commit is contained in:
Hongbo Wu
2023-02-03 14:46:24 +08:00
parent 1458395630
commit 363489f5be
4 changed files with 62 additions and 31 deletions

View File

@ -39,16 +39,12 @@ export interface PreHandleResult {
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()
export class ContentHandler {
export abstract class ContentHandler {
protected senderRegex: RegExp
protected urlRegex: RegExp
name: string
// newsletter url text regex for newsletters that don't have a newsletter header
NEWSLETTER_URL_TEXT_REGEX =
/((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i
constructor() {
protected constructor() {
this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
this.name = 'Handler name'
@ -84,20 +80,16 @@ export class ContentHandler {
headers: Record<string, string | string[]>
dom: Document
}): Promise<boolean> {
const postHeader = input.headers['list-post'] || input.headers['list-id']
const re = new RegExp(this.senderRegex)
const postHeader = input.headers['list-post']
const unSubHeader = input.headers['list-unsubscribe']
return Promise.resolve(!!postHeader || !!unSubHeader)
return Promise.resolve(
re.test(input.from) && (!!postHeader || !!unSubHeader)
)
}
findNewsletterHeaderHref(dom: Document): string | undefined {
const readOnline = dom.querySelectorAll('a')
let res: string | undefined = undefined
readOnline.forEach((e) => {
if (e.textContent && this.NEWSLETTER_URL_TEXT_REGEX.test(e.textContent)) {
res = e.getAttribute('href') || undefined
}
})
return res
return undefined
}
// Given an HTML blob tries to find a URL to use for
@ -129,24 +121,11 @@ export class ContentHandler {
headers: Record<string, string | string[]>,
html: string
): Promise<string | undefined> {
// raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
// we need to get the real url from the raw url
const postHeader = headers['list-post']?.toString()
if (postHeader && addressparser(postHeader).length > 0) {
return addressparser(postHeader)[0].name
}
const url = await this.findNewsletterUrl(html)
if (url) {
return url
}
// get newsletter url from html
const matches = html.match(this.urlRegex)
if (matches) {
return Promise.resolve(matches[1])
}
return Promise.resolve(undefined)
}

View File

@ -30,6 +30,7 @@ import { CooperPressHandler } from './newsletters/cooper-press-handler'
import { HeyWorldHandler } from './newsletters/hey-world-handler'
import { Browser } from 'puppeteer-core'
import { StackOverflowHandler } from './websites/stack-overflow-handler'
import { GenericHandler } from './newsletters/generic-handler'
const validateUrlString = (url: string) => {
const u = new URL(url)
@ -80,6 +81,7 @@ const newsletterHandlers: ContentHandler[] = [
new GhostHandler(),
new CooperPressHandler(),
new HeyWorldHandler(),
new GenericHandler(),
]
export const preHandleContent = async (

View File

@ -0,0 +1,49 @@
import { ContentHandler } from '../content-handler'
import addressparser from 'addressparser'
/**
 * Catch-all newsletter handler that recognises an email as a newsletter from
 * its mailing-list headers alone, without matching on the sender.
 */
export class GenericHandler extends ContentHandler {
  // newsletter url text regex for newsletters that don't have a newsletter header
  NEWSLETTER_URL_TEXT_REGEX =
    /((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i

  constructor() {
    super()
    this.name = 'Generic Newsletter'
  }

  /**
   * Reports whether the email carries any mailing-list header.
   *
   * @param input - the email; only `input.headers` is inspected here.
   * @returns true when a `list-post`, `list-id` or `list-unsubscribe` header
   *   is present with a truthy value.
   */
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const postHeader = input.headers['list-post'] || input.headers['list-id']
    const unSubHeader = input.headers['list-unsubscribe']
    return Promise.resolve(!!postHeader || !!unSubHeader)
  }

  /**
   * Looks for a "view online"-style link in the document.
   *
   * @param dom - parsed email body.
   * @returns the `href` of the last anchor whose text matches
   *   NEWSLETTER_URL_TEXT_REGEX, or undefined when no anchor matches or the
   *   last matching anchor has no (or an empty) `href`.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    const readOnline = dom.querySelectorAll('a')
    let res: string | undefined = undefined
    // No early exit: every anchor is visited, so a later match overrides an
    // earlier one.
    readOnline.forEach((e) => {
      if (e.textContent && this.NEWSLETTER_URL_TEXT_REGEX.test(e.textContent)) {
        res = e.getAttribute('href') || undefined
      }
    })
    return res
  }

  /**
   * Resolves the newsletter's web URL, preferring the `list-post` header and
   * falling back to `findNewsletterUrl(html)` (not defined in this class).
   *
   * @param headers - email headers keyed by header name.
   * @param html - raw HTML body of the email.
   * @returns the URL taken from the header when it yields a non-empty value,
   *   otherwise whatever `findNewsletterUrl` resolves to.
   */
  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader) {
      // Parse once and only trust a non-empty name. A header that parses to an
      // entry with an empty name (presumably e.g. `<mailto:list@host>`, which
      // addressparser reports as an address rather than a name) must not
      // short-circuit the HTML fallback with an empty string.
      const [firstEntry] = addressparser(postHeader)
      const headerUrl = firstEntry?.name
      if (headerUrl) {
        return headerUrl
      }
    }
    return this.findNewsletterUrl(html)
  }
}

View File

@ -18,6 +18,7 @@ import { CooperPressHandler } from '../src/newsletters/cooper-press-handler'
import { getNewsletterHandler } from '../src'
import { parseHTML } from 'linkedom'
import { HeyWorldHandler } from '../src/newsletters/hey-world-handler'
import { GenericHandler } from '../src/newsletters/generic-handler'
chai.use(chaiAsPromised)
chai.use(chaiString)
@ -97,7 +98,7 @@ describe('Newsletter email test', () => {
const html = load('./test/data/ttso-newsletter.html')
await expect(
new ContentHandler().parseNewsletterUrl({}, html)
new GenericHandler().parseNewsletterUrl({}, html)
).to.eventually.equal(url)
})
})
@ -414,7 +415,7 @@ describe('Newsletter email test', () => {
it('gets the URL from the header', async () => {
const html = load('./test/data/ttso-newsletter.html')
const url = await new ContentHandler().findNewsletterUrl(html)
const url = await new GenericHandler().findNewsletterUrl(html)
expect(url).to.startWith('https://ttso.paris/2023-01-31')
})
})