Add generic newsletter handler
This commit is contained in:
@ -39,16 +39,12 @@ export interface PreHandleResult {
|
||||
export const FAKE_URL_PREFIX = 'https://omnivore.app/no_url?q='
|
||||
export const generateUniqueUrl = () => FAKE_URL_PREFIX + uuid()
|
||||
|
||||
export class ContentHandler {
|
||||
export abstract class ContentHandler {
|
||||
protected senderRegex: RegExp
|
||||
protected urlRegex: RegExp
|
||||
name: string
|
||||
|
||||
// newsletter url text regex for newsletters that don't have a newsletter header
|
||||
NEWSLETTER_URL_TEXT_REGEX =
|
||||
/((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i
|
||||
|
||||
constructor() {
|
||||
protected constructor() {
|
||||
this.senderRegex = new RegExp(/NEWSLETTER_SENDER_REGEX/)
|
||||
this.urlRegex = new RegExp(/NEWSLETTER_URL_REGEX/)
|
||||
this.name = 'Handler name'
|
||||
@ -84,20 +80,16 @@ export class ContentHandler {
|
||||
headers: Record<string, string | string[]>
|
||||
dom: Document
|
||||
}): Promise<boolean> {
|
||||
const postHeader = input.headers['list-post'] || input.headers['list-id']
|
||||
const re = new RegExp(this.senderRegex)
|
||||
const postHeader = input.headers['list-post']
|
||||
const unSubHeader = input.headers['list-unsubscribe']
|
||||
return Promise.resolve(!!postHeader || !!unSubHeader)
|
||||
return Promise.resolve(
|
||||
re.test(input.from) && (!!postHeader || !!unSubHeader)
|
||||
)
|
||||
}
|
||||
|
||||
findNewsletterHeaderHref(dom: Document): string | undefined {
|
||||
const readOnline = dom.querySelectorAll('a')
|
||||
let res: string | undefined = undefined
|
||||
readOnline.forEach((e) => {
|
||||
if (e.textContent && this.NEWSLETTER_URL_TEXT_REGEX.test(e.textContent)) {
|
||||
res = e.getAttribute('href') || undefined
|
||||
}
|
||||
})
|
||||
return res
|
||||
return undefined
|
||||
}
|
||||
|
||||
// Given an HTML blob tries to find a URL to use for
|
||||
@ -129,24 +121,11 @@ export class ContentHandler {
|
||||
headers: Record<string, string | string[]>,
|
||||
html: string
|
||||
): Promise<string | undefined> {
|
||||
// raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
|
||||
// we need to get the real url from the raw url
|
||||
const postHeader = headers['list-post']?.toString()
|
||||
if (postHeader && addressparser(postHeader).length > 0) {
|
||||
return addressparser(postHeader)[0].name
|
||||
}
|
||||
|
||||
const url = await this.findNewsletterUrl(html)
|
||||
if (url) {
|
||||
return url
|
||||
}
|
||||
|
||||
// get newsletter url from html
|
||||
const matches = html.match(this.urlRegex)
|
||||
if (matches) {
|
||||
return Promise.resolve(matches[1])
|
||||
}
|
||||
|
||||
return Promise.resolve(undefined)
|
||||
}
|
||||
|
||||
|
||||
@ -30,6 +30,7 @@ import { CooperPressHandler } from './newsletters/cooper-press-handler'
|
||||
import { HeyWorldHandler } from './newsletters/hey-world-handler'
|
||||
import { Browser } from 'puppeteer-core'
|
||||
import { StackOverflowHandler } from './websites/stack-overflow-handler'
|
||||
import { GenericHandler } from './newsletters/generic-handler'
|
||||
|
||||
const validateUrlString = (url: string) => {
|
||||
const u = new URL(url)
|
||||
@ -80,6 +81,7 @@ const newsletterHandlers: ContentHandler[] = [
|
||||
new GhostHandler(),
|
||||
new CooperPressHandler(),
|
||||
new HeyWorldHandler(),
|
||||
new GenericHandler(),
|
||||
]
|
||||
|
||||
export const preHandleContent = async (
|
||||
|
||||
49
packages/content-handler/src/newsletters/generic-handler.ts
Normal file
49
packages/content-handler/src/newsletters/generic-handler.ts
Normal file
@ -0,0 +1,49 @@
|
||||
import { ContentHandler } from '../content-handler'
|
||||
import addressparser from 'addressparser'
|
||||
|
||||
/**
 * Generic newsletter handler that is not tied to a specific sender.
 *
 * Detection relies only on mailing-list headers (`list-post`, `list-id`,
 * `list-unsubscribe`); the sender address is not inspected. The newsletter
 * URL is taken from the `list-post` header when possible, otherwise it is
 * looked up in the email HTML.
 */
export class GenericHandler extends ContentHandler {
  // newsletter url text regex for newsletters that don't have a newsletter header
  NEWSLETTER_URL_TEXT_REGEX =
    /((View|Read)(.*)(email|post)?(.*)(in your browser|online|on (FS|the Web))|Lire en ligne)/i

  constructor() {
    super()
    this.name = 'Generic Newsletter'
  }

  /**
   * Reports whether the email carries any mailing-list header.
   *
   * @param input - The email parts; only `input.headers` is read here.
   * @returns `true` when `list-post`, `list-id` or `list-unsubscribe` is
   *   present with a truthy value.
   */
  async isNewsletter(input: {
    from: string
    html: string
    headers: Record<string, string | string[]>
    dom: Document
  }): Promise<boolean> {
    const postHeader = input.headers['list-post'] || input.headers['list-id']
    const unSubHeader = input.headers['list-unsubscribe']
    return !!postHeader || !!unSubHeader
  }

  /**
   * Looks for a "view online"-style link in the document.
   *
   * Every `<a>` element whose text matches `NEWSLETTER_URL_TEXT_REGEX` is
   * considered; when several match, the last one that has a non-empty
   * `href` wins.
   *
   * @param dom - Parsed email document.
   * @returns The `href` of the matching link, or `undefined` if none has one.
   */
  findNewsletterHeaderHref(dom: Document): string | undefined {
    let res: string | undefined = undefined
    dom.querySelectorAll('a').forEach((e) => {
      if (!e.textContent || !this.NEWSLETTER_URL_TEXT_REGEX.test(e.textContent)) {
        return
      }
      // Only record anchors that actually carry a link, so a matching anchor
      // without an href cannot erase a link found earlier.
      const href = e.getAttribute('href')
      if (href) {
        res = href
      }
    })
    return res
  }

  /**
   * Resolves the newsletter URL for an email.
   *
   * @param headers - Email headers; only `list-post` is read.
   * @param html - Email HTML, used when the header yields no URL.
   * @returns The URL from the `list-post` header when available, otherwise
   *   whatever `findNewsletterUrl(html)` resolves to.
   */
  async parseNewsletterUrl(
    headers: Record<string, string | string[]>,
    html: string
  ): Promise<string | undefined> {
    // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
    // we need to get the real url from the raw url
    const postHeader = headers['list-post']?.toString()
    if (postHeader) {
      // Parse once; an entry with an empty name is treated as "no URL" so we
      // fall back to the HTML instead of returning an empty string.
      const [firstEntry] = addressparser(postHeader)
      if (firstEntry?.name) {
        return firstEntry.name
      }
    }

    return this.findNewsletterUrl(html)
  }
}
|
||||
@ -18,6 +18,7 @@ import { CooperPressHandler } from '../src/newsletters/cooper-press-handler'
|
||||
import { getNewsletterHandler } from '../src'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import { HeyWorldHandler } from '../src/newsletters/hey-world-handler'
|
||||
import { GenericHandler } from '../src/newsletters/generic-handler'
|
||||
|
||||
chai.use(chaiAsPromised)
|
||||
chai.use(chaiString)
|
||||
@ -97,7 +98,7 @@ describe('Newsletter email test', () => {
|
||||
const html = load('./test/data/ttso-newsletter.html')
|
||||
|
||||
await expect(
|
||||
new ContentHandler().parseNewsletterUrl({}, html)
|
||||
new GenericHandler().parseNewsletterUrl({}, html)
|
||||
).to.eventually.equal(url)
|
||||
})
|
||||
})
|
||||
@ -414,7 +415,7 @@ describe('Newsletter email test', () => {
|
||||
|
||||
it('gets the URL from the header', async () => {
|
||||
const html = load('./test/data/ttso-newsletter.html')
|
||||
const url = await new ContentHandler().findNewsletterUrl(html)
|
||||
const url = await new GenericHandler().findNewsletterUrl(html)
|
||||
expect(url).to.startWith('https://ttso.paris/2023-01-31')
|
||||
})
|
||||
})
|
||||
|
||||
Reference in New Issue
Block a user