Add other newsletter handlers

This commit is contained in:
Hongbo Wu
2022-09-30 12:42:41 +08:00
parent 9b209314a6
commit b00a516737
23 changed files with 488 additions and 227 deletions

View File

@ -1,10 +1,10 @@
import addressparser from 'addressparser'
import { ContentHandler, PreHandleResult } from '../content-handler'
import { parseHTML } from 'linkedom'
export class SubstackHandler extends ContentHandler {
/**
 * Initialises the handler: calls the base constructor, then sets `name` to
 * 'substack' and `defaultUrl` to the Substack site.
 */
constructor() {
  super()
  this.name = 'substack'
  this.defaultUrl = 'https://www.substack.com'
}
@ -38,15 +38,53 @@ export class SubstackHandler extends ContentHandler {
return Promise.resolve(dom)
}
isNewsletter(postHeader: string, from: string, unSubHeader: string): boolean {
return !!postHeader
/**
 * Finds the first anchor nested inside an `h1` element and returns its
 * `href` attribute.
 *
 * @param dom document to search
 * @returns the anchor's `href`, or `undefined` when no such anchor exists or
 *   its `href` attribute is missing or empty
 */
findNewsletterHeaderHref(dom: Document): string | undefined {
  // Substack header links
  const headerAnchor = dom.querySelector('h1 a ')
  const href = headerAnchor ? headerAnchor.getAttribute('href') : null
  // Normalise null / empty string to undefined
  return href || undefined
}
parseNewsletterUrl(postHeader: string, html: string): string | undefined {
/**
 * Decides whether an email looks like a Substack newsletter.
 *
 * A non-empty `postHeader` is accepted immediately. Otherwise the HTML body
 * is parsed and inspected for Substack-specific markup: either a table whose
 * class ends in `post-meta`, or a header link together with a heart or
 * recommend icon.
 *
 * Only `postHeader` and `html` are read; `from` and `unSubHeader` are part of
 * the accepted argument shape but are not used here.
 *
 * @returns `true` when one of the checks above matches, `false` otherwise
 */
async isNewsletter({
  postHeader,
  html,
}: {
  postHeader: string
  from: string
  unSubHeader: string
  html: string
}): Promise<boolean> {
  // An async method already wraps its result in a promise, so plain booleans
  // are returned on every path.
  if (postHeader) {
    return true
  }
  const dom = parseHTML(html).document
  // Substack newsletter emails have tables with a *post-meta class
  if (dom.querySelector('table[class$="post-meta"]')) {
    return true
  }
  // If the article has a header link and Substack icons, it's probably a newsletter
  const href = this.findNewsletterHeaderHref(dom)
  if (!href) {
    // Without a header link the icon lookups cannot change the answer
    return false
  }
  const heartIcon = dom.querySelector(
    'table tbody td span a img[src*="HeartIcon"]'
  )
  const recommendIcon = dom.querySelector(
    'table tbody td span a img[src*="RecommendIconRounded"]'
  )
  return !!(heartIcon || recommendIcon)
}
/**
 * Resolves the URL of the newsletter post.
 *
 * When `postHeader` is non-empty and `addressparser` yields at least one
 * entry, the `name` of the first entry is returned. Otherwise the lookup is
 * delegated to `findNewsletterUrl(html)`, which is defined outside this
 * section.
 *
 * @param postHeader raw post header value; may be empty
 * @param html HTML body of the email, used only for the fallback lookup
 * @returns the URL taken from the header, or whatever the fallback returns
 */
async parseNewsletterUrl(
  postHeader: string,
  html: string
): Promise<string | undefined> {
  // raw SubStack newsletter url is like <https://hongbo130.substack.com/p/tldr>
  // we need to get the real url from the raw url
  if (postHeader) {
    // Parse once and reuse the result instead of calling addressparser twice
    const addresses = addressparser(postHeader)
    if (addresses.length > 0) {
      return addresses[0].name
    }
  }
  // No usable header: fall back to looking for the URL in the HTML body
  return this.findNewsletterUrl(html)
}
}