Remove parsing newsletter emails from forwarded emails

This commit is contained in:
Hongbo Wu
2022-09-30 12:51:00 +08:00
parent b00a516737
commit 9841ce7f8e

View File

@ -450,150 +450,6 @@ export const parseUrlMetadata = async (
}
}
// Heuristically decides whether an HTML blob is a newsletter, based on its
// contents (markup emitted by substack, beehiiv, revue and convertkit).
// TODO: when we consolidate the handlers we could include this
// as a utility method on each one.
export const isProbablyNewsletter = async (html: string): Promise<boolean> => {
  const dom = parseHTML(html).document
  const hasMatch = (selector: string): boolean =>
    dom.querySelectorAll(selector).length > 0

  // Readability is run against a re-parsed copy of the document; every
  // selector check below runs against `dom` itself.
  const readabilityInput = parseHTML(dom.documentElement.outerHTML).document
  const parsed = await new Readability(readabilityInput, {
    debug: false,
    keepTables: true,
  }).parse()
  // Without readable content there is nothing to classify.
  if (!(parsed && parsed.content)) {
    return false
  }

  // Substack: emails carry a table whose class ends with "post-meta".
  if (dom.querySelector('table[class$="post-meta"]') !== null) {
    return true
  }

  // Substack: a header link combined with one of the substack icons.
  const headerHref = findNewsletterHeaderHref(dom)
  const substackIconSelectors = [
    'table tbody td span a img[src*="HeartIcon"]',
    'table tbody td span a img[src*="RecommendIconRounded"]',
  ]
  if (
    headerHref &&
    substackIconSelectors.some((selector) => dom.querySelector(selector) !== null)
  ) {
    return true
  }

  // beehiiv: beehiiv.net hosted images plus a "Read Online" link.
  if (hasMatch('img[src*="beehiiv.net"]') && beehiivNewsletterHref(dom)) {
    return true
  }

  // revue: revue hosted images plus a "View online" link.
  if (
    hasMatch('img[src*="getrevue.co"], img[src*="revue.email"]') &&
    revueNewsletterHref(dom)
  ) {
    return true
  }

  // convertkit: hosted images alone are treated as sufficient.
  return hasMatch('img[src*="convertkit.com"], img[src*="convertkit-mail.com"]')
}
/**
 * Finds the "Read Online" link of a beehiiv newsletter email.
 *
 * The link text is compared after trimming surrounding whitespace, so anchors
 * whose markup contains padding or line breaks still match. Matching anchors
 * without a usable `href` are ignored instead of discarding an earlier result.
 * When several anchors qualify, the last one in document order wins.
 *
 * @param dom parsed email document
 * @returns the link target, or `undefined` when no qualifying link exists
 */
const beehiivNewsletterHref = (dom: Document): string | undefined => {
  const anchors = dom.querySelectorAll('table tr td div a[class*="link"]')
  let found: string | undefined
  anchors.forEach((anchor) => {
    if ((anchor.textContent || '').trim() !== 'Read Online') {
      return
    }
    const href = anchor.getAttribute('href')
    if (href) {
      found = href
    }
  })
  return found
}
/**
 * Finds the "View this email in your browser" link of a convertkit
 * newsletter email.
 *
 * The link text is compared after trimming surrounding whitespace, so anchors
 * whose markup contains padding or line breaks still match. Matching anchors
 * without a usable `href` are ignored instead of discarding an earlier result.
 * When several anchors qualify, the last one in document order wins.
 *
 * @param dom parsed email document
 * @returns the link target, or `undefined` when no qualifying link exists
 */
const convertkitNewsletterHref = (dom: Document): string | undefined => {
  const anchors = dom.querySelectorAll('table tr td a')
  let found: string | undefined
  anchors.forEach((anchor) => {
    if ((anchor.textContent || '').trim() !== 'View this email in your browser') {
      return
    }
    const href = anchor.getAttribute('href')
    if (href) {
      found = href
    }
  })
  return found
}
/**
 * Finds the "View online" link of a revue newsletter email.
 *
 * The link text is compared after trimming surrounding whitespace, so anchors
 * whose markup contains padding or line breaks still match. Matching anchors
 * without a usable `href` are ignored instead of discarding an earlier result.
 * When several anchors qualify, the last one in document order wins.
 *
 * @param dom parsed email document
 * @returns the link target, or `undefined` when no qualifying link exists
 */
const revueNewsletterHref = (dom: Document): string | undefined => {
  const anchors = dom.querySelectorAll('table tr td a[target="_blank"]')
  let found: string | undefined
  anchors.forEach((anchor) => {
    if ((anchor.textContent || '').trim() !== 'View online') {
      return
    }
    const href = anchor.getAttribute('href')
    if (href) {
      found = href
    }
  })
  return found
}
// Looks for the link that points at the online version of a newsletter.
// A link inside an <h1> (substack style header) takes precedence: when such
// an anchor exists its href is the answer, even if that href is missing.
// Otherwise the provider specific lookups are tried in order:
// beehiiv, revue, convertkit.
const findNewsletterHeaderHref = (dom: Document): string | undefined => {
  const headerAnchor = dom.querySelector('h1 a ')
  if (headerAnchor !== null) {
    const headerHref = headerAnchor.getAttribute('href')
    return headerHref ? headerHref : undefined
  }

  // Each lookup yields a non-empty string or undefined, so the first
  // truthy result wins and the chain ends in undefined when none match.
  return (
    beehiivNewsletterHref(dom) ||
    revueNewsletterHref(dom) ||
    convertkitNewsletterHref(dom) ||
    undefined
  )
}
// Tries to derive a canonical URL for a newsletter from its HTML blob.
// Resolves to undefined when no newsletter link can be found in the markup.
export const findNewsletterUrl = async (
  html: string
): Promise<string | undefined> => {
  const document = parseHTML(html).document
  const headerHref = findNewsletterHeaderHref(document)
  if (!headerHref) {
    return undefined
  }

  // The link is usually a tracking redirect, so issue a HEAD request and
  // report the URL the request finally landed on.
  try {
    const response = await axios({
      method: 'HEAD',
      url: headerHref,
    })
    return response.request.res.responseUrl as string | undefined
  } catch {
    // Any failure falls back to the link exactly as found in the email.
    return headerHref
  }
}
export const isProbablyArticle = async (
email: string,
subject: string