From 9841ce7f8efc19c22eeecd2422a947b2c83835e3 Mon Sep 17 00:00:00 2001 From: Hongbo Wu Date: Fri, 30 Sep 2022 12:51:00 +0800 Subject: [PATCH] Remove parsing newsletter emails from forwarded emails --- packages/api/src/utils/parser.ts | 144 ------------------------------- 1 file changed, 144 deletions(-) diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index ee62d1050..b071f2478 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -450,150 +450,6 @@ export const parseUrlMetadata = async ( } } -// Attempt to determine if an HTML blob is a newsletter -// based on it's contents. -// TODO: when we consolidate the handlers we could include this -// as a utility method on each one. -export const isProbablyNewsletter = async (html: string): Promise => { - const dom = parseHTML(html).document - const domCopy = parseHTML(dom.documentElement.outerHTML).document - const article = await new Readability(domCopy, { - debug: false, - keepTables: true, - }).parse() - - if (!article || !article.content) { - return false - } - - // substack newsletter emails have tables with a *post-meta class - if (dom.querySelector('table[class$="post-meta"]')) { - return true - } - - // If the article has a header link, and substack icons its probably a newsletter - const href = findNewsletterHeaderHref(dom) - const heartIcon = dom.querySelector( - 'table tbody td span a img[src*="HeartIcon"]' - ) - const recommendIcon = dom.querySelector( - 'table tbody td span a img[src*="RecommendIconRounded"]' - ) - if (href && (heartIcon || recommendIcon)) { - return true - } - - // Check if this is a beehiiv.net newsletter - if (dom.querySelectorAll('img[src*="beehiiv.net"]').length > 0) { - const beehiivUrl = beehiivNewsletterHref(dom) - if (beehiivUrl) { - return true - } - } - - // Check if this is a newsletter from revue - if ( - dom.querySelectorAll('img[src*="getrevue.co"], img[src*="revue.email"]') - .length > 0 - ) { - const getrevueUrl = revueNewsletterHref(dom) - if (getrevueUrl) { - return true - } - } - - // Check if this is a convertkit.com newsletter - return ( - dom.querySelectorAll( - 'img[src*="convertkit.com"], img[src*="convertkit-mail.com"]' - ).length > 0 - ) -} - -const beehiivNewsletterHref = (dom: Document): string | undefined => { - const readOnline = dom.querySelectorAll('table tr td div a[class*="link"]') - let res: string | undefined = undefined - readOnline.forEach((e) => { - if (e.textContent === 'Read Online') { - res = e.getAttribute('href') || undefined - } - }) - return res -} - -const convertkitNewsletterHref = (dom: Document): string | undefined => { - const readOnline = dom.querySelectorAll('table tr td a') - let res: string | undefined = undefined - readOnline.forEach((e) => { - if (e.textContent === 'View this email in your browser') { - res = e.getAttribute('href') || undefined - } - }) - return res -} - -const revueNewsletterHref = (dom: Document): string | undefined => { - const viewOnline = dom.querySelectorAll('table tr td a[target="_blank"]') - let res: string | undefined = undefined - viewOnline.forEach((e) => { - if (e.textContent === 'View online') { - res = e.getAttribute('href') || undefined - } - }) - return res -} - -const findNewsletterHeaderHref = (dom: Document): string | undefined => { - // Substack header links - const postLink = dom.querySelector('h1 a ') - if (postLink) { - return postLink.getAttribute('href') || undefined - } - - // Check if this is a beehiiv.net newsletter - const beehiiv = beehiivNewsletterHref(dom) - if (beehiiv) { - return beehiiv - } - - // Check if this is a revue newsletter - const revue = revueNewsletterHref(dom) - if (revue) { - return revue - } - - // Check if this is a convertkit.com newsletter - const convertkitUrl = convertkitNewsletterHref(dom) - if (convertkitUrl) { - return convertkitUrl - } - - return undefined -} - -// Given an HTML blob tries to find a URL to use for -// a canonical URL. -export const findNewsletterUrl = async ( - html: string -): Promise => { - const dom = parseHTML(html).document - - // Check if this is a substack newsletter - const href = findNewsletterHeaderHref(dom) - if (href) { - // Try to make a HEAD request so we get the redirected URL, since these - // will usually be behind tracking url redirects - return axios({ - method: 'HEAD', - url: href, - }) - .then((res) => res.request.res.responseUrl as string | undefined) - .catch((e) => href) - } - - return undefined -} - export const isProbablyArticle = async ( email: string, subject: string