diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts index 595960ac2..8cb1aeade 100644 --- a/packages/api/src/utils/parser.ts +++ b/packages/api/src/utils/parser.ts @@ -388,7 +388,6 @@ export const isProbablyNewsletter = (html: string): boolean => { }).parse() if (!article || !article.content) { - console.log('no article content') return false } @@ -397,30 +396,45 @@ export const isProbablyNewsletter = (html: string): boolean => { return true } + // If the article has a header link and Substack icons, it's probably a newsletter + const href = findNewsletterHeaderHref(dom.window) + const heartIcon = dom.document.querySelector( + 'table tbody td span a img[src*="HeartIcon"]' + ) + const recommendIcon = dom.document.querySelector( + 'table tbody td span a img[src*="RecommendIconRounded"]' + ) + if (href && (heartIcon || recommendIcon)) { + return true + } + return false } +const findNewsletterHeaderHref = (dom: DOMWindow): string | undefined => { + const postLink = dom.document.querySelector('h1 a ') + if (postLink) { + return postLink.getAttribute('href') || undefined + } + return undefined +} + // Given an HTML blob tries to find a URL to use for // a canonical URL. export const findNewsletterUrl = async ( html: string ): Promise<string | undefined> => { const dom = new JSDOM(html).window - - // If there is an

res.request.res.responseUrl as string | undefined) - .catch((e) => href) - } + const href = findNewsletterHeaderHref(dom.window) + if (href) { + // Try to make a HEAD request so we get the redirected URL, since these + // will usually be behind tracking url redirects + return axios({ + method: 'HEAD', + url: href, + }) + .then((res) => res.request.res.responseUrl as string | undefined) + .catch((e) => href) } return undefined diff --git a/packages/api/test/utils/data/substack-private-forwarded-newsletter.html b/packages/api/test/utils/data/substack-private-forwarded-newsletter.html new file mode 100644 index 000000000..071bfeb77 --- /dev/null +++ b/packages/api/test/utils/data/substack-private-forwarded-newsletter.html @@ -0,0 +1,2 @@ +


---------- Forwarded message ---------
From: giggs <darkgiggsxx@gmail.com>
Date: Wed, Mar 2, 2022 at 5:29 PM
Subject: Fwd: The German Retreat From Nuclear Power
To: Radek <radoslaw.jurga@gmail.com>



---------- Forwarded message ---------
De : Bismarck Analysis <bismarck@substack.com>
Date: mer. 2 mars 2022 à 15:02
Subject: The German Retreat From Nuclear Power
To: <darkgiggsxx@gmail.com>


Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories. ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌ ‌

The German Retreat From Nuclear Power

Germany's economic priorities are torn between industrial growth and energy degrowth. As its last nuclear plants are shut down, the choice is between relying on fossil fuels or closing factories.

Isar Nuclear Power Plant near Landshut, Germany in 2016. The Isar station is scheduled to be shut down by the end of 2022. Photo by Dennis Hansch. Source.
+
\ No newline at end of file diff --git a/packages/api/test/utils/parser.test.ts b/packages/api/test/utils/parser.test.ts index 2a2fedcfb..d4e6a64dc 100644 --- a/packages/api/test/utils/parser.test.ts +++ b/packages/api/test/utils/parser.test.ts @@ -14,6 +14,10 @@ describe('isProbablyNewsletter', () => { const html = load('./test/utils/data/substack-forwarded-newsletter.html') isProbablyNewsletter(html).should.be.true }) + it('returns true for private forwarded substack newsletter', () => { + const html = load('./test/utils/data/substack-private-forwarded-newsletter.html') + isProbablyNewsletter(html).should.be.true + }) it('returns false for substack welcome email', () => { const html = load('./test/utils/data/substack-forwarded-welcome-email.html') isProbablyNewsletter(html).should.be.false @@ -33,3 +37,4 @@ describe('findNewsletterUrl', async () => { expect(url).to.be.undefined }) }) +