diff --git a/packages/api/src/utils/parser.ts b/packages/api/src/utils/parser.ts
index c32b539ed..d1846ad51 100644
--- a/packages/api/src/utils/parser.ts
+++ b/packages/api/src/utils/parser.ts
@@ -432,14 +432,67 @@ export const isProbablyNewsletter = (html: string): boolean => {
     return true
   }
 
+  // Check if this is a beehiiv.net newsletter
+  if (dom.document.querySelectorAll('img[src*="beehiiv.net"]').length > 0) {
+    const beehiivUrl = beehiivNewsletterHref(dom.window)
+    if (beehiivUrl) {
+      return true
+    }
+  }
+
   return false
 }
 
+/**
+ * Finds the "Read Online" link of a beehiiv newsletter email.
+ *
+ * Scans anchors matching `table tr td div a[class*="link"]` and returns the
+ * href of the first one whose trimmed text is exactly "Read Online" and whose
+ * href is non-empty.
+ *
+ * @param dom window of the parsed email document
+ * @returns the link target, or undefined when no such link is present
+ */
+const beehiivNewsletterHref = (dom: DOMWindow): string | undefined => {
+  const anchors = dom.document.querySelectorAll(
+    'table tr td div a[class*="link"]'
+  )
+  for (const anchor of Array.from(anchors)) {
+    // Tolerate whitespace around the label
+    if ((anchor.textContent || '').trim() !== 'Read Online') {
+      continue
+    }
+    // A match without an href must not mask a later usable link
+    const href = anchor.getAttribute('href')
+    if (href) {
+      return href
+    }
+  }
+  return undefined
+}
+
+/**
+ * Finds the link to the hosted version of a newsletter email.
+ *
+ * Uses the href of the first `h1 a` element when one exists (Substack header
+ * link); otherwise falls back to the beehiiv "Read Online" link.
+ *
+ * @param dom window of the parsed email document
+ * @returns the link target, or undefined when nothing usable is found
+ */
 const findNewsletterHeaderHref = (dom: DOMWindow): string | undefined => {
+  // Substack header links
   const postLink = dom.document.querySelector('h1 a ')
   if (postLink) {
     return postLink.getAttribute('href') || undefined
   }
+
+  // Otherwise fall back to the beehiiv "Read Online" link
+  const beehiiv = beehiivNewsletterHref(dom)
+  if (beehiiv) {
+    return beehiiv
+  }
+
   return undefined
 }
 
@@ -449,6 +502,8 @@ export const findNewsletterUrl = async (
   html: string
 ): Promise<string | undefined> => {
   const dom = new JSDOM(html).window
+
+  // Look for a link to the hosted post (substack header or beehiiv)
   const href = findNewsletterHeaderHref(dom.window)
   if (href) {
     // Try to make a HEAD request so we get the redirected URL, since these
diff --git a/packages/api/test/utils/data/beehiiv-newsletter.html b/packages/api/test/utils/data/beehiiv-newsletter.html
new file mode 100644
index 000000000..369d42af0
--- /dev/null
+++ b/packages/api/test/utils/data/beehiiv-newsletter.html
@@ -0,0 +1,15 @@
+I talked to a guy that spent $30M on a Beeple
+
+
+
+ + + diff --git a/packages/api/test/utils/parser.test.ts b/packages/api/test/utils/parser.test.ts index 48af3bab3..96acb630a 100644 --- a/packages/api/test/utils/parser.test.ts +++ b/packages/api/test/utils/parser.test.ts @@ -22,15 +22,24 @@ describe('isProbablyNewsletter', () => { const html = load('./test/utils/data/substack-forwarded-welcome-email.html') isProbablyNewsletter(html).should.be.false }) + it('returns true for beehiiv.com newsletter', () => { + const html = load('./test/utils/data/beehiiv-newsletter.html') + isProbablyNewsletter(html).should.be.true + }) }) describe('findNewsletterUrl', async () => { - it('gets the URL from the header if it is a newsletter', async () => { + it('gets the URL from the header if it is a substack newsletter', async () => { const html = load('./test/utils/data/substack-forwarded-newsletter.html') const url = await findNewsletterUrl(html) // Not sure if the redirects from substack expire, this test could eventually fail expect(url).to.startWith('https://newsletter.slowchinese.net/p/companies-that-eat-people-217') }) + it('gets the URL from the header if it is a beehiiv newsletter', async () => { + const html = load('./test/utils/data/beehiiv-newsletter.html') + const url = await findNewsletterUrl(html) + expect(url).to.startWith('https://www.milkroad.com/p/talked-guy-spent-30m-beeple') + }) it('returns undefined if it is not a newsletter', async () => { const html = load('./test/utils/data/substack-forwarded-welcome-email.html') const url = await findNewsletterUrl(html)