Attempt to pull URLs for probable newsletter emails out of content

This commit is contained in:
Jackson Harper
2022-03-02 23:09:10 -08:00
parent 21329949e5
commit 65ce8353dc
5 changed files with 46 additions and 21 deletions

View File

@ -4,7 +4,7 @@ import { sendEmail } from '../../utils/sendEmail'
import { analytics } from '../../utils/analytics'
import { getNewsletterEmail } from '../../services/newsletters'
import { env } from '../../env'
import { isProbablyNewsletter } from '../../utils/parser'
import { findNewsletterUrl, isProbablyNewsletter } from '../../utils/parser'
import { saveNewsletterEmail } from '../../services/save_newsletter_email'
interface ForwardEmailMessage {
@ -57,7 +57,7 @@ export function emailsServiceRouter() {
title: data.subject,
content: data.html,
author: data.from,
url: 'https://omnivore.app/no_url',
url: (await findNewsletterUrl(data.html)) || 'https://omnivore.app/no_url',
})
res.status(200).send('Newsletter')
return

View File

@ -399,3 +399,27 @@ export const isProbablyNewsletter = (html: string): boolean => {
return false
}
/**
 * Given an HTML blob, tries to find a URL to use as the canonical URL.
 *
 * The first anchor nested inside an `<h1>` is treated as the link to the
 * post. A HEAD request is made to that link so the redirected URL can be
 * used, since these links will usually sit behind tracking redirects.
 *
 * @param html - HTML content to search.
 * @returns The URL the HEAD request ended up at; the raw `href` when the
 *   request fails or does not report a final URL; `undefined` when there is
 *   no `<h1>` link or its `href` is not an absolute http(s) URL.
 */
export const findNewsletterUrl = async (
  html: string
): Promise<string | undefined> => {
  const { document } = new JSDOM(html).window

  // If there is an <h1> element containing a link, use that link.
  const href = document.querySelector('h1 a')?.getAttribute('href')?.trim()

  // The href comes straight out of the supplied HTML: only absolute http(s)
  // links are requested or returned (this skips mailto:, javascript:,
  // relative links and empty values).
  if (!href || !/^https?:\/\//i.test(href)) {
    return undefined
  }

  try {
    const res = await axios({
      method: 'HEAD',
      url: href,
      // Bound the lookup so an unresponsive host cannot stall the caller.
      timeout: 5000,
    })

    // Final URL as reported on the underlying response object after
    // redirects were followed; it can be absent, in which case fall back
    // to the original link rather than returning undefined.
    const redirectedUrl: unknown = res.request?.res?.responseUrl
    return typeof redirectedUrl === 'string' && redirectedUrl.length > 0
      ? redirectedUrl
      : href
  } catch {
    // Best effort: if the request fails, the unresolved link is still
    // better than no URL at all.
    return href
  }
}

View File

@ -94,6 +94,7 @@ const RESERVED_NAMES = new Set([
'mine',
'mis',
'news',
'no_url',
'oauth',
'oauth_clients',
'offers',

View File

@ -3,15 +3,8 @@ import { expect } from 'chai'
import 'chai/register-should'
import {
createTestUser,
createUserWithoutProfile,
deleteTestUser,
getProfile,
} from '../db'
import { createGroup } from '../../src/services/create_group'
import {
getUserFollowers,
getUserFollowing,
} from '../../src/services/followers'
import { createNewsletterEmail } from '../../src/services/newsletters'
import { saveNewsletterEmail } from '../../src/services/save_newsletter_email'
import { getRepository } from 'typeorm'
@ -23,14 +16,6 @@ describe('saveNewsletterEmail', () => {
await deleteTestUser(username)
})
interface NewsletterMessage {
email: string
content: string
url: string
title: string
author: string
}
it('adds the newsletter to the library', async () => {
const user = await createTestUser(username)
const email = await createNewsletterEmail(user.id)

View File

@ -3,12 +3,13 @@ import { expect } from 'chai'
import 'chai/register-should'
import { JSDOM } from 'jsdom'
import fs from 'fs'
import { isProbablyNewsletter } from '../../src/utils/parser'
import { findNewsletterUrl, isProbablyNewsletter } from '../../src/utils/parser'
/**
 * Reads the file at `path` synchronously and returns its contents decoded
 * as UTF-8 text.
 */
const load = (path: string): string =>
  fs.readFileSync(path, { encoding: 'utf8' })
describe('isProbablyNewsletter', () => {
const load = (path: string): string => {
return fs.readFileSync(path, 'utf8')
}
it('returns true for substack newsletter', () => {
const html = load('./test/utils/data/substack-forwarded-newsletter.html')
isProbablyNewsletter(html).should.be.true
@ -18,3 +19,17 @@ describe('isProbablyNewsletter', () => {
isProbablyNewsletter(html).should.be.false
})
})
// The suite callback is deliberately NOT async: Mocha runs `describe`
// callbacks synchronously to collect tests and does not wait on a returned
// promise. Only the individual `it` callbacks are async.
describe('findNewsletterUrl', () => {
  it('gets the URL from the header if it is a newsletter', async () => {
    const html = load('./test/utils/data/substack-forwarded-newsletter.html')
    const url = await findNewsletterUrl(html)
    // findNewsletterUrl makes a real HEAD request to resolve the redirect.
    // Not sure if the redirects from substack expire, so this test could
    // eventually fail.
    expect(url).to.startWith('https://newsletter.slowchinese.net/p/companies-that-eat-people-217')
  })

  it('returns undefined if it is not a newsletter', async () => {
    const html = load('./test/utils/data/substack-forwarded-welcome-email.html')
    const url = await findNewsletterUrl(html)
    expect(url).to.be.undefined
  })
})