New function to determine if an HTML blob is probably a newsletter based on its content

This commit is contained in:
Jackson Harper
2022-03-02 16:31:15 -08:00
parent ff871c35bc
commit f7f83fe080
4 changed files with 48 additions and 0 deletions

View File

@ -0,0 +1,25 @@
import { Readability } from '@omnivore/readability';
import { DOMWindow } from 'jsdom'
/**
 * Attempt to determine if an HTML blob is a newsletter based on its contents.
 *
 * The page must first be parseable by Readability into an article with
 * content; after that, only Substack-style markup is recognised
 * (an `.email-body-container` element containing `.post-meta` or `.post-cta`).
 *
 * @param dom - The window of the already-parsed HTML document to inspect.
 * @returns `true` when the document looks like a newsletter, `false` otherwise.
 */
export const isProbablyNewsletter = (dom: DOMWindow): boolean => {
  // Parse the document that was passed in, not the ambient global `document`
  // (which does not exist when running under Node). Readability modifies the
  // DOM it is handed while parsing, so give it a deep clone: this keeps the
  // selector checks below running against the original markup and leaves the
  // caller's DOM untouched.
  const documentClone = dom.document.cloneNode(true) as Document
  const article = new Readability(documentClone, {
    debug: false,
    keepTables: true,
  }).parse()

  if (!article || !article.content) {
    return false
  }

  // Maybe it is a substack newsletter
  const body = dom.document.querySelector('.email-body-container')
  return body?.querySelector('.post-meta, .post-cta') != null
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,21 @@
import 'mocha'
import { expect } from 'chai'
import 'chai/register-should'
import { JSDOM } from 'jsdom'
import fs from 'fs'
import { isProbablyNewsletter } from '../../src/utils/parser'
// Exercises isProbablyNewsletter against forwarded Substack emails stored as
// HTML fixtures on disk.
describe('isProbablyNewsletter', () => {
  // Reads an HTML fixture from disk and parses it into a JSDOM instance.
  function parseFixture(fixturePath: string): JSDOM {
    return new JSDOM(fs.readFileSync(fixturePath, 'utf8'))
  }

  it('returns true for substack newsletter', () => {
    const { window } = parseFixture('./test/utils/data/substack-forwarded-newsletter.html')
    isProbablyNewsletter(window).should.be.true
  })

  it('returns false for substack welcome email', () => {
    const { window } = parseFixture('./test/utils/data/substack-forwarded-welcome-email.html')
    isProbablyNewsletter(window).should.be.false
  })
})