Remove related-content links from article (e.g. "Read article X")

This commit is contained in:
Thomas Rogers
2023-06-20 10:24:41 +02:00
parent 55fd02b67a
commit c0f34270ee
3 changed files with 29 additions and 2004 deletions

View File

@ -13,6 +13,13 @@ export class TheAtlanticHandler extends ContentHandler {
return u.hostname.endsWith('theatlantic.com')
}
/**
 * Returns the direct children of the article body with related-content
 * modules filtered out.
 *
 * A child is dropped when ANY of its CSS class names starts with
 * `ArticleRelated`, regardless of where that class appears in the `class`
 * attribute. The input element itself is not mutated; callers receive a new
 * array of the surviving children in document order.
 *
 * @param articleContent - Element whose direct children are inspected.
 * @returns The children that are not related-content modules.
 */
removeRelatedContentLinks(articleContent: Element): Node[] {
  const relatedClassPrefix = 'ArticleRelated'

  // Read the raw `class` attribute instead of `className`: `className` is not
  // a plain string on SVG elements, and matching on the whole string only
  // works when the related-content class happens to be listed first.
  const hasRelatedClass = (element: Element): boolean =>
    (element.getAttribute('class') ?? '')
      .split(/\s+/)
      .some((className) => className.startsWith(relatedClassPrefix))

  return Array.from(articleContent.children).filter(
    (child) => !hasRelatedClass(child)
  )
}
unfurlContent(content: Document): Document {
const articleContentSection = content.querySelector(
'[data-event-module="article body"]'
@ -22,11 +29,12 @@ export class TheAtlanticHandler extends ContentHandler {
return content
}
const articleContent = this.removeRelatedContentLinks(articleContentSection)
const divOverArticle = content.createElement('div')
divOverArticle.setAttribute('id', 'prehandled')
divOverArticle.innerHTML += articleContentSection.innerHTML
content.insertBefore(divOverArticle, articleContentSection)
articleContent.forEach((it) => divOverArticle.appendChild(it))
content.insertBefore(divOverArticle, articleContentSection)
articleContentSection.remove()
return content

File diff suppressed because one or more lines are too long

View File

@ -1,6 +1,7 @@
import { TheAtlanticHandler } from '../src/websites/the-atlantic-handler'
import fs from 'fs';
import nock from 'nock'
import { expect } from 'chai'
describe('Testing the atlantic opening', () => {
const load = (path: string): string => {
@ -10,33 +11,42 @@ describe('Testing the atlantic opening', () => {
before(() => {
const html = load('./test/data/the-atlantic-article.html');
nock('https://theatlantic.com').get('/article/').reply(200, html)
nock('https://theatlantic.com').persist().get('/article').reply(200, html)
})
it('should parse the title of the atlantic article.', async () => {
const response = await new TheAtlanticHandler().preHandle(
'https://theatlantic.com/article/'
'https://theatlantic.com/article'
);
// We grab the title from the document.
expect(response.title).not.toBeFalsy()
expect(response.title).not.to.be.undefined
})
it('should remove the article section, and replace it with a parseable div', async () => {
const response = await new TheAtlanticHandler().preHandle(
'https://theatlantic.com/article/'
'https://theatlantic.com/article'
);
// We grab the title from the document.
expect(response.dom?.querySelector('[data-event-module="article body"]')).toBeEmptyDOMElement()
// This should not exist
expect(response.dom?.querySelector('[data-event-module="article body"]')).to.be.null
});
it ('should append a new div, and add the article content inside', async() => {
const response = await new TheAtlanticHandler().preHandle(
'https://theatlantic.com/article/'
'https://theatlantic.com/article'
);
// The replacement div with id "prehandled" should be present in the document.
expect(response.dom?.getElementById('prehandled')).not.toBeEmptyDOMElement()
expect(response.dom?.getElementById('prehandled')).not.to.be.null
})
it ('should remove any related content links.', async() => {
const response = await new TheAtlanticHandler().preHandle(
'https://theatlantic.com/article'
);
// No related-content modules should remain in the document.
expect(response.dom?.getElementsByClassName('ArticleRelatedContentModule_root__BBa6g').length).to.eql(0)
})
})