Merge pull request #2389 from Podginator/feat/atlantic-handler
Add The Atlantic handler to avoid paywall and correctly format
This commit is contained in:
@ -36,6 +36,7 @@ import { TDotCoHandler } from './websites/t-dot-co-handler'
|
||||
import { WeixinQqHandler } from './websites/weixin-qq-handler'
|
||||
import { WikipediaHandler } from './websites/wikipedia-handler'
|
||||
import { YoutubeHandler } from './websites/youtube-handler'
|
||||
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
|
||||
|
||||
const validateUrlString = (url: string): boolean => {
|
||||
const u = new URL(url)
|
||||
@ -56,6 +57,7 @@ const validateUrlString = (url: string): boolean => {
|
||||
}
|
||||
|
||||
const contentHandlers: ContentHandler[] = [
|
||||
new TheAtlanticHandler(),
|
||||
new AppleNewsHandler(),
|
||||
new BloombergHandler(),
|
||||
new DerstandardHandler(),
|
||||
|
||||
@ -0,0 +1,59 @@
|
||||
import axios from 'axios'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
|
||||
/**
 * Content handler for articles hosted on theatlantic.com.
 *
 * The article is fetched with a plain HTTP GET (no JavaScript execution) and
 * the article body is repackaged into a single `<div id="prehandled">` with
 * the audio player and "related content" modules stripped out.
 */
export class TheAtlanticHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'The Atlantic'
  }

  /**
   * Decides whether this handler should process the given URL.
   *
   * @param url - Absolute URL of the page; `new URL` throws if it is not parseable.
   * @returns `true` for `theatlantic.com` and any of its subdomains.
   */
  shouldPreHandle(url: string): boolean {
    const { hostname } = new URL(url)
    // Match the apex domain or a real subdomain only. A bare suffix check
    // would also accept unrelated hosts such as `nottheatlantic.com`.
    return (
      hostname === 'theatlantic.com' || hostname.endsWith('.theatlantic.com')
    )
  }

  /**
   * Returns the direct children of the article body that are not
   * "related content" modules.
   *
   * @param articleContent - The article body element.
   * @returns The children to keep, in document order.
   */
  removeRelatedContentLinks(articleContent: Element): Node[] {
    // Inspect every class token rather than the raw `className` string so the
    // module is still detected when its class is not listed first.
    return Array.from(articleContent.children).filter(
      (child) =>
        !Array.from(child.classList).some((token) =>
          token.startsWith('ArticleRelated')
        )
    )
  }

  /**
   * Rewrites the document in place: removes the audio player and replaces the
   * article body section with a `<div id="prehandled">` holding the article
   * content minus related-content modules.
   *
   * @param content - Parsed article document; it is mutated.
   * @returns The same document instance. If no article body section is found
   *   only the audio player removal is applied.
   */
  unfurlContent(content: Document): Document {
    const articleContentSection = content.querySelector(
      '[data-event-module="article body"]'
    )

    // Remove the audio player.
    content.querySelector('[data-event-module="audio player"]')?.remove()

    if (!articleContentSection) {
      return content
    }

    const articleContent = this.removeRelatedContentLinks(articleContentSection)
    const divOverArticle = content.createElement('div')
    divOverArticle.setAttribute('id', 'prehandled')
    // appendChild moves each kept node out of the original section.
    articleContent.forEach((it) => divOverArticle.appendChild(it))

    // Swap the section for the new div at the section's own position. The
    // section is nested inside the body, so inserting via the Document itself
    // (which is not the section's parent) is invalid per the DOM spec.
    articleContentSection.replaceWith(divOverArticle)

    return content
  }

  /**
   * Downloads the article and returns the cleaned-up content.
   *
   * @param url - URL of the article to fetch.
   * @returns The serialized body, the document title and the edited DOM.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without JavaScript enabled using a GET request.
    const response = await axios.get(url)
    const data = response.data as string
    const dom = parseHTML(data).document
    // unfurlContent mutates and returns `dom`, so both names refer to the same document.
    const editedDom = this.unfurlContent(dom)

    return {
      content: editedDom.body.outerHTML,
      title: dom.title,
      dom: editedDom,
    }
  }
}
|
||||
File diff suppressed because one or more lines are too long
52
packages/content-handler/test/the-atlantic-handler.test.ts
Normal file
52
packages/content-handler/test/the-atlantic-handler.test.ts
Normal file
@ -0,0 +1,52 @@
|
||||
import { TheAtlanticHandler } from '../src/websites/the-atlantic-handler'
|
||||
import fs from 'fs';
|
||||
import nock from 'nock'
|
||||
import { expect } from 'chai'
|
||||
|
||||
// Exercises TheAtlanticHandler.preHandle against a saved article fixture
// served through a nock interceptor, so no real network request is made.
describe('Testing the atlantic opening', () => {
  const articleUrl = 'https://theatlantic.com/article'

  const load = (path: string): string => {
    return fs.readFileSync(path, 'utf8')
  }

  // Runs the handler against the mocked article URL.
  const preHandleArticle = () => new TheAtlanticHandler().preHandle(articleUrl)

  before(() => {
    const html = load('./test/data/the-atlantic-article.html')
    // persist() keeps the interceptor alive for every test in this suite.
    nock('https://theatlantic.com').persist().get('/article').reply(200, html)
  })

  after(() => {
    // Persisted interceptors are never consumed, so remove them explicitly to
    // avoid leaking the mock into other test files run in the same process.
    nock.cleanAll()
  })

  it('should parse the title of the atlantic article.', async () => {
    const response = await preHandleArticle()

    // We grab the title from the document.
    expect(response.title).not.to.be.undefined
  })

  it('should remove the article section, and replace it with a parseable div', async () => {
    const response = await preHandleArticle()

    // This should not exist
    expect(response.dom?.querySelector('[data-event-module="article body"]')).to
      .be.null
  })

  it('should append a new div, and add the article content inside', async () => {
    const response = await preHandleArticle()

    // We name the div to ensure we can validate that it has been inserted.
    expect(response.dom?.getElementById('prehandled')).not.to.be.null
  })

  it('should remove any related content links.', async () => {
    const response = await preHandleArticle()

    // This exists in the HTML, but we remove it when preparsing.
    expect(
      response.dom?.getElementsByClassName(
        'ArticleRelatedContentModule_root__BBa6g'
      ).length
    ).to.eql(0)
  })
})
|
||||
Reference in New Issue
Block a user