Add Ars Technica handler for multipage articles

This commit is contained in:
Thomas Rogers
2023-08-04 20:00:03 +02:00
parent 3d7691c068
commit 49c1976ac9
6 changed files with 2710 additions and 0 deletions

View File

@ -37,6 +37,7 @@ import { WeixinQqHandler } from './websites/weixin-qq-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
const validateUrlString = (url: string): boolean => {
const u = new URL(url)
@ -57,6 +58,7 @@ const validateUrlString = (url: string): boolean => {
}
const contentHandlers: ContentHandler[] = [
new ArsTechnicaHandler(),
new TheAtlanticHandler(),
new AppleNewsHandler(),
new BloombergHandler(),

View File

@ -0,0 +1,87 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
/**
 * Some of the content on Ars Technica is split over several pages.
 * If this is the case we should unfurl the entire article into one.
 */
export class ArsTechnicaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ArsTechnica'
  }

  /** Handle any Ars Technica URL (including subdomains). */
  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('arstechnica.com')
  }

  /** An article spans multiple pages when a page-number nav element is present. */
  hasMultiplePages(document: Document): boolean {
    // querySelectorAll always returns a (possibly empty) NodeList, never null,
    // so no optional chaining is needed; use strict inequality.
    return document.querySelectorAll('nav.page-numbers').length !== 0
  }

  /** Fetch a URL with a plain GET (no JavaScript) and parse it into a DOM. */
  async grabContentFromUrl(url: string): Promise<Document> {
    const response = await axios.get(url)
    const data = response.data as string
    return parseHTML(data).document
  }

  /**
   * Fetch one page of the article and return the child nodes of its
   * article body, or an empty array when no article body is found.
   * Note: the return type is ChildNode[] — the previous Document[]
   * annotation did not match the actual runtime value.
   */
  async extractArticleContentsFromLink(url: string): Promise<ChildNode[]> {
    const dom = await this.grabContentFromUrl(url)
    const articleContent = dom.querySelector('[itemprop="articleBody"]')
    return Array.from(articleContent?.childNodes ?? [])
  }

  /**
   * Fetch every follow-on page linked from the page-number nav, append
   * each page's article content to the first page's article body, and
   * remove the nav itself. Returns the (mutated) first-page document.
   */
  async expandLinksAndCombine(document: Document): Promise<Document> {
    const pageNumbers = document.querySelector('nav.page-numbers')
    const articleBody = document.querySelector('[itemprop="articleBody"]')
    if (!pageNumbers || !articleBody) {
      // We shouldn't ever really get here, but sometimes weird things happen.
      return document
    }
    const pageLinkNodes = pageNumbers.querySelectorAll('a')
    // Drop the trailing "Next" link, as it would duplicate some content.
    // Array.from never returns null, so no optional chaining is needed.
    const pageLinks = Array.from(pageLinkNodes)
      .slice(0, -1)
      .map(({ href }) => href)
    const pageContents = await Promise.all(
      pageLinks.map((link) => this.extractArticleContentsFromLink(link))
    )
    for (const articleContents of pageContents) {
      // We place all the content in a span to indicate that a page has been parsed.
      const span = document.createElement('SPAN')
      span.className = 'nextPageContents'
      span.append(...articleContents)
      articleBody.append(span)
    }
    pageNumbers.remove()
    return document
  }

  /**
   * Retrieve the article without JavaScript using a plain GET, expanding
   * multi-page articles into a single document when necessary.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    const dom = await this.grabContentFromUrl(url)
    const expandedDom = this.hasMultiplePages(dom)
      ? await this.expandLinksAndCombine(dom)
      : dom
    return {
      content: expandedDom.body.outerHTML,
      // Title is read from the first page either way.
      title: dom.title,
      dom: expandedDom,
    }
  }
}

View File

@ -0,0 +1,84 @@
import { ArsTechnicaHandler } from '../src/websites/ars-technica-handler'
import fs from 'fs'
import nock from 'nock'
import { expect } from 'chai'
import { parseHTML } from 'linkedom'
// Exercises ArsTechnicaHandler.preHandle against recorded multi-page fixtures,
// with nock intercepting the follow-on page requests.
describe('Testing parsing multi-page articles from arstechnica.', () => {
  let originalArticle: Document | undefined
  let htmlPg1: string | null
  let htmlPg2: string | null
  let htmlPg3: string | null

  const load = (path: string): string => {
    return fs.readFileSync(path, 'utf8')
  }

  before(() => {
    htmlPg1 = load('./test/data/ars-multipage/ars-technica-page-1.html')
    htmlPg2 = load('./test/data/ars-multipage/ars-technica-page-2.html')
    htmlPg3 = load('./test/data/ars-multipage/ars-technica-page-3.html')
    originalArticle = parseHTML(htmlPg1).document
  })

  beforeEach(() => {
    nock('https://arstechnica.com').get('/article/').reply(200, htmlPg1!)
    nock('https://arstechnica.com').get('/article/2/').reply(200, htmlPg2!)
    nock('https://arstechnica.com').get('/article/3/').reply(200, htmlPg3!)
  })

  afterEach(() => {
    nock.cleanAll()
  })

  it('should parse the title of the ars technica article.', async () => {
    const response = await new ArsTechnicaHandler().preHandle(
      'https://arstechnica.com/article/'
    )
    // We grab the title from the document.
    expect(response.title).not.to.be.undefined
    expect(response.title).to.equal(
      'Whats going on with the reports of a room-temperature superconductor? | Ars Technica'
    )
  })

  it('should remove the navigation links', async () => {
    const response = await new ArsTechnicaHandler().preHandle(
      'https://arstechnica.com/article/'
    )
    // The nav exists in the raw fixture but must be stripped after unfurling.
    expect(originalArticle?.querySelector('nav.page-numbers')).not.to.be.null
    expect(response.dom?.querySelectorAll('nav.page-numbers').length).to.equal(0)
  })

  it('should append all new content into the main article', async () => {
    const response = await new ArsTechnicaHandler().preHandle(
      'https://arstechnica.com/article/'
    )
    // We name the span class to ensure we can validate that it has been inserted.
    expect(
      originalArticle?.getElementsByClassName('nextPageContents')?.length || 0
    ).to.equal(0)
    expect(
      response.dom?.getElementsByClassName('nextPageContents')?.length || 0
    ).not.to.equal(0)
  })

  it('should remove any related content links.', async () => {
    const response = await new ArsTechnicaHandler().preHandle(
      'https://arstechnica.com/article/'
    )
    // This exists in the HTML, but we remove it when preparsing.
    expect(
      response.dom?.getElementsByClassName(
        'ArticleRelatedContentModule_root__BBa6g'
      ).length
    ).to.eql(0)
  })
})

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long