Merge pull request #2608 from Podginator/feat/Ars-technica-handler
Ars Technica Multipage handling
This commit is contained in:
@@ -37,6 +37,7 @@ import { WeixinQqHandler } from './websites/weixin-qq-handler'
|
||||
import { WikipediaHandler } from './websites/wikipedia-handler'
|
||||
import { YoutubeHandler } from './websites/youtube-handler'
|
||||
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
|
||||
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
|
||||
|
||||
const validateUrlString = (url: string): boolean => {
|
||||
const u = new URL(url)
|
||||
@@ -57,6 +58,7 @@ const validateUrlString = (url: string): boolean => {
|
||||
}
|
||||
|
||||
const contentHandlers: ContentHandler[] = [
|
||||
new ArsTechnicaHandler(),
|
||||
new TheAtlanticHandler(),
|
||||
new AppleNewsHandler(),
|
||||
new BloombergHandler(),
|
||||
|
||||
@@ -0,0 +1,86 @@
|
||||
import axios from 'axios'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
|
||||
/**
|
||||
* Some of the content on Ars Technica is split over several pages.
|
||||
* If this is the case we should unfurl the entire article into one. l
|
||||
*/
|
||||
export class ArsTechnicaHandler extends ContentHandler {
|
||||
constructor() {
|
||||
super()
|
||||
this.name = 'ArsTechnica'
|
||||
}
|
||||
|
||||
shouldPreHandle(url: string): boolean {
|
||||
const u = new URL(url)
|
||||
return u.hostname.endsWith('arstechnica.com')
|
||||
}
|
||||
|
||||
hasMultiplePages(document: Document): boolean {
|
||||
return document.querySelectorAll('nav.page-numbers')?.length != 0
|
||||
}
|
||||
|
||||
async grabContentFromUrl(url: string): Promise<Document> {
|
||||
const response = await axios.get(url)
|
||||
const data = response.data as string
|
||||
return parseHTML(data).document
|
||||
}
|
||||
|
||||
async extractArticleContentsFromLink(url: string): Promise<Document[]> {
|
||||
const dom = await this.grabContentFromUrl(url)
|
||||
const articleContent = dom.querySelector('[itemprop="articleBody"]')
|
||||
return [].slice.call(articleContent?.childNodes || [])
|
||||
}
|
||||
|
||||
async expandLinksAndCombine(document: Document): Promise<Document> {
|
||||
const pageNumbers = document.querySelector('nav.page-numbers')
|
||||
const articleBody = document.querySelector('[itemprop="articleBody"]')
|
||||
|
||||
if (!pageNumbers || !articleBody) {
|
||||
// We shouldn't ever really get here, but sometimes weird things happen.
|
||||
return document
|
||||
}
|
||||
|
||||
const pageLinkNodes = pageNumbers.querySelectorAll('a')
|
||||
// Remove the "Next" Link, as it will duplicate some content.
|
||||
const pageLinks =
|
||||
Array.from(pageLinkNodes)
|
||||
?.slice(0, pageLinkNodes.length - 1)
|
||||
?.map(({ href }) => href) ?? []
|
||||
|
||||
const pageContents = await Promise.all(
|
||||
pageLinks.map(this.extractArticleContentsFromLink.bind(this))
|
||||
)
|
||||
|
||||
for (const articleContents of pageContents) {
|
||||
// We place all the content in a span to indicate that a page has been parsed.
|
||||
const span = document.createElement('SPAN')
|
||||
span.className = 'nextPageContents'
|
||||
span.append(...articleContents)
|
||||
articleBody.append(span)
|
||||
}
|
||||
pageNumbers.remove()
|
||||
|
||||
return document
|
||||
}
|
||||
|
||||
async preHandle(url: string): Promise<PreHandleResult> {
|
||||
// We simply retrieve the article without Javascript enabled using a GET command.
|
||||
const dom = await this.grabContentFromUrl(url)
|
||||
if (!this.hasMultiplePages(dom)) {
|
||||
return {
|
||||
content: dom.body.outerHTML,
|
||||
title: dom.title,
|
||||
dom,
|
||||
}
|
||||
}
|
||||
|
||||
const expandedDom = await this.expandLinksAndCombine(dom)
|
||||
return {
|
||||
content: expandedDom.body.outerHTML,
|
||||
title: dom.title,
|
||||
dom: expandedDom,
|
||||
}
|
||||
}
|
||||
}
|
||||
82
packages/content-handler/test/ars-technica.test.ts
Normal file
82
packages/content-handler/test/ars-technica.test.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
import { ArsTechnicaHandler } from '../src/websites/ars-technica-handler'
|
||||
import fs from 'fs'
|
||||
import nock from 'nock'
|
||||
import { expect } from 'chai'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
describe('Testing parsing multi-page articles from arstechnica.', () => {
|
||||
let orignalArticle: Document | undefined
|
||||
let htmlPg1: string | null
|
||||
let htmlPg2: string | null
|
||||
let htmlPg3: string | null
|
||||
|
||||
const load = (path: string): string => {
|
||||
return fs.readFileSync(path, 'utf8')
|
||||
}
|
||||
|
||||
before(() => {
|
||||
htmlPg1 = load('./test/data/ars-multipage/ars-technica-page-1.html')
|
||||
htmlPg2 = load('./test/data/ars-multipage/ars-technica-page-2.html')
|
||||
htmlPg3 = load('./test/data/ars-multipage/ars-technica-page-3.html')
|
||||
|
||||
orignalArticle = parseHTML(htmlPg1).document
|
||||
})
|
||||
|
||||
beforeEach(() => {
|
||||
nock('https://arstechnica.com').get('/article/').reply(200, htmlPg1!)
|
||||
nock('https://arstechnica.com').get('/article/2/').reply(200, htmlPg2!)
|
||||
nock('https://arstechnica.com').get('/article/3/').reply(200, htmlPg3!)
|
||||
})
|
||||
|
||||
afterEach(() => {
|
||||
nock.cleanAll();
|
||||
})
|
||||
|
||||
it('should parse the title of the atlantic article.', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
// We grab the title from the doucment.
|
||||
expect(response.title).not.to.be.undefined
|
||||
expect(response.title).to.equal(
|
||||
'What’s going on with the reports of a room-temperature superconductor? | Ars Technica'
|
||||
)
|
||||
})
|
||||
|
||||
it('should remove the navigation links', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
expect(orignalArticle?.querySelector('nav.page-numbers')).not.to.be.null
|
||||
expect(response.dom?.querySelectorAll('nav.page-numbers').length).to.equal(0);
|
||||
})
|
||||
|
||||
it('should append all new content into the main article', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
// We name the div to ensure we can validate that it has been inserted.
|
||||
expect(
|
||||
orignalArticle?.getElementsByClassName('nextPageContents')?.length || 0
|
||||
).to.equal(0)
|
||||
expect(
|
||||
response.dom?.getElementsByClassName('nextPageContents')?.length || 0
|
||||
).not.to.equal(0)
|
||||
})
|
||||
|
||||
it('should remove any related content links.', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
// This exists in the HTML, but we remove it when preparsing.
|
||||
expect(
|
||||
response.dom?.getElementsByClassName(
|
||||
'ArticleRelatedContentModule_root__BBa6g'
|
||||
).length
|
||||
).to.eql(0)
|
||||
})
|
||||
})
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user