Add Ars Technica handler for multipage articles

This commit is contained in:
Thomas Rogers
2023-08-04 20:00:03 +02:00
parent 3d7691c068
commit 49c1976ac9
6 changed files with 2710 additions and 0 deletions

View File

@ -37,6 +37,7 @@ import { WeixinQqHandler } from './websites/weixin-qq-handler'
import { WikipediaHandler } from './websites/wikipedia-handler'
import { YoutubeHandler } from './websites/youtube-handler'
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
const validateUrlString = (url: string): boolean => {
const u = new URL(url)
@ -57,6 +58,7 @@ const validateUrlString = (url: string): boolean => {
}
const contentHandlers: ContentHandler[] = [
new ArsTechnicaHandler(),
new TheAtlanticHandler(),
new AppleNewsHandler(),
new BloombergHandler(),

View File

@ -0,0 +1,87 @@
import axios from 'axios'
import { parseHTML } from 'linkedom'
import { ContentHandler, PreHandleResult } from '../content-handler'
/**
 * Some of the content on Ars Technica is split over several pages.
 * If this is the case we should unfurl the entire article into one.
 */
export class ArsTechnicaHandler extends ContentHandler {
  constructor() {
    super()
    this.name = 'ArsTechnica'
  }

  /** Handles arstechnica.com and any of its subdomains. */
  shouldPreHandle(url: string): boolean {
    const u = new URL(url)
    return u.hostname.endsWith('arstechnica.com')
  }

  /**
   * True when the article carries a pagination nav, i.e. its content is
   * spread over multiple pages.
   */
  hasMultiplePages(document: Document): boolean {
    // querySelectorAll always returns a (possibly empty) NodeList, never
    // null, so no optional chaining is needed here.
    return document.querySelectorAll('nav.page-numbers').length !== 0
  }

  /** Fetches a URL with a plain GET (no JavaScript) and parses it into a Document. */
  async grabContentFromUrl(url: string): Promise<Document> {
    const response = await axios.get(url)
    const data = response.data as string
    return parseHTML(data).document
  }

  /**
   * Fetches a single article page and returns the child nodes of its
   * article body, or an empty array when no article body is present.
   *
   * Note: the return type was previously annotated as Document[], but the
   * values are the element's child nodes — Node is the honest type, and it
   * is what the `span.append(...)` call site actually needs.
   */
  async extractArticleContentsFromLink(url: string): Promise<Node[]> {
    const dom = await this.grabContentFromUrl(url)
    const articleContent = dom.querySelector('[itemprop="articleBody"]')
    return Array.from(articleContent?.childNodes ?? [])
  }

  /**
   * Fetches every additional page of the article and appends their contents
   * to the first page's article body, then strips the pagination nav.
   */
  async expandLinksAndCombine(document: Document): Promise<Document> {
    const pageNumbers = document.querySelector('nav.page-numbers')
    const articleBody = document.querySelector('[itemprop="articleBody"]')
    if (!pageNumbers || !articleBody) {
      // We shouldn't ever really get here, but sometimes weird things happen.
      return document
    }
    const pageLinkNodes = pageNumbers.querySelectorAll('a')
    // Drop the trailing "Next" link, as it would duplicate some content.
    const pageLinks = Array.from(pageLinkNodes, (anchor) => anchor.href).slice(
      0,
      -1
    )
    const pageContents = await Promise.all(
      pageLinks.map((link) => this.extractArticleContentsFromLink(link))
    )
    for (const articleContents of pageContents) {
      // We place all the content in a span to indicate that a page has been parsed.
      const span = document.createElement('SPAN')
      span.className = 'nextPageContents'
      span.append(...articleContents)
      articleBody.append(span)
    }
    pageNumbers.remove()
    return document
  }

  /**
   * Retrieves the article without JavaScript via a plain GET; when the
   * article spans multiple pages, inlines every page into one document.
   */
  async preHandle(url: string): Promise<PreHandleResult> {
    // We simply retrieve the article without Javascript enabled using a GET command.
    const dom = await this.grabContentFromUrl(url)
    if (!this.hasMultiplePages(dom)) {
      return {
        content: dom.body.outerHTML,
        title: dom.title,
        dom,
      }
    }
    const expandedDom = await this.expandLinksAndCombine(dom)
    return {
      content: expandedDom.body.outerHTML,
      title: dom.title,
      dom: expandedDom,
    }
  }
}