Merge pull request #2608 from Podginator/feat/Ars-technica-handler
Ars Technica Multipage handling
This commit is contained in:
@@ -37,6 +37,7 @@ import { WeixinQqHandler } from './websites/weixin-qq-handler'
|
||||
import { WikipediaHandler } from './websites/wikipedia-handler'
|
||||
import { YoutubeHandler } from './websites/youtube-handler'
|
||||
import { TheAtlanticHandler } from './websites/the-atlantic-handler'
|
||||
import { ArsTechnicaHandler } from './websites/ars-technica-handler'
|
||||
|
||||
const validateUrlString = (url: string): boolean => {
|
||||
const u = new URL(url)
|
||||
@@ -57,6 +58,7 @@ const validateUrlString = (url: string): boolean => {
|
||||
}
|
||||
|
||||
const contentHandlers: ContentHandler[] = [
|
||||
new ArsTechnicaHandler(),
|
||||
new TheAtlanticHandler(),
|
||||
new AppleNewsHandler(),
|
||||
new BloombergHandler(),
|
||||
|
||||
@@ -0,0 +1,86 @@
|
||||
import axios from 'axios'
|
||||
import { parseHTML } from 'linkedom'
|
||||
import { ContentHandler, PreHandleResult } from '../content-handler'
|
||||
|
||||
/**
|
||||
* Some of the content on Ars Technica is split over several pages.
|
||||
* If this is the case we should unfurl the entire article into one. l
|
||||
*/
|
||||
export class ArsTechnicaHandler extends ContentHandler {
|
||||
constructor() {
|
||||
super()
|
||||
this.name = 'ArsTechnica'
|
||||
}
|
||||
|
||||
shouldPreHandle(url: string): boolean {
|
||||
const u = new URL(url)
|
||||
return u.hostname.endsWith('arstechnica.com')
|
||||
}
|
||||
|
||||
hasMultiplePages(document: Document): boolean {
|
||||
return document.querySelectorAll('nav.page-numbers')?.length != 0
|
||||
}
|
||||
|
||||
async grabContentFromUrl(url: string): Promise<Document> {
|
||||
const response = await axios.get(url)
|
||||
const data = response.data as string
|
||||
return parseHTML(data).document
|
||||
}
|
||||
|
||||
async extractArticleContentsFromLink(url: string): Promise<Document[]> {
|
||||
const dom = await this.grabContentFromUrl(url)
|
||||
const articleContent = dom.querySelector('[itemprop="articleBody"]')
|
||||
return [].slice.call(articleContent?.childNodes || [])
|
||||
}
|
||||
|
||||
async expandLinksAndCombine(document: Document): Promise<Document> {
|
||||
const pageNumbers = document.querySelector('nav.page-numbers')
|
||||
const articleBody = document.querySelector('[itemprop="articleBody"]')
|
||||
|
||||
if (!pageNumbers || !articleBody) {
|
||||
// We shouldn't ever really get here, but sometimes weird things happen.
|
||||
return document
|
||||
}
|
||||
|
||||
const pageLinkNodes = pageNumbers.querySelectorAll('a')
|
||||
// Remove the "Next" Link, as it will duplicate some content.
|
||||
const pageLinks =
|
||||
Array.from(pageLinkNodes)
|
||||
?.slice(0, pageLinkNodes.length - 1)
|
||||
?.map(({ href }) => href) ?? []
|
||||
|
||||
const pageContents = await Promise.all(
|
||||
pageLinks.map(this.extractArticleContentsFromLink.bind(this))
|
||||
)
|
||||
|
||||
for (const articleContents of pageContents) {
|
||||
// We place all the content in a span to indicate that a page has been parsed.
|
||||
const span = document.createElement('SPAN')
|
||||
span.className = 'nextPageContents'
|
||||
span.append(...articleContents)
|
||||
articleBody.append(span)
|
||||
}
|
||||
pageNumbers.remove()
|
||||
|
||||
return document
|
||||
}
|
||||
|
||||
async preHandle(url: string): Promise<PreHandleResult> {
|
||||
// We simply retrieve the article without Javascript enabled using a GET command.
|
||||
const dom = await this.grabContentFromUrl(url)
|
||||
if (!this.hasMultiplePages(dom)) {
|
||||
return {
|
||||
content: dom.body.outerHTML,
|
||||
title: dom.title,
|
||||
dom,
|
||||
}
|
||||
}
|
||||
|
||||
const expandedDom = await this.expandLinksAndCombine(dom)
|
||||
return {
|
||||
content: expandedDom.body.outerHTML,
|
||||
title: dom.title,
|
||||
dom: expandedDom,
|
||||
}
|
||||
}
|
||||
}
|
||||
82
packages/content-handler/test/ars-technica.test.ts
Normal file
82
packages/content-handler/test/ars-technica.test.ts
Normal file
@@ -0,0 +1,82 @@
|
||||
import { ArsTechnicaHandler } from '../src/websites/ars-technica-handler'
|
||||
import fs from 'fs'
|
||||
import nock from 'nock'
|
||||
import { expect } from 'chai'
|
||||
import { parseHTML } from 'linkedom'
|
||||
|
||||
describe('Testing parsing multi-page articles from arstechnica.', () => {
|
||||
let orignalArticle: Document | undefined
|
||||
let htmlPg1: string | null
|
||||
let htmlPg2: string | null
|
||||
let htmlPg3: string | null
|
||||
|
||||
const load = (path: string): string => {
|
||||
return fs.readFileSync(path, 'utf8')
|
||||
}
|
||||
|
||||
before(() => {
|
||||
htmlPg1 = load('./test/data/ars-multipage/ars-technica-page-1.html')
|
||||
htmlPg2 = load('./test/data/ars-multipage/ars-technica-page-2.html')
|
||||
htmlPg3 = load('./test/data/ars-multipage/ars-technica-page-3.html')
|
||||
|
||||
orignalArticle = parseHTML(htmlPg1).document
|
||||
})
|
||||
|
||||
beforeEach(() => {
|
||||
nock('https://arstechnica.com').get('/article/').reply(200, htmlPg1!)
|
||||
nock('https://arstechnica.com').get('/article/2/').reply(200, htmlPg2!)
|
||||
nock('https://arstechnica.com').get('/article/3/').reply(200, htmlPg3!)
|
||||
})
|
||||
|
||||
afterEach(() => {
|
||||
nock.cleanAll();
|
||||
})
|
||||
|
||||
it('should parse the title of the atlantic article.', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
// We grab the title from the doucment.
|
||||
expect(response.title).not.to.be.undefined
|
||||
expect(response.title).to.equal(
|
||||
'What’s going on with the reports of a room-temperature superconductor? | Ars Technica'
|
||||
)
|
||||
})
|
||||
|
||||
it('should remove the navigation links', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
expect(orignalArticle?.querySelector('nav.page-numbers')).not.to.be.null
|
||||
expect(response.dom?.querySelectorAll('nav.page-numbers').length).to.equal(0);
|
||||
})
|
||||
|
||||
it('should append all new content into the main article', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
// We name the div to ensure we can validate that it has been inserted.
|
||||
expect(
|
||||
orignalArticle?.getElementsByClassName('nextPageContents')?.length || 0
|
||||
).to.equal(0)
|
||||
expect(
|
||||
response.dom?.getElementsByClassName('nextPageContents')?.length || 0
|
||||
).not.to.equal(0)
|
||||
})
|
||||
|
||||
it('should remove any related content links.', async () => {
|
||||
const response = await new ArsTechnicaHandler().preHandle(
|
||||
'https://arstechnica.com/article/'
|
||||
)
|
||||
|
||||
// This exists in the HTML, but we remove it when preparsing.
|
||||
expect(
|
||||
response.dom?.getElementsByClassName(
|
||||
'ArticleRelatedContentModule_root__BBa6g'
|
||||
).length
|
||||
).to.eql(0)
|
||||
})
|
||||
})
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
Reference in New Issue
Block a user