diff --git a/packages/puppeteer-parse/derstandard-handler.js b/packages/puppeteer-parse/derstandard-handler.js new file mode 100644 index 000000000..8a732c05c --- /dev/null +++ b/packages/puppeteer-parse/derstandard-handler.js @@ -0,0 +1,32 @@ +/* eslint-disable no-undef */ +/* eslint-disable no-empty */ +/* eslint-disable @typescript-eslint/explicit-function-return-type */ +/* eslint-disable @typescript-eslint/no-var-requires */ +/* eslint-disable @typescript-eslint/no-require-imports */ +require('dotenv').config(); +const axios = require('axios'); +const jsdom = require("jsdom"); +const { JSDOM } = jsdom; + +exports.derstandardHandler = { + shouldPrehandle: (url, env) => { + const u = new URL(url); + return u.hostname === 'www.derstandard.at'; + }, + + prehandle: async (url, env) => { + const response = await axios.get(url, { + // set cookie to give consent to get the article + headers: { + 'cookie': `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6` + }, + }); + const content = response.data; + + const dom = new JSDOM(content) + const titleElement = dom.window.document.querySelector('.article-title') + titleElement?.remove() + + return { content: dom.window.document.body.outerHTML, title: titleElement?.textContent }; + } +} diff --git a/packages/puppeteer-parse/index.js b/packages/puppeteer-parse/index.js index 3f845b412..fc8db66f6 100644 --- a/packages/puppeteer-parse/index.js +++ b/packages/puppeteer-parse/index.js @@ -23,6 +23,7 @@ const { youtubeHandler } = require('./youtube-handler'); const { tDotCoHandler } = require('./t-dot-co-handler'); const { pdfHandler } = require('./pdf-handler'); const { mediumHandler } = require('./medium-handler'); +const { derstandardHandler } = require('./derstandard-handler'); const storage = new Storage(); const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : []; @@ -226,6 +227,7 @@ const handlers = { 'youtube': youtubeHandler, 't-dot-co': tDotCoHandler, 'medium': mediumHandler, + 'derstandard': derstandardHandler, }; /**