Fix parsing articles from www.derstandard.at (#459)

* Fix parsing articles from www.derstandard.at

* slim cookies down
This commit is contained in:
Hongbo Wu
2022-04-22 10:53:28 +08:00
committed by GitHub
parent 71aa66ea9c
commit b679451548
2 changed files with 34 additions and 0 deletions

View File

@ -0,0 +1,32 @@
/* eslint-disable no-undef */
/* eslint-disable no-empty */
/* eslint-disable @typescript-eslint/explicit-function-return-type */
/* eslint-disable @typescript-eslint/no-var-requires */
/* eslint-disable @typescript-eslint/no-require-imports */
require('dotenv').config();
const axios = require('axios');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
exports.derstandardHandler = {
shouldPrehandle: (url, env) => {
const u = new URL(url);
return u.hostname === 'www.derstandard.at';
},
prehandle: async (url, env) => {
const response = await axios.get(url, {
// set cookie to give consent to get the article
headers: {
'cookie': `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`
},
});
const content = response.data;
const dom = new JSDOM(content)
const titleElement = dom.window.document.querySelector('.article-title')
titleElement?.remove()
return { content: dom.window.document.body.outerHTML, title: titleElement?.textContent };
}
}

View File

@ -23,6 +23,7 @@ const { youtubeHandler } = require('./youtube-handler');
const { tDotCoHandler } = require('./t-dot-co-handler');
const { pdfHandler } = require('./pdf-handler');
const { mediumHandler } = require('./medium-handler');
const { derstandardHandler } = require('./derstandard-handler');
// Module-level client instance. `Storage` is imported outside this excerpt —
// presumably a cloud storage SDK class; TODO confirm against the file's requires.
const storage = new Storage();
// Array obtained by splitting the ALLOWED_ORIGINS environment variable on
// commas; an empty array when the variable is unset or an empty string.
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
@ -226,6 +227,7 @@ const handlers = {
'youtube': youtubeHandler,
't-dot-co': tDotCoHandler,
'medium': mediumHandler,
'derstandard': derstandardHandler,
};
/**