Fix parsing articles from www.derstandard.at (#459)
* Fix parsing articles from www.derstandard.at
* Slim cookies down
This commit is contained in:
32
packages/puppeteer-parse/derstandard-handler.js
Normal file
32
packages/puppeteer-parse/derstandard-handler.js
Normal file
@ -0,0 +1,32 @@
|
||||
/* eslint-disable no-undef */
|
||||
/* eslint-disable no-empty */
|
||||
/* eslint-disable @typescript-eslint/explicit-function-return-type */
|
||||
/* eslint-disable @typescript-eslint/no-var-requires */
|
||||
/* eslint-disable @typescript-eslint/no-require-imports */
|
||||
require('dotenv').config();
|
||||
const axios = require('axios');
|
||||
const jsdom = require("jsdom");
|
||||
const { JSDOM } = jsdom;
|
||||
|
||||
exports.derstandardHandler = {
|
||||
shouldPrehandle: (url, env) => {
|
||||
const u = new URL(url);
|
||||
return u.hostname === 'www.derstandard.at';
|
||||
},
|
||||
|
||||
prehandle: async (url, env) => {
|
||||
const response = await axios.get(url, {
|
||||
// set cookie to give consent to get the article
|
||||
headers: {
|
||||
'cookie': `DSGVO_ZUSAGE_V1=true; consentUUID=2bacb9c1-1e80-4be0-9f7b-ee987cf4e7b0_6`
|
||||
},
|
||||
});
|
||||
const content = response.data;
|
||||
|
||||
const dom = new JSDOM(content)
|
||||
const titleElement = dom.window.document.querySelector('.article-title')
|
||||
titleElement?.remove()
|
||||
|
||||
return { content: dom.window.document.body.outerHTML, title: titleElement?.textContent };
|
||||
}
|
||||
}
|
||||
@ -23,6 +23,7 @@ const { youtubeHandler } = require('./youtube-handler');
|
||||
const { tDotCoHandler } = require('./t-dot-co-handler');
|
||||
const { pdfHandler } = require('./pdf-handler');
|
||||
const { mediumHandler } = require('./medium-handler');
|
||||
const { derstandardHandler } = require('./derstandard-handler');
|
||||
|
||||
const storage = new Storage();
|
||||
const ALLOWED_ORIGINS = process.env.ALLOWED_ORIGINS ? process.env.ALLOWED_ORIGINS.split(',') : [];
|
||||
@ -226,6 +227,7 @@ const handlers = {
|
||||
'youtube': youtubeHandler,
|
||||
't-dot-co': tDotCoHandler,
|
||||
'medium': mediumHandler,
|
||||
'derstandard': derstandardHandler,
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user