diff --git a/packages/readabilityjs/Readability.js b/packages/readabilityjs/Readability.js
index 6799eb5da..58d5cccd0 100644
--- a/packages/readabilityjs/Readability.js
+++ b/packages/readabilityjs/Readability.js
@@ -303,7 +303,7 @@ Readability.prototype = {
if (!this._keepClasses) {
// Remove classes.
- this._cleanClasses(articleContent);
+ this._cleanElement(articleContent);
}
},
@@ -456,7 +456,7 @@ Readability.prototype = {
* @param Element
* @return void
*/
- _cleanClasses: function (node) {
+ _cleanElement: function (node) {
if (node.className && node.className.startsWith && node.className.startsWith('_omnivore')) {
return;
}
@@ -483,8 +483,10 @@ Readability.prototype = {
node.removeAttribute("class");
}
+ this._removeAllEventHandlers(node)
+
for (node = node.firstElementChild; node; node = node.nextElementSibling) {
- this._cleanClasses(node);
+ this._cleanElement(node);
}
},
@@ -546,7 +548,6 @@ Readability.prototype = {
this._forEachNode(medias, function (media) {
var src = media.getAttribute("src");
var poster = media.getAttribute("poster");
- var srcset = media.getAttribute("srcset");
if (src) {
media.setAttribute("src", this.toAbsoluteURI(src));
@@ -558,6 +559,20 @@ Readability.prototype = {
});
},
+ // removes all the javascript event handlers from the supplied element
+ _removeAllEventHandlers(element) {
+ const attributes = element.attributes;
+
+ // Iterate in reverse because removing attributes changes the length
+ for (let i = attributes.length - 1; i >= 0; i--) {
+ const attribute = attributes[i];
+ // Check if the attribute starts with "on" (like "onload", "onerror", etc.)
+ if (attribute.name.startsWith('on')) {
+ element.removeAttribute(attribute.name);
+ }
+ }
+ },
+
/** Creates imageproxy links for all article images with href source */
_createImageProxyLinks: function (articleContent) {
if (this.createImageProxyUrl !== undefined) {
diff --git a/packages/readabilityjs/test/test-pages/caixin/expected.html b/packages/readabilityjs/test/test-pages/caixin/expected.html
index 1d85d1b14..a2455e52a 100644
--- a/packages/readabilityjs/test/test-pages/caixin/expected.html
+++ b/packages/readabilityjs/test/test-pages/caixin/expected.html
@@ -19,7 +19,7 @@
-
【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
+
【财新网】9月26日,汽车服务平台途虎养车正式在港交所主板挂牌上市。途虎养车( 09690.HK )上市发行价为28港元/股,此前公司披露的发行价区间为28港元/股至31港元/股,即实际发行价为区间下限。当日,途虎养车收报29.5港元/股,较发行价涨5.36%,市值为239.6亿港元。
途虎养车上市不易。途虎养车2022年1月即在港交所递表,2022年8月、2023年3月两次重新递交上市申请材料,终于在2023年8月23日通过聆讯。
diff --git a/packages/readabilityjs/test/test-readability.js b/packages/readabilityjs/test/test-readability.js
index 05460f8b3..abcba7760 100644
--- a/packages/readabilityjs/test/test-readability.js
+++ b/packages/readabilityjs/test/test-readability.js
@@ -1,353 +1,417 @@
-var chai = require("chai");
-var sinon = require("sinon");
-var chaiAsPromised = require("chai-as-promised");
-const { parseHTML } = require("linkedom");
-const nock = require("nock");
+var chai = require('chai')
+var sinon = require('sinon')
+var chaiAsPromised = require('chai-as-promised')
+const { parseHTML } = require('linkedom')
+const nock = require('nock')
-chai.use(chaiAsPromised);
-chai.config.includeStack = true;
-var expect = chai.expect;
+chai.use(chaiAsPromised)
+chai.config.includeStack = true
+var expect = chai.expect
-var Readability = require("../index").Readability;
-var JSDOMParser = require("../JSDOMParser");
-var prettyPrint = require("./utils").prettyPrint;
+var Readability = require('../index').Readability
+var JSDOMParser = require('../JSDOMParser')
+var prettyPrint = require('./utils').prettyPrint
-const isOmnivore = process.env.IS_OMNIVORE;
-var testPages = require("./utils").getTestPages(isOmnivore);
+const isOmnivore = process.env.IS_OMNIVORE
+var testPages = require('./utils').getTestPages(isOmnivore)
function reformatError(err) {
- var formattedError = new Error(err.message);
- formattedError.stack = err.stack;
- return formattedError;
+ var formattedError = new Error(err.message)
+ formattedError.stack = err.stack
+ return formattedError
}
function inOrderTraverse(fromNode) {
if (fromNode.firstChild) {
- return fromNode.firstChild;
+ return fromNode.firstChild
}
while (fromNode && !fromNode.nextSibling) {
- fromNode = fromNode.parentNode;
+ fromNode = fromNode.parentNode
}
- return fromNode ? fromNode.nextSibling : null;
+ return fromNode ? fromNode.nextSibling : null
}
function inOrderIgnoreEmptyTextNodes(fromNode) {
do {
- fromNode = inOrderTraverse(fromNode);
- } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim());
- return fromNode;
+ fromNode = inOrderTraverse(fromNode)
+ } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim())
+ return fromNode
}
function traverseDOM(callback, expectedDOM, actualDOM) {
- var actualNode = actualDOM.documentElement || actualDOM.childNodes[0];
- var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0];
+ var actualNode = actualDOM.documentElement || actualDOM.childNodes[0]
+ var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]
while (actualNode || expectedNode) {
// We'll stop if we don't have both actualNode and expectedNode
if (!callback(actualNode, expectedNode)) {
- break;
+ break
}
- actualNode = inOrderIgnoreEmptyTextNodes(actualNode);
- expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode);
+ actualNode = inOrderIgnoreEmptyTextNodes(actualNode)
+ expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode)
}
}
// Collapse subsequent whitespace like HTML:
function htmlTransform(str) {
- return str.replace(/\s+/g, " ");
+ return str.replace(/\s+/g, ' ')
}
-function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) {
- describe(label, function() {
- this.timeout(30000);
+function runTestsWithItems(
+ label,
+ domGenerationFn,
+ source,
+ expectedContent,
+ expectedMetadata,
+ uri
+) {
+ describe(label, function () {
+ this.timeout(30000)
- var result;
+ var result
- before(async function() {
+ before(async function () {
try {
- var doc = domGenerationFn(source);
+ var doc = domGenerationFn(source)
// Provide one class name to preserve, which we know appears in a few
// of the test documents.
- var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri });
- result = await myReader.parse();
+ var myReader = new Readability(doc, {
+ classesToPreserve: ['caption'],
+ url: uri,
+ })
+ result = await myReader.parse()
} catch (err) {
- throw reformatError(err);
+ throw reformatError(err)
}
- });
+ })
- it("should return a result object", function() {
- expect(result).to.include.keys("content", "title", "excerpt", "byline");
- });
+ it('should return a result object', function () {
+ expect(result).to.include.keys('content', 'title', 'excerpt', 'byline')
+ })
- it("should extract expected content", function() {
+ it('should extract expected content', function () {
function nodeStr(n) {
if (!n) {
- return "(no node)";
+ return '(no node)'
}
if (n.nodeType == 3) {
- return "#text(" + htmlTransform(n.textContent) + ")";
+ return '#text(' + htmlTransform(n.textContent) + ')'
}
if (n.nodeType != 1) {
- return "some other node type: " + n.nodeType + " with data " + n.data;
+ return 'some other node type: ' + n.nodeType + ' with data ' + n.data
}
- var rv = n.localName;
+ var rv = n.localName
if (n.id) {
- rv += "#" + n.id;
+ rv += '#' + n.id
}
if (n.className) {
- rv += ".(" + n.className + ")";
+ rv += '.(' + n.className + ')'
}
- return rv;
+ return rv
}
function genPath(node) {
if (node.id) {
- return "#" + node.id;
+ return '#' + node.id
}
- if (node.tagName == "BODY") {
- return "body";
+ if (node.tagName == 'BODY') {
+ return 'body'
}
- var parent = node.parentNode;
- var parentPath = genPath(parent);
- var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1;
- return parentPath + " > " + nodeStr(node) + ":nth-child(" + index + ")";
+ var parent = node.parentNode
+ var parentPath = genPath(parent)
+ var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1
+ return parentPath + ' > ' + nodeStr(node) + ':nth-child(' + index + ')'
}
function findableNodeDesc(node) {
- return genPath(node) + "(in: ``" + node.parentNode.innerHTML + "``)";
+ return genPath(node) + '(in: ``' + node.parentNode.innerHTML + '``)'
}
function attributesForNode(node) {
- return Array.from(node.attributes).map(function(attr) {
- return attr.name + "=" + attr.value;
- }).join(",");
+ return Array.from(node.attributes)
+ .map(function (attr) {
+ return attr.name + '=' + attr.value
+ })
+ .join(',')
}
-
- var actualDOM = domGenerationFn(prettyPrint(result.content));
- var expectedDOM = domGenerationFn(prettyPrint(expectedContent));
- traverseDOM(function(actualNode, expectedNode) {
- if (actualNode && expectedNode) {
- var actualDesc = nodeStr(actualNode);
- var expectedDesc = nodeStr(expectedNode);
- if (actualDesc != expectedDesc) {
- expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc);
- return false;
- }
- // Compare text for text nodes:
- if (actualNode.nodeType == 3) {
- var actualText = htmlTransform(actualNode.textContent);
- var expectedText = htmlTransform(expectedNode.textContent);
- expect(actualText, findableNodeDesc(actualNode)).eql(expectedText);
- if (actualText != expectedText) {
- return false;
+ var actualDOM = domGenerationFn(prettyPrint(result.content))
+ var expectedDOM = domGenerationFn(prettyPrint(expectedContent))
+ traverseDOM(
+ function (actualNode, expectedNode) {
+ if (actualNode && expectedNode) {
+ var actualDesc = nodeStr(actualNode)
+ var expectedDesc = nodeStr(expectedNode)
+ if (actualDesc != expectedDesc) {
+ expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc)
+ return false
}
- // Compare attributes for element nodes:
- } else if (actualNode.nodeType == 1) {
- var actualNodeDesc = attributesForNode(actualNode);
- var expectedNodeDesc = attributesForNode(expectedNode);
- var desc = "node " + nodeStr(actualNode) + " attributes (" + actualNodeDesc + ") should match (" + expectedNodeDesc + ") ";
- expect(actualNode.attributes.length, desc).eql(expectedNode.attributes.length);
- for (var i = 0; i < actualNode.attributes.length; i++) {
- var attr = actualNode.attributes[i].name;
- var actualValue = actualNode.getAttribute(attr);
- var expectedValue = expectedNode.getAttribute(attr);
- expect(expectedValue, "node (" + findableNodeDesc(actualNode) + ") attribute " + attr + " should match").eql(actualValue);
+ // Compare text for text nodes:
+ if (actualNode.nodeType == 3) {
+ var actualText = htmlTransform(actualNode.textContent)
+ var expectedText = htmlTransform(expectedNode.textContent)
+ expect(actualText, findableNodeDesc(actualNode)).eql(expectedText)
+ if (actualText != expectedText) {
+ return false
+ }
+ // Compare attributes for element nodes:
+ } else if (actualNode.nodeType == 1) {
+ var actualNodeDesc = attributesForNode(actualNode)
+ var expectedNodeDesc = attributesForNode(expectedNode)
+ var desc =
+ 'node ' +
+ nodeStr(actualNode) +
+ ' attributes (' +
+ actualNodeDesc +
+ ') should match (' +
+ expectedNodeDesc +
+ ') '
+ expect(actualNode.attributes.length, desc).eql(
+ expectedNode.attributes.length
+ )
+ for (var i = 0; i < actualNode.attributes.length; i++) {
+ var attr = actualNode.attributes[i].name
+ var actualValue = actualNode.getAttribute(attr)
+ var expectedValue = expectedNode.getAttribute(attr)
+ expect(
+ expectedValue,
+ 'node (' +
+ findableNodeDesc(actualNode) +
+ ') attribute ' +
+ attr +
+ ' should match'
+ ).eql(actualValue)
+ }
}
+ } else {
+ expect(
+ nodeStr(actualNode),
+ 'Should have a node from both DOMs'
+ ).eql(nodeStr(expectedNode))
+ return false
}
- } else {
- expect(nodeStr(actualNode), "Should have a node from both DOMs").eql(nodeStr(expectedNode));
- return false;
- }
- return true;
- }, actualDOM, expectedDOM);
- });
+ return true
+ },
+ actualDOM,
+ expectedDOM
+ )
+ })
- it("should extract expected title", function() {
- expect(result.title).eql(expectedMetadata.title);
- });
+ it('should extract expected title', function () {
+ expect(result.title).eql(expectedMetadata.title)
+ })
- it("should extract expected byline", function() {
- expect(result.byline).eql(expectedMetadata.byline);
- });
+ it('should extract expected byline', function () {
+ expect(result.byline).eql(expectedMetadata.byline)
+ })
- it("should extract expected excerpt", function() {
- expect(result.excerpt).eql(expectedMetadata.excerpt);
- });
+ it('should extract expected excerpt', function () {
+ expect(result.excerpt).eql(expectedMetadata.excerpt)
+ })
- it("should extract expected site name", function() {
- expect(result.siteName).eql(expectedMetadata.siteName);
- });
+ it('should extract expected site name', function () {
+ expect(result.siteName).eql(expectedMetadata.siteName)
+ })
- expectedMetadata.dir && it("should extract expected direction", function() {
- expect(result.dir).eql(expectedMetadata.dir);
- });
- });
+ expectedMetadata.dir &&
+ it('should extract expected direction', function () {
+ expect(result.dir).eql(expectedMetadata.dir)
+ })
+ })
}
function removeCommentNodesRecursively(node) {
for (var i = node.childNodes.length - 1; i >= 0; i--) {
- var child = node.childNodes[i];
+ var child = node.childNodes[i]
if (child.nodeType === child.COMMENT_NODE) {
- node.removeChild(child);
+ node.removeChild(child)
} else if (child.nodeType === child.ELEMENT_NODE) {
- removeCommentNodesRecursively(child);
+ removeCommentNodesRecursively(child)
}
}
}
-describe("Readability API", function() {
- describe("#constructor", function() {
- var doc = new JSDOMParser().parse("My image: ' +
+ it('should handle srcset elements with density descriptors', async function () {
+ var dom = parseHTML(
+ 'My image:

' +
+ ''
+ )
+ var expected_xhtml =
+ '
My image: ' +
'
![]()
';
- var content = (await (new Readability(dom.document, {
- createImageProxyUrl: function(url) {
- return url;
- }
- })).parse()).content;
- expect(content).eql(expected_xhtml);
- });
+ 'https://webkit.org/demos/srcset/image-4x.png 4x,">
'
+ var content = (
+ await new Readability(dom.document, {
+ createImageProxyUrl: function (url) {
+ return url
+ },
+ }).parse()
+ ).content
+ expect(content).eql(expected_xhtml)
+ })
- it("should remove srcset elements that are lazy loading placeholders", async function() {
- var dom = parseHTML('My image: ' +
+ it('should remove srcset elements that are lazy loading placeholders', async function () {
+ var dom = parseHTML(
+ 'My image:

'
+ )
+ var expected_xhtml =
+ '
' +
'My image:

' +
- '
';
- var content = (await (new Readability(dom.document, {
- createImageProxyUrl: function(url) {
- return url;
- }
- })).parse()).content;
- expect(content).eql(expected_xhtml);
- });
- });
-});
+ '
'
+ var content = (
+ await new Readability(dom.document, {
+ createImageProxyUrl: function (url) {
+ return url
+ },
+ }).parse()
+ ).content
+ expect(content).eql(expected_xhtml)
+ })
+ })
+})
-describe("Test pages", function() {
- before(function() {
+describe('Test pages', function () {
+ before(function () {
// mock the substack redirect to twitter
nock('https://email.mg2.substack.com')
- .get('/c/eJxNkcuO3SAMhp8m7E5EzC1ZsDjT0fQidVG1qtRVRIKToCYQAZmZ9OlLejaVEBhk-__5PJqMc4in3kPK5Nr6fO6oPb6lFXPGSI6EsXdWK8ZAcQXEam6bVrTEpX6KiJtxq87xQLIfw-pGk13wVwWojlNKFo1UMpgYWCrbtoNJmIYh60AMYKxopoewOaxDP6LGV4xn8EjGsG3o89WLrHrJeU8Vu1fwUlZ-c5e9uuSU2_cF1xX9gHHG-PW5vKRs8pFK0AjoWq6KdMNapVoJvGIvpdEz0ApkLsEdPrLz6dP5Pv76LNOfnx_gxxd6J04DBaACZDkphZrVVqnR0nZCaVs6GVFxus1Qp2MoeuPvyw2Jegl-HsJNheC-JX7DkuX8EN5vFrdQh8271xCxNvt-Ieuvbx7e5bNHb4YV7YNmfgzlH99-Ro-xDMv2JutGCiY6yRsGgj_gFURCyk6wRpHixoZS5fXmxsXgmv6n8xdPFKS3')
+ .get(
+ '/c/eJxNkcuO3SAMhp8m7E5EzC1ZsDjT0fQidVG1qtRVRIKToCYQAZmZ9OlLejaVEBhk-__5PJqMc4in3kPK5Nr6fO6oPb6lFXPGSI6EsXdWK8ZAcQXEam6bVrTEpX6KiJtxq87xQLIfw-pGk13wVwWojlNKFo1UMpgYWCrbtoNJmIYh60AMYKxopoewOaxDP6LGV4xn8EjGsG3o89WLrHrJeU8Vu1fwUlZ-c5e9uuSU2_cF1xX9gHHG-PW5vKRs8pFK0AjoWq6KdMNapVoJvGIvpdEz0ApkLsEdPrLz6dP5Pv76LNOfnx_gxxd6J04DBaACZDkphZrVVqnR0nZCaVs6GVFxus1Qp2MoeuPvyw2Jegl-HsJNheC-JX7DkuX8EN5vFrdQh8271xCxNvt-Ieuvbx7e5bNHb4YV7YNmfgzlH99-Ro-xDMv2JutGCiY6yRsGgj_gFURCyk6wRpHixoZS5fXmxsXgmv6n8xdPFKS3'
+ )
.reply(302, '', [
'location',
- 'https://twitter.com/ShellenbergerMD/status/1529847068138778624?s=20&t=A2G3yBHyxcYI6szVC2TJ0A'
- ]);
+ 'https://twitter.com/ShellenbergerMD/status/1529847068138778624?s=20&t=A2G3yBHyxcYI6szVC2TJ0A',
+ ])
nock('https://twitter.com')
.get('/ShellenbergerMD/status/1529847068138778624')
- .query({"s":"20","t":"A2G3yBHyxcYI6szVC2TJ0A"})
- .reply(200);
- });
+ .query({ s: '20', t: 'A2G3yBHyxcYI6szVC2TJ0A' })
+ .reply(200)
+ })
- testPages.forEach(function(testPage) {
- describe(testPage.dir, function() {
- var uri = "http://fakehost/test/page.html";
+ testPages.forEach(function (testPage) {
+ describe(testPage.dir, function () {
+ var uri = 'http://fakehost/test/page.html'
- runTestsWithItems("linkedom", function(source) {
- var doc = parseHTML(source).document;
- removeCommentNodesRecursively(doc);
- return doc;
- }, testPage.source, testPage.expectedContent, testPage.expectedMetadata, uri);
+ runTestsWithItems(
+ 'linkedom',
+ function (source) {
+ var doc = parseHTML(source).document
+ removeCommentNodesRecursively(doc)
+ return doc
+ },
+ testPage.source,
+ testPage.expectedContent,
+ testPage.expectedMetadata,
+ uri
+ )
// runTestsWithItems("JSDOMParser", function(source) {
// var parser = new JSDOMParser();
@@ -358,6 +422,6 @@ describe("Test pages", function() {
// }
// return doc;
// }, testPage.source, testPage.expectedContent, testPage.expectedMetadata);
- });
- });
-});
+ })
+ })
+})