const chai = require('chai')
const sinon = require('sinon')
const chaiAsPromised = require('chai-as-promised')
const { parseHTML } = require('linkedom')
const nock = require('nock')

chai.use(chaiAsPromised)
chai.config.includeStack = true
const expect = chai.expect

const Readability = require('../index').Readability
const JSDOMParser = require('../JSDOMParser')
const prettyPrint = require('./utils').prettyPrint

const isOmnivore = process.env.IS_OMNIVORE
const testPages = require('./utils').getTestPages(isOmnivore)

// NOTE(review): several HTML fixtures in this file contain no tags at all
// (e.g. the '#constructor' document and every `expected_xhtml` value), which
// contradicts what the surrounding assertions expect. The markup looks like it
// was stripped by a text-extraction step; the literals are kept exactly as
// found and should be restored from version control.

/**
 * Builds a fresh Error carrying the message and stack of `err`.
 *
 * @param {Error} err - error caught while parsing a test document
 * @returns {Error} new Error with the same `message` and `stack`
 */
function reformatError(err) {
  const formattedError = new Error(err.message)
  formattedError.stack = err.stack
  return formattedError
}

/**
 * Returns the node that follows `fromNode` in document order: its first child
 * if it has one, otherwise the next sibling of the closest node (starting with
 * `fromNode` itself, then walking up `parentNode`) that has one.
 *
 * @param {Node} fromNode - current node; must not be null
 * @returns {Node|null} the following node, or null when the walk is finished
 */
function inOrderTraverse(fromNode) {
  if (fromNode.firstChild) {
    return fromNode.firstChild
  }
  let node = fromNode
  while (node && !node.nextSibling) {
    node = node.parentNode
  }
  return node ? node.nextSibling : null
}

/**
 * Same as `inOrderTraverse`, but skips text nodes (nodeType 3) whose text is
 * empty after trimming.
 *
 * @param {Node} fromNode - current node; must not be null
 * @returns {Node|null} next node that is not a whitespace-only text node
 */
function inOrderIgnoreEmptyTextNodes(fromNode) {
  let node = fromNode
  do {
    node = inOrderTraverse(node)
  } while (node && node.nodeType === 3 && !node.textContent.trim())
  return node
}

/**
 * Walks two DOMs in lock-step, invoking `callback(actualNode, expectedNode)`
 * for each pair of nodes until the callback returns a falsy value or both
 * walks are exhausted.
 *
 * The parameter order matches the only call site, which passes the actual DOM
 * first; declaring the parameters the other way round makes the callback
 * receive the two nodes under each other's names.
 *
 * The callback must return a falsy value when either node is missing, because
 * advancing from a null node would throw.
 *
 * @param {function(Node, Node): boolean} callback - per-pair comparison
 * @param {Document} actualDOM - DOM built from the extracted content
 * @param {Document} expectedDOM - DOM built from the expected content
 */
function traverseDOM(callback, actualDOM, expectedDOM) {
  let actualNode = actualDOM.documentElement || actualDOM.childNodes[0]
  let expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]
  while (actualNode || expectedNode) {
    // We'll stop if we don't have both actualNode and expectedNode
    if (!callback(actualNode, expectedNode)) {
      break
    }
    actualNode = inOrderIgnoreEmptyTextNodes(actualNode)
    expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode)
  }
}

// Collapse subsequent whitespace like HTML:
function htmlTransform(str) {
  return str.replace(/\s+/g, ' ')
}

/**
 * Registers a Mocha suite that parses `source` with Readability and compares
 * the result against the expected content and metadata.
 *
 * @param {string} label - suite name
 * @param {function(string): Document} domGenerationFn - builds a DOM from HTML
 * @param {string} source - HTML of the page under test
 * @param {string} expectedContent - expected extracted HTML
 * @param {object} expectedMetadata - expected title, byline, excerpt, siteName
 *   and (optionally) dir
 * @param {string} uri - passed to Readability as the `url` option
 */
function runTestsWithItems(
  label,
  domGenerationFn,
  source,
  expectedContent,
  expectedMetadata,
  uri
) {
  describe(label, function () {
    this.timeout(30000)

    let result

    before(async function () {
      try {
        const doc = domGenerationFn(source)
        // Provide one class name to preserve, which we know appears in a few
        // of the test documents.
        const myReader = new Readability(doc, {
          classesToPreserve: ['caption'],
          url: uri,
        })
        result = await myReader.parse()
      } catch (err) {
        throw reformatError(err)
      }
    })

    it('should return a result object', function () {
      expect(result).to.include.keys('content', 'title', 'excerpt', 'byline')
    })

    it('should extract expected content', function () {
      // Short human-readable description of a node for failure messages.
      function nodeStr(n) {
        if (!n) {
          return '(no node)'
        }
        if (n.nodeType === 3) {
          return '#text(' + htmlTransform(n.textContent) + ')'
        }
        if (n.nodeType !== 1) {
          return 'some other node type: ' + n.nodeType + ' with data ' + n.data
        }
        let rv = n.localName
        if (n.id) {
          rv += '#' + n.id
        }
        if (n.className) {
          rv += '.(' + n.className + ')'
        }
        return rv
      }

      // CSS-like path to a node; recursion stops at the first ancestor that
      // has an id or is the BODY element.
      function genPath(node) {
        if (node.id) {
          return '#' + node.id
        }
        if (node.tagName === 'BODY') {
          return 'body'
        }
        const parent = node.parentNode
        const parentPath = genPath(parent)
        const index = Array.prototype.indexOf.call(parent.childNodes, node) + 1
        return parentPath + ' > ' + nodeStr(node) + ':nth-child(' + index + ')'
      }

      function findableNodeDesc(node) {
        return genPath(node) + '(in: ``' + node.parentNode.innerHTML + '``)'
      }

      function attributesForNode(node) {
        return Array.from(node.attributes)
          .map(function (attr) {
            return attr.name + '=' + attr.value
          })
          .join(',')
      }

      const actualDOM = domGenerationFn(prettyPrint(result.content))
      const expectedDOM = domGenerationFn(prettyPrint(expectedContent))

      traverseDOM(
        function (actualNode, expectedNode) {
          if (actualNode && expectedNode) {
            const actualDesc = nodeStr(actualNode)
            const expectedDesc = nodeStr(expectedNode)
            if (actualDesc !== expectedDesc) {
              expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc)
              return false
            }
            // Compare text for text nodes:
            if (actualNode.nodeType === 3) {
              const actualText = htmlTransform(actualNode.textContent)
              const expectedText = htmlTransform(expectedNode.textContent)
              expect(actualText, findableNodeDesc(actualNode)).eql(expectedText)
              if (actualText !== expectedText) {
                return false
              }
              // Compare attributes for element nodes:
            } else if (actualNode.nodeType === 1) {
              const actualNodeDesc = attributesForNode(actualNode)
              const expectedNodeDesc = attributesForNode(expectedNode)
              const desc =
                'node ' +
                nodeStr(actualNode) +
                ' attributes (' +
                actualNodeDesc +
                ') should match (' +
                expectedNodeDesc +
                ') '
              expect(actualNode.attributes.length, desc).eql(
                expectedNode.attributes.length
              )
              for (const { name } of Array.from(actualNode.attributes)) {
                const actualValue = actualNode.getAttribute(name)
                const expectedValue = expectedNode.getAttribute(name)
                // Actual value is the assertion subject so that a failure
                // reports "expected <actual> to deeply equal <expected>".
                expect(
                  actualValue,
                  'node (' +
                    findableNodeDesc(actualNode) +
                    ') attribute ' +
                    name +
                    ' should match'
                ).eql(expectedValue)
              }
            }
          } else {
            // One walk ended before the other: report which side is missing
            // and stop, since the walk cannot advance from a missing node.
            expect(
              nodeStr(actualNode),
              'Should have a node from both DOMs'
            ).eql(nodeStr(expectedNode))
            return false
          }
          return true
        },
        actualDOM,
        expectedDOM
      )
    })

    it('should extract expected title', function () {
      expect(result.title).eql(expectedMetadata.title)
    })

    it('should extract expected byline', function () {
      expect(result.byline).eql(expectedMetadata.byline)
    })

    it('should extract expected excerpt', function () {
      expect(result.excerpt).eql(expectedMetadata.excerpt)
    })

    it('should extract expected site name', function () {
      expect(result.siteName).eql(expectedMetadata.siteName)
    })

    // Only registered when the expected metadata specifies a direction.
    expectedMetadata.dir &&
      it('should extract expected direction', function () {
        expect(result.dir).eql(expectedMetadata.dir)
      })
  })
}

/**
 * Removes every comment node below `node`, descending into element children.
 * Iterates backwards so removals do not shift the indices still to be visited.
 *
 * @param {Node} node - root of the subtree to clean (modified in place)
 */
function removeCommentNodesRecursively(node) {
  for (let i = node.childNodes.length - 1; i >= 0; i--) {
    const child = node.childNodes[i]
    if (child.nodeType === child.COMMENT_NODE) {
      node.removeChild(child)
    } else if (child.nodeType === child.ELEMENT_NODE) {
      removeCommentNodesRecursively(child)
    }
  }
}

describe('Readability API', function () {
  describe('#constructor', function () {
    // NOTE(review): fixture markup appears to be missing — see note at top.
    const doc = new JSDOMParser().parse('\nyo\n')

    it('should accept a debug option', function () {
      expect(new Readability(doc)._debug).eql(false)
      expect(new Readability(doc, { debug: true })._debug).eql(true)
    })

    it('should accept a nbTopCandidates option', function () {
      expect(new Readability(doc)._nbTopCandidates).eql(5)
      expect(
        new Readability(doc, { nbTopCandidates: 42 })._nbTopCandidates
      ).eql(42)
    })

    it('should accept a maxElemsToParse option', function () {
      expect(new Readability(doc)._maxElemsToParse).eql(0)
      expect(
        new Readability(doc, { maxElemsToParse: 42 })._maxElemsToParse
      ).eql(42)
    })

    it('should accept a keepClasses option', function () {
      expect(new Readability(doc)._keepClasses).eql(false)
      expect(new Readability(doc, { keepClasses: true })._keepClasses).eql(true)
      expect(new Readability(doc, { keepClasses: false })._keepClasses).eql(
        false
      )
    })
  })

  describe('#parse', function () {
    const exampleSource = testPages[0].source

    it("shouldn't parse oversized documents as per configuration", async function () {
      // NOTE(review): the assertion expects 2 elements, but this fixture has
      // no tags — markup appears to be missing (see note at top).
      const doc = new JSDOMParser().parse('\nyo\n')
      await expect(
        new Readability(doc, { maxElemsToParse: 1 }).parse()
      ).to.be.rejectedWith('Aborting parsing document; 2 elements found')
    })

    it('should run _cleanElement with default configuration', async function () {
      const doc = parseHTML(exampleSource).document
      const parser = new Readability(doc)

      parser._cleanElement = sinon.fake()

      await parser.parse()

      expect(parser._cleanElement.called).eql(true)
    })

    it('should run _cleanElement when option keepClasses = false', async function () {
      const doc = parseHTML(exampleSource).document
      const parser = new Readability(doc, { keepClasses: false })

      parser._cleanElement = sinon.fake()

      await parser.parse()

      expect(parser._cleanElement.called).eql(true)
    })

    it("shouldn't run _cleanElement when option keepClasses = true", async function () {
      const doc = parseHTML(exampleSource).document
      const parser = new Readability(doc, { keepClasses: true })

      parser._cleanElement = sinon.fake()

      await parser.parse()

      expect(parser._cleanElement.called).eql(false)
    })

    // Skipped test. NOTE(review): fixture markup appears to be missing, and
    // this is the only test that reads `dom.window` rather than
    // `dom.document` — confirm against linkedom before re-enabling.
    xit('should use custom content serializer sent as option', async function () {
      const dom = parseHTML("My cat: ")
      const expected_xhtml = '\nMy cat:\n'
      const xml = new dom.window.XMLSerializer()
      // Await the parse result before reading `.content`; reading it off the
      // pending promise would always yield undefined.
      const content = (
        await new Readability(dom.window.document, {
          serializer: function (el) {
            return xml.serializeToString(el.firstChild)
          },
        }).parse()
      ).content
      expect(content).eql(expected_xhtml)
    })

    it('should not proxy image with data uri', async function () {
      // NOTE(review): fixture markup appears to be missing (no data URI here).
      const dom = parseHTML('My cat: Red dot')
      const expected_xhtml = '\nMy cat: Red dot\n'
      const content = (await new Readability(dom.document).parse()).content
      expect(content).eql(expected_xhtml)
    })

    it('should handle srcset elements with density descriptors', async function () {
      // NOTE(review): fixture markup appears to be missing (no srcset here).
      const dom = parseHTML('My image: ' + '')
      const expected_xhtml = '\nMy image: ' + '\n'
      const content = (
        await new Readability(dom.document, {
          createImageProxyUrl: function (url) {
            return url
          },
        }).parse()
      ).content
      expect(content).eql(expected_xhtml)
    })

    it('should remove srcset elements that are lazy loading placeholders', async function () {
      // NOTE(review): fixture markup appears to be missing (no srcset here).
      const dom = parseHTML('My image: ')
      const expected_xhtml = '\n' + 'My image: ' + '\n'
      const content = (
        await new Readability(dom.document, {
          createImageProxyUrl: function (url) {
            return url
          },
        }).parse()
      ).content
      expect(content).eql(expected_xhtml)
    })
  })
})

describe('Test pages', function () {
  before(function () {
    // mock the substack redirect to twitter
    nock('https://email.mg2.substack.com')
      .get(
        '/c/eJxNkcuO3SAMhp8m7E5EzC1ZsDjT0fQidVG1qtRVRIKToCYQAZmZ9OlLejaVEBhk-__5PJqMc4in3kPK5Nr6fO6oPb6lFXPGSI6EsXdWK8ZAcQXEam6bVrTEpX6KiJtxq87xQLIfw-pGk13wVwWojlNKFo1UMpgYWCrbtoNJmIYh60AMYKxopoewOaxDP6LGV4xn8EjGsG3o89WLrHrJeU8Vu1fwUlZ-c5e9uuSU2_cF1xX9gHHG-PW5vKRs8pFK0AjoWq6KdMNapVoJvGIvpdEz0ApkLsEdPrLz6dP5Pv76LNOfnx_gxxd6J04DBaACZDkphZrVVqnR0nZCaVs6GVFxus1Qp2MoeuPvyw2Jegl-HsJNheC-JX7DkuX8EN5vFrdQh8271xCxNvt-Ieuvbx7e5bNHb4YV7YNmfgzlH99-Ro-xDMv2JutGCiY6yRsGgj_gFURCyk6wRpHixoZS5fXmxsXgmv6n8xdPFKS3'
      )
      .reply(302, '', [
        'location',
        'https://twitter.com/ShellenbergerMD/status/1529847068138778624?s=20&t=A2G3yBHyxcYI6szVC2TJ0A',
      ])
    nock('https://twitter.com')
      .get('/ShellenbergerMD/status/1529847068138778624')
      .query({ s: '20', t: 'A2G3yBHyxcYI6szVC2TJ0A' })
      .reply(200)
  })

  after(function () {
    // Drop the interceptors registered above so they cannot affect other
    // suites running in the same process.
    nock.cleanAll()
  })

  testPages.forEach(function (testPage) {
    describe(testPage.dir, function () {
      const uri = 'http://fakehost/test/page.html'

      runTestsWithItems(
        'linkedom',
        function (source) {
          const doc = parseHTML(source).document
          removeCommentNodesRecursively(doc)
          return doc
        },
        testPage.source,
        testPage.expectedContent,
        testPage.expectedMetadata,
        uri
      )

      // runTestsWithItems("JSDOMParser", function(source) {
      //   var parser = new JSDOMParser();
      //   var doc = parser.parse(source, uri);
      //   if (parser.errorState) {
      //     console.error("Parsing this DOM caused errors:", parser.errorState);
      //     return null;
      //   }
      //   return doc;
      // }, testPage.source, testPage.expectedContent, testPage.expectedMetadata);
    })
  })
})