Update readability test for new _cleanElement name

This commit is contained in:
Jackson Harper
2024-08-29 10:59:53 +08:00
parent 6129606e54
commit 6b57ab7a84

View File

@ -1,353 +1,417 @@
var chai = require("chai"); var chai = require('chai')
var sinon = require("sinon"); var sinon = require('sinon')
var chaiAsPromised = require("chai-as-promised"); var chaiAsPromised = require('chai-as-promised')
const { parseHTML } = require("linkedom"); const { parseHTML } = require('linkedom')
const nock = require("nock"); const nock = require('nock')
chai.use(chaiAsPromised); chai.use(chaiAsPromised)
chai.config.includeStack = true; chai.config.includeStack = true
var expect = chai.expect; var expect = chai.expect
var Readability = require("../index").Readability; var Readability = require('../index').Readability
var JSDOMParser = require("../JSDOMParser"); var JSDOMParser = require('../JSDOMParser')
var prettyPrint = require("./utils").prettyPrint; var prettyPrint = require('./utils').prettyPrint
const isOmnivore = process.env.IS_OMNIVORE; const isOmnivore = process.env.IS_OMNIVORE
var testPages = require("./utils").getTestPages(isOmnivore); var testPages = require('./utils').getTestPages(isOmnivore)
function reformatError(err) { function reformatError(err) {
var formattedError = new Error(err.message); var formattedError = new Error(err.message)
formattedError.stack = err.stack; formattedError.stack = err.stack
return formattedError; return formattedError
} }
function inOrderTraverse(fromNode) { function inOrderTraverse(fromNode) {
if (fromNode.firstChild) { if (fromNode.firstChild) {
return fromNode.firstChild; return fromNode.firstChild
} }
while (fromNode && !fromNode.nextSibling) { while (fromNode && !fromNode.nextSibling) {
fromNode = fromNode.parentNode; fromNode = fromNode.parentNode
} }
return fromNode ? fromNode.nextSibling : null; return fromNode ? fromNode.nextSibling : null
} }
function inOrderIgnoreEmptyTextNodes(fromNode) { function inOrderIgnoreEmptyTextNodes(fromNode) {
do { do {
fromNode = inOrderTraverse(fromNode); fromNode = inOrderTraverse(fromNode)
} while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim()); } while (fromNode && fromNode.nodeType == 3 && !fromNode.textContent.trim())
return fromNode; return fromNode
} }
function traverseDOM(callback, expectedDOM, actualDOM) { function traverseDOM(callback, expectedDOM, actualDOM) {
var actualNode = actualDOM.documentElement || actualDOM.childNodes[0]; var actualNode = actualDOM.documentElement || actualDOM.childNodes[0]
var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]; var expectedNode = expectedDOM.documentElement || expectedDOM.childNodes[0]
while (actualNode || expectedNode) { while (actualNode || expectedNode) {
// We'll stop if we don't have both actualNode and expectedNode // We'll stop if we don't have both actualNode and expectedNode
if (!callback(actualNode, expectedNode)) { if (!callback(actualNode, expectedNode)) {
break; break
} }
actualNode = inOrderIgnoreEmptyTextNodes(actualNode); actualNode = inOrderIgnoreEmptyTextNodes(actualNode)
expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode); expectedNode = inOrderIgnoreEmptyTextNodes(expectedNode)
} }
} }
// Collapse subsequent whitespace like HTML: // Collapse subsequent whitespace like HTML:
function htmlTransform(str) { function htmlTransform(str) {
return str.replace(/\s+/g, " "); return str.replace(/\s+/g, ' ')
} }
function runTestsWithItems(label, domGenerationFn, source, expectedContent, expectedMetadata, uri) { function runTestsWithItems(
describe(label, function() { label,
this.timeout(30000); domGenerationFn,
source,
expectedContent,
expectedMetadata,
uri
) {
describe(label, function () {
this.timeout(30000)
var result; var result
before(async function() { before(async function () {
try { try {
var doc = domGenerationFn(source); var doc = domGenerationFn(source)
// Provide one class name to preserve, which we know appears in a few // Provide one class name to preserve, which we know appears in a few
// of the test documents. // of the test documents.
var myReader = new Readability(doc, { classesToPreserve: ["caption"], url: uri }); var myReader = new Readability(doc, {
result = await myReader.parse(); classesToPreserve: ['caption'],
url: uri,
})
result = await myReader.parse()
} catch (err) { } catch (err) {
throw reformatError(err); throw reformatError(err)
} }
}); })
it("should return a result object", function() { it('should return a result object', function () {
expect(result).to.include.keys("content", "title", "excerpt", "byline"); expect(result).to.include.keys('content', 'title', 'excerpt', 'byline')
}); })
it("should extract expected content", function() { it('should extract expected content', function () {
function nodeStr(n) { function nodeStr(n) {
if (!n) { if (!n) {
return "(no node)"; return '(no node)'
} }
if (n.nodeType == 3) { if (n.nodeType == 3) {
return "#text(" + htmlTransform(n.textContent) + ")"; return '#text(' + htmlTransform(n.textContent) + ')'
} }
if (n.nodeType != 1) { if (n.nodeType != 1) {
return "some other node type: " + n.nodeType + " with data " + n.data; return 'some other node type: ' + n.nodeType + ' with data ' + n.data
} }
var rv = n.localName; var rv = n.localName
if (n.id) { if (n.id) {
rv += "#" + n.id; rv += '#' + n.id
} }
if (n.className) { if (n.className) {
rv += ".(" + n.className + ")"; rv += '.(' + n.className + ')'
} }
return rv; return rv
} }
function genPath(node) { function genPath(node) {
if (node.id) { if (node.id) {
return "#" + node.id; return '#' + node.id
} }
if (node.tagName == "BODY") { if (node.tagName == 'BODY') {
return "body"; return 'body'
} }
var parent = node.parentNode; var parent = node.parentNode
var parentPath = genPath(parent); var parentPath = genPath(parent)
var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1; var index = Array.prototype.indexOf.call(parent.childNodes, node) + 1
return parentPath + " > " + nodeStr(node) + ":nth-child(" + index + ")"; return parentPath + ' > ' + nodeStr(node) + ':nth-child(' + index + ')'
} }
function findableNodeDesc(node) { function findableNodeDesc(node) {
return genPath(node) + "(in: ``" + node.parentNode.innerHTML + "``)"; return genPath(node) + '(in: ``' + node.parentNode.innerHTML + '``)'
} }
function attributesForNode(node) { function attributesForNode(node) {
return Array.from(node.attributes).map(function(attr) { return Array.from(node.attributes)
return attr.name + "=" + attr.value; .map(function (attr) {
}).join(","); return attr.name + '=' + attr.value
})
.join(',')
} }
var actualDOM = domGenerationFn(prettyPrint(result.content))
var actualDOM = domGenerationFn(prettyPrint(result.content)); var expectedDOM = domGenerationFn(prettyPrint(expectedContent))
var expectedDOM = domGenerationFn(prettyPrint(expectedContent)); traverseDOM(
traverseDOM(function(actualNode, expectedNode) { function (actualNode, expectedNode) {
if (actualNode && expectedNode) { if (actualNode && expectedNode) {
var actualDesc = nodeStr(actualNode); var actualDesc = nodeStr(actualNode)
var expectedDesc = nodeStr(expectedNode); var expectedDesc = nodeStr(expectedNode)
if (actualDesc != expectedDesc) { if (actualDesc != expectedDesc) {
expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc); expect(actualDesc, findableNodeDesc(actualNode)).eql(expectedDesc)
return false; return false
}
// Compare text for text nodes:
if (actualNode.nodeType == 3) {
var actualText = htmlTransform(actualNode.textContent);
var expectedText = htmlTransform(expectedNode.textContent);
expect(actualText, findableNodeDesc(actualNode)).eql(expectedText);
if (actualText != expectedText) {
return false;
} }
// Compare attributes for element nodes: // Compare text for text nodes:
} else if (actualNode.nodeType == 1) { if (actualNode.nodeType == 3) {
var actualNodeDesc = attributesForNode(actualNode); var actualText = htmlTransform(actualNode.textContent)
var expectedNodeDesc = attributesForNode(expectedNode); var expectedText = htmlTransform(expectedNode.textContent)
var desc = "node " + nodeStr(actualNode) + " attributes (" + actualNodeDesc + ") should match (" + expectedNodeDesc + ") "; expect(actualText, findableNodeDesc(actualNode)).eql(expectedText)
expect(actualNode.attributes.length, desc).eql(expectedNode.attributes.length); if (actualText != expectedText) {
for (var i = 0; i < actualNode.attributes.length; i++) { return false
var attr = actualNode.attributes[i].name; }
var actualValue = actualNode.getAttribute(attr); // Compare attributes for element nodes:
var expectedValue = expectedNode.getAttribute(attr); } else if (actualNode.nodeType == 1) {
expect(expectedValue, "node (" + findableNodeDesc(actualNode) + ") attribute " + attr + " should match").eql(actualValue); var actualNodeDesc = attributesForNode(actualNode)
var expectedNodeDesc = attributesForNode(expectedNode)
var desc =
'node ' +
nodeStr(actualNode) +
' attributes (' +
actualNodeDesc +
') should match (' +
expectedNodeDesc +
') '
expect(actualNode.attributes.length, desc).eql(
expectedNode.attributes.length
)
for (var i = 0; i < actualNode.attributes.length; i++) {
var attr = actualNode.attributes[i].name
var actualValue = actualNode.getAttribute(attr)
var expectedValue = expectedNode.getAttribute(attr)
expect(
expectedValue,
'node (' +
findableNodeDesc(actualNode) +
') attribute ' +
attr +
' should match'
).eql(actualValue)
}
} }
} else {
expect(
nodeStr(actualNode),
'Should have a node from both DOMs'
).eql(nodeStr(expectedNode))
return false
} }
} else { return true
expect(nodeStr(actualNode), "Should have a node from both DOMs").eql(nodeStr(expectedNode)); },
return false; actualDOM,
} expectedDOM
return true; )
}, actualDOM, expectedDOM); })
});
it("should extract expected title", function() { it('should extract expected title', function () {
expect(result.title).eql(expectedMetadata.title); expect(result.title).eql(expectedMetadata.title)
}); })
it("should extract expected byline", function() { it('should extract expected byline', function () {
expect(result.byline).eql(expectedMetadata.byline); expect(result.byline).eql(expectedMetadata.byline)
}); })
it("should extract expected excerpt", function() { it('should extract expected excerpt', function () {
expect(result.excerpt).eql(expectedMetadata.excerpt); expect(result.excerpt).eql(expectedMetadata.excerpt)
}); })
it("should extract expected site name", function() { it('should extract expected site name', function () {
expect(result.siteName).eql(expectedMetadata.siteName); expect(result.siteName).eql(expectedMetadata.siteName)
}); })
expectedMetadata.dir && it("should extract expected direction", function() { expectedMetadata.dir &&
expect(result.dir).eql(expectedMetadata.dir); it('should extract expected direction', function () {
}); expect(result.dir).eql(expectedMetadata.dir)
}); })
})
} }
function removeCommentNodesRecursively(node) { function removeCommentNodesRecursively(node) {
for (var i = node.childNodes.length - 1; i >= 0; i--) { for (var i = node.childNodes.length - 1; i >= 0; i--) {
var child = node.childNodes[i]; var child = node.childNodes[i]
if (child.nodeType === child.COMMENT_NODE) { if (child.nodeType === child.COMMENT_NODE) {
node.removeChild(child); node.removeChild(child)
} else if (child.nodeType === child.ELEMENT_NODE) { } else if (child.nodeType === child.ELEMENT_NODE) {
removeCommentNodesRecursively(child); removeCommentNodesRecursively(child)
} }
} }
} }
describe("Readability API", function() { describe('Readability API', function () {
describe("#constructor", function() { describe('#constructor', function () {
var doc = new JSDOMParser().parse("<html><div>yo</div></html>"); var doc = new JSDOMParser().parse('<html><div>yo</div></html>')
it("should accept a debug option", function() { it('should accept a debug option', function () {
expect(new Readability(doc)._debug).eql(false); expect(new Readability(doc)._debug).eql(false)
expect(new Readability(doc, {debug: true})._debug).eql(true); expect(new Readability(doc, { debug: true })._debug).eql(true)
}); })
it("should accept a nbTopCandidates option", function() { it('should accept a nbTopCandidates option', function () {
expect(new Readability(doc)._nbTopCandidates).eql(5); expect(new Readability(doc)._nbTopCandidates).eql(5)
expect(new Readability(doc, {nbTopCandidates: 42})._nbTopCandidates).eql(42); expect(
}); new Readability(doc, { nbTopCandidates: 42 })._nbTopCandidates
).eql(42)
})
it("should accept a maxElemsToParse option", function() { it('should accept a maxElemsToParse option', function () {
expect(new Readability(doc)._maxElemsToParse).eql(0); expect(new Readability(doc)._maxElemsToParse).eql(0)
expect(new Readability(doc, {maxElemsToParse: 42})._maxElemsToParse).eql(42); expect(
}); new Readability(doc, { maxElemsToParse: 42 })._maxElemsToParse
).eql(42)
})
it("should accept a keepClasses option", function() { it('should accept a keepClasses option', function () {
expect(new Readability(doc)._keepClasses).eql(false); expect(new Readability(doc)._keepClasses).eql(false)
expect(new Readability(doc, {keepClasses: true})._keepClasses).eql(true); expect(new Readability(doc, { keepClasses: true })._keepClasses).eql(true)
expect(new Readability(doc, {keepClasses: false})._keepClasses).eql(false); expect(new Readability(doc, { keepClasses: false })._keepClasses).eql(
}); false
}); )
})
})
describe("#parse", function() { describe('#parse', function () {
var exampleSource = testPages[0].source; var exampleSource = testPages[0].source
it("shouldn't parse oversized documents as per configuration", async function() { it("shouldn't parse oversized documents as per configuration", async function () {
var doc = new JSDOMParser().parse("<html><div>yo</div></html>"); var doc = new JSDOMParser().parse('<html><div>yo</div></html>')
await expect( await expect(
(new Readability(doc, { maxElemsToParse: 1 })).parse() new Readability(doc, { maxElemsToParse: 1 }).parse()
).to.be.rejectedWith("Aborting parsing document; 2 elements found"); ).to.be.rejectedWith('Aborting parsing document; 2 elements found')
}); })
it("should run _cleanClasses with default configuration", async function() { it('should run _cleanElement with default configuration', async function () {
var doc = parseHTML(exampleSource).document; var doc = parseHTML(exampleSource).document
var parser = new Readability(doc); var parser = new Readability(doc)
parser._cleanClasses = sinon.fake(); parser._cleanElement = sinon.fake()
await parser.parse(); await parser.parse()
expect(parser._cleanClasses.called).eql(true); expect(parser._cleanElement.called).eql(true)
}); })
it("should run _cleanClasses when option keepClasses = false", async function() { it('should run _cleanElement when option keepClasses = false', async function () {
var doc = parseHTML(exampleSource).document; var doc = parseHTML(exampleSource).document
var parser = new Readability(doc, { keepClasses: false }); var parser = new Readability(doc, { keepClasses: false })
parser._cleanClasses = sinon.fake(); parser._cleanElement = sinon.fake()
await parser.parse(); await parser.parse()
expect(parser._cleanClasses.called).eql(true); expect(parser._cleanElement.called).eql(true)
}); })
it("shouldn't run _cleanClasses when option keepClasses = true", async function() { it("shouldn't run _cleanElement when option keepClasses = true", async function () {
var doc = parseHTML(exampleSource).document; var doc = parseHTML(exampleSource).document
var parser = new Readability(doc, { keepClasses: true }); var parser = new Readability(doc, { keepClasses: true })
parser._cleanClasses = sinon.fake(); parser._cleanElement = sinon.fake()
await parser.parse(); await parser.parse()
expect(parser._cleanClasses.called).eql(false); expect(parser._cleanElement.called).eql(false)
}); })
xit("should use custom content serializer sent as option", async function() { xit('should use custom content serializer sent as option', async function () {
var dom = parseHTML("<html><body>My cat: <img src=''></body></html>"); var dom = parseHTML("<html><body>My cat: <img src=''></body></html>")
var expected_xhtml = "<div xmlns=\"http://www.w3.org/1999/xhtml\" id=\"readability-page-1\" class=\"page\">My cat: <img src=\"\" /></div>"; var expected_xhtml =
var xml = new dom.window.XMLSerializer(); '<div xmlns="http://www.w3.org/1999/xhtml" id="readability-page-1" class="page">My cat: <img src="" /></div>'
var content = await (new Readability(dom.window.document, { var xml = new dom.window.XMLSerializer()
serializer: function(el) { var content = await new Readability(dom.window.document, {
return xml.serializeToString(el.firstChild); serializer: function (el) {
} return xml.serializeToString(el.firstChild)
})).parse().content; },
expect(content).eql(expected_xhtml); }).parse().content
}); expect(content).eql(expected_xhtml)
})
it("should not proxy image with data uri", async function() { it('should not proxy image with data uri', async function () {
var dom = parseHTML("<html><body>My cat: <img src=\"data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA" + var dom = parseHTML(
"AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg==\"" + '<html><body>My cat: <img src="data:image/png;base64, iVBORw0KGgoAAAANSUhEUgAAAAUA' +
" alt=\"Red dot\" /></body></html>"); 'AAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0Y4OHwAAAABJRU5ErkJggg=="' +
var expected_xhtml = "<DIV class=\"page\" id=\"readability-page-1\">My cat: <img src=\"data:image/png;base64," + ' alt="Red dot" /></body></html>'
" iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0" + )
"Y4OHwAAAABJRU5ErkJggg==\" alt=\"Red dot\"></DIV>"; var expected_xhtml =
var content = (await (new Readability(dom.document)).parse()).content; '<DIV class="page" id="readability-page-1">My cat: <img src="data:image/png;base64,' +
expect(content).eql(expected_xhtml); ' iVBORw0KGgoAAAANSUhEUgAAAAUAAAAFCAYAAACNbyblAAAAHElEQVQI12P4//8/w38GIAXDIBKE0DHxgljNBAAO9TXL0' +
}); 'Y4OHwAAAABJRU5ErkJggg==" alt="Red dot"></DIV>'
var content = (await new Readability(dom.document).parse()).content
expect(content).eql(expected_xhtml)
})
it("should handle srcset elements with density descriptors", async function() { it('should handle srcset elements with density descriptors', async function () {
var dom = parseHTML('<html><body>My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' + var dom = parseHTML(
'srcset="https://webkit.org/demos/srcset/image-1x.png 1x, ' + '<html><body>My image: <img src="https://webkit.org/demos/srcset/image-src.png" ' +
'https://webkit.org/demos/srcset/image-2x.png 2x, ' + 'srcset="https://webkit.org/demos/srcset/image-1x.png 1x, ' +
'https://webkit.org/demos/srcset/image-3x.png 3x, ' + 'https://webkit.org/demos/srcset/image-2x.png 2x, ' +
'https://webkit.org/demos/srcset/image-4x.png 4x">' + 'https://webkit.org/demos/srcset/image-3x.png 3x, ' +
'</body></html>'); 'https://webkit.org/demos/srcset/image-4x.png 4x">' +
var expected_xhtml = '<DIV class="page" id="readability-page-1">My image: ' + '</body></html>'
)
var expected_xhtml =
'<DIV class="page" id="readability-page-1">My image: ' +
'<img data-omnivore-original-src="https://webkit.org/demos/srcset/image-src.png" ' + '<img data-omnivore-original-src="https://webkit.org/demos/srcset/image-src.png" ' +
'src="https://webkit.org/demos/srcset/image-src.png" ' + 'src="https://webkit.org/demos/srcset/image-src.png" ' +
'srcset="https://webkit.org/demos/srcset/image-1x.png 1x,' + 'srcset="https://webkit.org/demos/srcset/image-1x.png 1x,' +
'https://webkit.org/demos/srcset/image-2x.png 2x,' + 'https://webkit.org/demos/srcset/image-2x.png 2x,' +
'https://webkit.org/demos/srcset/image-3x.png 3x,' + 'https://webkit.org/demos/srcset/image-3x.png 3x,' +
'https://webkit.org/demos/srcset/image-4x.png 4x,"></DIV>'; 'https://webkit.org/demos/srcset/image-4x.png 4x,"></DIV>'
var content = (await (new Readability(dom.document, { var content = (
createImageProxyUrl: function(url) { await new Readability(dom.document, {
return url; createImageProxyUrl: function (url) {
} return url
})).parse()).content; },
expect(content).eql(expected_xhtml); }).parse()
}); ).content
expect(content).eql(expected_xhtml)
})
it("should remove srcset elements that are lazy loading placeholders", async function() { it('should remove srcset elements that are lazy loading placeholders', async function () {
var dom = parseHTML('<html><body>My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&#038;ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1" srcset="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"></body></html>'); var dom = parseHTML(
var expected_xhtml = '<DIV class="page" id="readability-page-1">' + '<html><body>My image: <img class="shrinkToFit jetpack-lazy-image" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&#038;ssl=1" alt width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&amp;is-pending-load=1#038;ssl=1" srcset="data:image/gif;base64,R0lGODlhAQABAIAAAAAAAP///yH5BAEAAAAALAAAAAABAAEAAAIBRAA7"></body></html>'
)
var expected_xhtml =
'<DIV class="page" id="readability-page-1">' +
'My image: <img data-omnivore-original-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1">' + 'My image: <img data-omnivore-original-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1" alt="" width="900" height="380" data-recalc-dims="1" data-lazy-src="https://i0.wp.com/cdn-images-1.medium.com/max/2000/1*rPXwIczUJRCE54v8FfAHGw.jpeg?resize=900%2C380&is-pending-load=1#038;ssl=1">' +
'</DIV>'; '</DIV>'
var content = (await (new Readability(dom.document, { var content = (
createImageProxyUrl: function(url) { await new Readability(dom.document, {
return url; createImageProxyUrl: function (url) {
} return url
})).parse()).content; },
expect(content).eql(expected_xhtml); }).parse()
}); ).content
}); expect(content).eql(expected_xhtml)
}); })
})
})
describe("Test pages", function() { describe('Test pages', function () {
before(function() { before(function () {
// mock the substack redirect to twitter // mock the substack redirect to twitter
nock('https://email.mg2.substack.com') nock('https://email.mg2.substack.com')
.get('/c/eJxNkcuO3SAMhp8m7E5EzC1ZsDjT0fQidVG1qtRVRIKToCYQAZmZ9OlLejaVEBhk-__5PJqMc4in3kPK5Nr6fO6oPb6lFXPGSI6EsXdWK8ZAcQXEam6bVrTEpX6KiJtxq87xQLIfw-pGk13wVwWojlNKFo1UMpgYWCrbtoNJmIYh60AMYKxopoewOaxDP6LGV4xn8EjGsG3o89WLrHrJeU8Vu1fwUlZ-c5e9uuSU2_cF1xX9gHHG-PW5vKRs8pFK0AjoWq6KdMNapVoJvGIvpdEz0ApkLsEdPrLz6dP5Pv76LNOfnx_gxxd6J04DBaACZDkphZrVVqnR0nZCaVs6GVFxus1Qp2MoeuPvyw2Jegl-HsJNheC-JX7DkuX8EN5vFrdQh8271xCxNvt-Ieuvbx7e5bNHb4YV7YNmfgzlH99-Ro-xDMv2JutGCiY6yRsGgj_gFURCyk6wRpHixoZS5fXmxsXgmv6n8xdPFKS3') .get(
'/c/eJxNkcuO3SAMhp8m7E5EzC1ZsDjT0fQidVG1qtRVRIKToCYQAZmZ9OlLejaVEBhk-__5PJqMc4in3kPK5Nr6fO6oPb6lFXPGSI6EsXdWK8ZAcQXEam6bVrTEpX6KiJtxq87xQLIfw-pGk13wVwWojlNKFo1UMpgYWCrbtoNJmIYh60AMYKxopoewOaxDP6LGV4xn8EjGsG3o89WLrHrJeU8Vu1fwUlZ-c5e9uuSU2_cF1xX9gHHG-PW5vKRs8pFK0AjoWq6KdMNapVoJvGIvpdEz0ApkLsEdPrLz6dP5Pv76LNOfnx_gxxd6J04DBaACZDkphZrVVqnR0nZCaVs6GVFxus1Qp2MoeuPvyw2Jegl-HsJNheC-JX7DkuX8EN5vFrdQh8271xCxNvt-Ieuvbx7e5bNHb4YV7YNmfgzlH99-Ro-xDMv2JutGCiY6yRsGgj_gFURCyk6wRpHixoZS5fXmxsXgmv6n8xdPFKS3'
)
.reply(302, '', [ .reply(302, '', [
'location', 'location',
'https://twitter.com/ShellenbergerMD/status/1529847068138778624?s=20&t=A2G3yBHyxcYI6szVC2TJ0A' 'https://twitter.com/ShellenbergerMD/status/1529847068138778624?s=20&t=A2G3yBHyxcYI6szVC2TJ0A',
]); ])
nock('https://twitter.com') nock('https://twitter.com')
.get('/ShellenbergerMD/status/1529847068138778624') .get('/ShellenbergerMD/status/1529847068138778624')
.query({"s":"20","t":"A2G3yBHyxcYI6szVC2TJ0A"}) .query({ s: '20', t: 'A2G3yBHyxcYI6szVC2TJ0A' })
.reply(200); .reply(200)
}); })
testPages.forEach(function(testPage) { testPages.forEach(function (testPage) {
describe(testPage.dir, function() { describe(testPage.dir, function () {
var uri = "http://fakehost/test/page.html"; var uri = 'http://fakehost/test/page.html'
runTestsWithItems("linkedom", function(source) { runTestsWithItems(
var doc = parseHTML(source).document; 'linkedom',
removeCommentNodesRecursively(doc); function (source) {
return doc; var doc = parseHTML(source).document
}, testPage.source, testPage.expectedContent, testPage.expectedMetadata, uri); removeCommentNodesRecursively(doc)
return doc
},
testPage.source,
testPage.expectedContent,
testPage.expectedMetadata,
uri
)
// runTestsWithItems("JSDOMParser", function(source) { // runTestsWithItems("JSDOMParser", function(source) {
// var parser = new JSDOMParser(); // var parser = new JSDOMParser();
@ -358,6 +422,6 @@ describe("Test pages", function() {
// } // }
// return doc; // return doc;
// }, testPage.source, testPage.expectedContent, testPage.expectedMetadata); // }, testPage.source, testPage.expectedContent, testPage.expectedMetadata);
}); })
}); })
}); })