Bug fix: only partial content is grabbed from many pages (e.g. dirty.ru, nytimes.com)

1) Avoid converting whitespace-only text nodes into paragraphs. They
create a lot of noise and actually prevent the sibling-joining logic
from working on many pages.

2) Handle the case where adjacent content is actually located in a
sibling of the top candidate's parent rather than in a sibling of the
top candidate itself.
pull/338/head
andrei-ch 8 years ago committed by Gijs
parent a58913d975
commit 486927ebd9

@ -727,7 +727,7 @@ Readability.prototype = {
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
if (childNode.nodeType === Node.TEXT_NODE) {
if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim() != "") {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
@ -898,6 +898,14 @@ Readability.prototype = {
lastScore = parentOfTopCandidate.readability.contentScore;
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
// If the top candidate is the only child, use parent instead. This will help sibling
// joining logic when adjacent content is actually located in parent's sibling node.
parentOfTopCandidate = topCandidate.parentNode;
while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
topCandidate = parentOfTopCandidate;
parentOfTopCandidate = topCandidate.parentNode;
}
}
// Now that we have the top candidate, look through its siblings for content

Loading…
Cancel
Save