--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -114,17 +114,17 @@ Readability.prototype = {
DEFAULT_MAX_PAGES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
+ unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|legends|menu|modal|related|remark|rss|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
@@ -466,16 +466,22 @@ Readability.prototype = {
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
+ // Clean out elements have "share" in their id/class combinations from final top candidates,
+ // which means we don't remove the top candidates even they have "share".
+ this._forEachNode(articleContent.children, function(topCandidate) {
+ this._cleanMatchedNodes(topCandidate, /share/);
+ });
+
// If there is only one h2, they are probably using it as a header
// and not a subheader, so remove it since we already have a header.
if (articleContent.getElementsByTagName('h2').length === 1)
this._clean(articleContent, "h2");
this._clean(articleContent, "iframe");
this._cleanHeaders(articleContent);
@@ -657,19 +663,16 @@ Readability.prototype = {
// We can't grab an article if we don't have a page!
if (!page) {
this.log("No body found in document. Abort.");
return null;
}
var pageCacheHtml = page.innerHTML;
- // Check if any "dir" is set on the toplevel document element
- this._articleDir = doc.documentElement.getAttribute("dir");
-
while (true) {
var stripUnlikelyCandidates = this._flagIsActive(this.FLAG_STRIP_UNLIKELYS);
// First, node prepping. Trash nodes that look cruddy (like ones with the
// class name "comment", etc), and turn divs into P tags where they have been
// used inappropriately (as in, where they contain no other block level elements.)
var elementsToScore = [];
var node = this._doc.documentElement;
@@ -807,16 +810,17 @@ Readability.prototype = {
topCandidates.pop();
break;
}
}
}
var topCandidate = topCandidates[0] || null;
var neededToCreateTopCandidate = false;
+ var parentOfTopCandidate;
// If we still have no top candidate, just use the body as a last resort.
// We also have to copy the body node so it is something we can modify.
if (topCandidate === null || topCandidate.tagName === "BODY") {
// Move all of the page's children into topCandidate
topCandidate = doc.createElement("DIV");
neededToCreateTopCandidate = true;
// Move everything (not just elements, also text nodes etc.) into the container
@@ -833,17 +837,17 @@ Readability.prototype = {
} else if (topCandidate) {
// Because of our bonus system, parents of candidates might have scores
// themselves. They get half of the node. There won't be nodes with higher
// scores than our topCandidate, but if we see the score going *up* in the first
// few steps up the tree, that's a decent sign that there might be more content
// lurking in other places that we want to unify in. The sibling stuff
// below does some of that - but only if we've looked high enough up the DOM
// tree.
- var parentOfTopCandidate = topCandidate.parentNode;
+ parentOfTopCandidate = topCandidate.parentNode;
var lastScore = topCandidate.readability.contentScore;
// The scores shouldn't get too low.
var scoreThreshold = lastScore / 3;
while (parentOfTopCandidate && parentOfTopCandidate.readability) {
var parentScore = parentOfTopCandidate.readability.contentScore;
if (parentScore < scoreThreshold)
break;
if (parentScore > lastScore) {
@@ -859,17 +863,19 @@ Readability.prototype = {
// Now that we have the top candidate, look through its siblings for content
// that might also be related. Things like preambles, content split by ads
// that we removed, etc.
var articleContent = doc.createElement("DIV");
if (isPaging)
articleContent.id = "readability-content";
var siblingScoreThreshold = Math.max(10, topCandidate.readability.contentScore * 0.2);
- var siblings = topCandidate.parentNode.children;
+ // Keep potential top candidate's parent node to try to get text direction of it later.
+ parentOfTopCandidate = topCandidate.parentNode;
+ var siblings = parentOfTopCandidate.children;
for (var s = 0, sl = siblings.length; s < sl; s++) {
var sibling = siblings[s];
var append = false;
this.log("Looking at sibling node:", sibling, sibling.readability ? ("with score " + sibling.readability.contentScore) : '');
this.log("Sibling has score", sibling.readability ? sibling.readability.contentScore : 'Unknown');
@@ -963,16 +969,28 @@ Readability.prototype = {
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
} else {
return null;
}
} else {
+ // Find out text direction from ancestors of final top candidate.
+ var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
+ this._someNode(ancestors, function(ancestor) {
+ if (!ancestor.tagName)
+ return false;
+ var articleDir = ancestor.getAttribute("dir");
+ if (articleDir) {
+ this._articleDir = articleDir;
+ return true;
+ }
+ return false;
+ });
return articleContent;
}
}
},
/**
* Check whether the input string could be a byline.
* This verifies that the input is a string, and that the length
@@ -1691,16 +1709,35 @@ Readability.prototype = {
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
}
return false;
});
},
/**
+ * Clean out elements whose id/class combinations match specific string.
+ *
+ * @param Element
+ * @param RegExp match id/class combination.
+ * @return void
+ **/
+ _cleanMatchedNodes: function(e, regex) {
+ var endOfSearchMarkerNode = this._getNextNode(e, true);
+ var next = this._getNextNode(e);
+ while (next && next != endOfSearchMarkerNode) {
+ if (regex.test(next.className + " " + next.id)) {
+ next = this._removeAndGetNext(next);
+ } else {
+ next = this._getNextNode(next);
+ }
+ }
+ },
+
+ /**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
* @param Element
* @return void
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {