--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -34,16 +34,17 @@
* @param {Object} options The options object.
*/
function Readability(uri, doc, options) {
options = options || {};
this._uri = uri;
this._doc = doc;
this._biggestFrame = false;
+ this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
// Configureable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
@@ -114,17 +115,17 @@ Readability.prototype = {
DEFAULT_MAX_PAGES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
- unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+ unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
@@ -484,20 +485,28 @@ Readability.prototype = {
this._clean(articleContent, "footer");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
this._forEachNode(articleContent.children, function(topCandidate) {
this._cleanMatchedNodes(topCandidate, /share/);
});
- // If there is only one h2, they are probably using it as a header
- // and not a subheader, so remove it since we already have a header.
- if (articleContent.getElementsByTagName('h2').length === 1)
- this._clean(articleContent, "h2");
+ // If there is only one h2 and its text nearly matches the article title (lengths differ
+ // by less than 50% and one string contains the other), it is probably being used as a
+ // header and not a subheader, so remove it since we already extract the title separately.
+ var h2 = articleContent.getElementsByTagName('h2');
+ if (h2.length === 1) {
+ var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
+ if (Math.abs(lengthSimilarRate) < 0.5 &&
+ (lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
+ this._articleTitle.includes(h2[0].textContent))) {
+ this._clean(articleContent, "h2");
+ }
+ }
this._clean(articleContent, "iframe");
this._clean(articleContent, "input");
this._clean(articleContent, "textarea");
this._clean(articleContent, "select");
this._clean(articleContent, "button");
this._cleanHeaders(articleContent);
@@ -709,16 +718,25 @@ Readability.prototype = {
node.tagName !== "BODY" &&
node.tagName !== "A") {
this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
}
+ // Remove DIV, SECTION, HEADER and H1-H6 nodes that have no content
+ if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
+ node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
+ node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
+ this._isEmptyElement(node)) {
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
}
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
@@ -729,17 +747,17 @@ Readability.prototype = {
node.parentNode.replaceChild(newNode, node);
node = newNode;
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
} else {
// EXPERIMENTAL
this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE) {
+ if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
var p = doc.createElement('p');
p.textContent = childNode.textContent;
p.style.display = 'inline';
p.className = 'readability-styled';
node.replaceChild(p, childNode);
}
});
}
@@ -900,16 +918,27 @@ Readability.prototype = {
if (parentScore > lastScore) {
// Alright! We found a better parent to use.
topCandidate = parentOfTopCandidate;
break;
}
lastScore = parentOfTopCandidate.readability.contentScore;
parentOfTopCandidate = parentOfTopCandidate.parentNode;
}
+
+ // If the top candidate is the only child, use parent instead. This will help sibling
+ // joining logic when adjacent content is actually located in parent's sibling node.
+ parentOfTopCandidate = topCandidate.parentNode;
+ while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
+ topCandidate = parentOfTopCandidate;
+ parentOfTopCandidate = topCandidate.parentNode;
+ }
+ if (!topCandidate.readability) {
+ this._initializeNode(topCandidate);
+ }
}
// Now that we have the top candidate, look through its siblings for content
// that might also be related. Things like preambles, content split by ads
// that we removed, etc.
var articleContent = doc.createElement("DIV");
if (isPaging)
articleContent.id = "readability-content";
@@ -1152,16 +1181,22 @@ Readability.prototype = {
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
return node.nodeType === Node.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
+ _isEmptyElement: function(node) {
+ return node.nodeType === Node.ELEMENT_NODE &&
+ node.children.length == 0 &&
+ node.textContent.trim().length == 0;
+ },
+
/**
* Determine whether element has any children block level elements.
*
* @param Element
*/
_hasChildBlockElement: function (element) {
return this._someNode(element.childNodes, function(node) {
return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
@@ -1739,17 +1774,17 @@ Readability.prototype = {
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}
var linkDensity = this._getLinkDensity(node);
var contentLength = this._getInnerText(node).length;
var haveToRemove =
- (img > 1 && img > p && !this._hasAncestorTag(node, "figure")) ||
+ (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
(!isList && li > p) ||
(input > Math.floor(p/3)) ||
(!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
(!isList && weight < 25 && linkDensity > 0.2) ||
(weight >= 25 && linkDensity > 0.5) ||
((embedCount === 1 && contentLength < 75) || embedCount > 1);
return haveToRemove;
}
@@ -1896,17 +1931,17 @@ Readability.prototype = {
// this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
// Pull out any possible next page link first.
// var nextPageLink = this._findNextPageLink(doc.body);
this._prepDocument();
var metadata = this._getArticleMetadata();
- var articleTitle = metadata.title;
+ this._articleTitle = metadata.title;
var articleContent = this._grabArticle();
if (!articleContent)
return null;
this.log("Grabbed: " + articleContent.innerHTML);
this._postProcessContent(articleContent);
@@ -1927,17 +1962,17 @@ Readability.prototype = {
if (paragraphs.length > 0) {
metadata.excerpt = paragraphs[0].textContent.trim();
}
}
var textContent = articleContent.textContent;
return {
uri: this._uri,
- title: articleTitle,
+ title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
};
}