--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -24,35 +24,40 @@
/*
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*/
/**
* Public constructor.
- * @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
-function Readability(uri, doc, options) {
+function Readability(doc, options) {
+ // In some older versions, people passed a URI as the first argument. Cope:
+ if (options && options.documentElement) {
+ doc = options;
+ options = arguments[2];
+ } else if (!doc || !doc.documentElement) {
+ throw new Error("First argument to Readability constructor should be a document object.");
+ }
options = options || {};
- this._uri = uri;
this._doc = doc;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
this._attempts = [];
// Configurable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
- this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
+ this._charThreshold = options.charThreshold || this.DEFAULT_CHAR_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
// Start with all flags set
this._flags = this.FLAG_STRIP_UNLIKELYS |
this.FLAG_WEIGHT_CLASSES |
this.FLAG_CLEAN_CONDITIONALLY;
var logEl;
@@ -88,28 +93,32 @@ function Readability(uri, doc, options)
}
}
Readability.prototype = {
FLAG_STRIP_UNLIKELYS: 0x1,
FLAG_WEIGHT_CLASSES: 0x2,
FLAG_CLEAN_CONDITIONALLY: 0x4,
+ // https://developer.mozilla.org/en-US/docs/Web/API/Node/nodeType
+ ELEMENT_NODE: 1,
+ TEXT_NODE: 3,
+
// Max number of nodes supported by this parser. Default: 0 (no limit)
DEFAULT_MAX_ELEMS_TO_PARSE: 0,
// The number of top candidates to consider when analysing how
// tight the competition is among candidates.
DEFAULT_N_TOP_CANDIDATES: 5,
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
- // The default number of words an article must have in order to return a result
- DEFAULT_WORD_THRESHOLD: 500,
+ // The default number of chars an article must have in order to return a result
+ DEFAULT_CHAR_THRESHOLD: 500,
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
@@ -127,18 +136,29 @@ Readability.prototype = {
DIV_TO_P_ELEMS: [ "A", "BLOCKQUOTE", "DL", "DIV", "IMG", "OL", "P", "PRE", "TABLE", "UL", "SELECT" ],
ALTER_TO_DIV_EXCEPTIONS: ["DIV", "ARTICLE", "SECTION", "P"],
PRESENTATIONAL_ATTRIBUTES: [ "align", "background", "bgcolor", "border", "cellpadding", "cellspacing", "frame", "hspace", "rules", "style", "valign", "vspace" ],
DEPRECATED_SIZE_ATTRIBUTE_ELEMS: [ "TABLE", "TH", "TD", "HR", "PRE" ],
+ // The commented out elements qualify as phrasing content but tend to be
+ // removed by readability when put into paragraphs, so we ignore them here.
+ PHRASING_ELEMS: [
+ // "CANVAS", "IFRAME", "SVG", "VIDEO",
+ "ABBR", "AUDIO", "B", "BDO", "BR", "BUTTON", "CITE", "CODE", "DATA",
+ "DATALIST", "DFN", "EM", "EMBED", "I", "IMG", "INPUT", "KBD", "LABEL",
+ "MARK", "MATH", "METER", "NOSCRIPT", "OBJECT", "OUTPUT", "PROGRESS", "Q",
+ "RUBY", "SAMP", "SCRIPT", "SELECT", "SMALL", "SPAN", "STRONG", "SUB",
+ "SUP", "TEXTAREA", "TIME", "VAR", "WBR"
+ ],
+
// These are the classes that readability sets itself.
- CLASSES_TO_PRESERVE: [ "readability-styled", "page" ],
+ CLASSES_TO_PRESERVE: [ "page" ],
/**
* Run any post-process modifications to article content as necessary.
*
* @param Element
* @return void
**/
_postProcessContent: function(articleContent) {
@@ -211,16 +231,31 @@ Readability.prototype = {
* @param Function fn The iterate function.
* @return Boolean
*/
_someNode: function(nodeList, fn) {
return Array.prototype.some.call(nodeList, fn, this);
},
/**
+ * Iterate over a NodeList, return true if every call of the provided
+ * iterate function returns a truthy value (also true for an empty list), false otherwise.
+ *
+ * For convenience, the current object context is applied to the
+ * provided iterate function.
+ *
+ * @param NodeList nodeList The NodeList.
+ * @param Function fn The iterate function.
+ * @return Boolean
+ */
+ _everyNode: function(nodeList, fn) {
+ return Array.prototype.every.call(nodeList, fn, this);
+ },
+
+ /**
* Concat all nodelists passed as arguments.
*
* @return ...NodeList
* @return Array
*/
_concatNodeLists: function() {
var slice = Array.prototype.slice;
var args = slice.call(arguments);
@@ -322,17 +357,17 @@ Readability.prototype = {
* @return void
**/
_getArticleTitle: function() {
var doc = this._doc;
var curTitle = "";
var origTitle = "";
try {
- curTitle = origTitle = doc.title;
+ curTitle = origTitle = doc.title.trim();
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
} catch (e) {/* ignore exceptions setting the title. */}
var titleHadHierarchicalSeparators = false;
function wordCount(str) {
@@ -350,18 +385,19 @@ Readability.prototype = {
curTitle = origTitle.replace(/[^\|\-\\\/>»]*[\|\-\\\/>»](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
var headings = this._concatNodeLists(
doc.getElementsByTagName('h1'),
doc.getElementsByTagName('h2')
);
+ var trimmedTitle = curTitle.trim();
var match = this._someNode(headings, function(heading) {
- return heading.textContent === curTitle;
+ return heading.textContent.trim() === trimmedTitle;
});
// If we don't, let's extract the title out of the original title string.
if (!match) {
curTitle = origTitle.substring(origTitle.lastIndexOf(':') + 1);
// If the title is now too short, try the first colon instead:
if (wordCount(curTitle) < 3) {
@@ -416,17 +452,17 @@ Readability.prototype = {
/**
* Finds the next element, starting from the given node, and ignoring
* whitespace in between. If the given node is an element, the same node is
* returned.
*/
_nextElement: function (node) {
var next = node;
while (next
- && (next.nodeType != Node.ELEMENT_NODE)
+ && (next.nodeType != this.ELEMENT_NODE)
&& this.REGEXPS.whitespace.test(next.textContent)) {
next = next.nextSibling;
}
return next;
},
/**
* Replaces 2 or more successive <br> elements with a single <p>.
@@ -459,26 +495,32 @@ Readability.prototype = {
if (replaced) {
var p = this._doc.createElement("p");
br.parentNode.replaceChild(p, br);
next = p.nextSibling;
while (next) {
// If we've hit another <br><br>, we're done adding children to this <p>.
if (next.tagName == "BR") {
- var nextElem = this._nextElement(next);
+ var nextElem = this._nextElement(next.nextSibling);
if (nextElem && nextElem.tagName == "BR")
break;
}
+ if (!this._isPhrasingContent(next)) break;
+
// Otherwise, make this node a child of the new <p>.
var sibling = next.nextSibling;
p.appendChild(next);
next = sibling;
}
+
+ while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+
+ if (p.parentNode.tagName === "P") this._setNodeTag(p.parentNode, "DIV");
}
});
},
_setNodeTag: function (node, tag) {
this.log("_setNodeTag", node, tag);
if (node.__JSDOMParser__) {
node.localName = tag.toLowerCase();
@@ -518,16 +560,17 @@ Readability.prototype = {
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
this._clean(articleContent, "link");
+ this._clean(articleContent, "aside");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
this._forEachNode(articleContent.children, function(topCandidate) {
this._cleanMatchedNodes(topCandidate, /share/);
});
// If there is only one h2 and its text content substantially equals article title,
@@ -574,16 +617,29 @@ Readability.prototype = {
return totalCount === 0 && !this._getInnerText(paragraph, false);
});
this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
var next = this._nextElement(br.nextSibling);
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
});
+
+ // Remove single-cell tables
+ this._forEachNode(this._getAllNodesWithTag(articleContent, ["table"]), function(table) {
+ var tbody = this._hasSingleTagInsideElement(table, "TBODY") ? table.firstElementChild : table;
+ if (this._hasSingleTagInsideElement(tbody, "TR")) {
+ var row = tbody.firstElementChild;
+ if (this._hasSingleTagInsideElement(row, "TD")) {
+ var cell = row.firstElementChild;
+ cell = this._setNodeTag(cell, this._everyNode(cell.childNodes, this._isPhrasingContent) ? "P" : "DIV");
+ table.parentNode.replaceChild(cell, table);
+ }
+ }
+ });
},
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*
* @param Element
* @return void
@@ -653,47 +709,16 @@ Readability.prototype = {
// (because this is depth-first traversal, we will have already
// seen the parent nodes themselves).
do {
node = node.parentNode;
} while (node && !node.nextElementSibling);
return node && node.nextElementSibling;
},
- /**
- * Like _getNextNode, but for DOM implementations with no
- * firstElementChild/nextElementSibling functionality...
- */
- _getNextNodeNoElementProperties: function(node, ignoreSelfAndKids) {
- function nextSiblingEl(n) {
- do {
- n = n.nextSibling;
- } while (n && n.nodeType !== n.ELEMENT_NODE);
- return n;
- }
- // First check for kids if those aren't being ignored
- if (!ignoreSelfAndKids && node.children[0]) {
- return node.children[0];
- }
- // Then for siblings...
- var next = nextSiblingEl(node);
- if (next) {
- return next;
- }
- // And finally, move up the parent chain *and* find a sibling
- // (because this is depth-first traversal, we will have already
- // seen the parent nodes themselves).
- do {
- node = node.parentNode;
- if (node)
- next = nextSiblingEl(node);
- } while (node && !next);
- return node && next;
- },
-
_checkByline: function(node, matchString) {
if (this._articleByline) {
return false;
}
if (node.getAttribute !== undefined) {
var rel = node.getAttribute("rel");
}
@@ -746,16 +771,22 @@ Readability.prototype = {
// class name "comment", etc), and turn divs into P tags where they have been
// used inappropriately (as in, where they contain no other block level elements.)
var elementsToScore = [];
var node = this._doc.documentElement;
while (node) {
var matchString = node.className + " " + node.id;
+ if (!this._isProbablyVisible(node)) {
+ this.log("Removing hidden node - " + matchString);
+ node = this._removeAndGetNext(node);
+ continue;
+ }
+
// Check to see if this node is a byline, and remove it if it is.
if (this._checkByline(node, matchString)) {
node = this._removeAndGetNext(node);
continue;
}
// Remove unlikely candidates
if (stripUnlikelyCandidates) {
@@ -779,39 +810,48 @@ Readability.prototype = {
}
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
}
// Turn all divs that don't have children block level elements into p's
if (node.tagName === "DIV") {
+ // Put phrasing content into paragraphs.
+ var p = null;
+ var childNode = node.firstChild;
+ while (childNode) {
+ var nextSibling = childNode.nextSibling;
+ if (this._isPhrasingContent(childNode)) {
+ if (p !== null) {
+ p.appendChild(childNode);
+ } else if (!this._isWhitespace(childNode)) {
+ p = doc.createElement('p');
+ node.replaceChild(p, childNode);
+ p.appendChild(childNode);
+ }
+ } else if (p !== null) {
+ while (p.lastChild && this._isWhitespace(p.lastChild)) p.removeChild(p.lastChild);
+ p = null;
+ }
+ childNode = nextSibling;
+ }
+
// Sites like http://mobile.slate.com encloses each paragraph with a DIV
// element. DIVs with only a P element inside and no text content can be
// safely converted into plain P elements to avoid confusing the scoring
// algorithm with DIVs with are, in practice, paragraphs.
- if (this._hasSinglePInsideElement(node)) {
+ if (this._hasSingleTagInsideElement(node, "P") && this._getLinkDensity(node) < 0.25) {
var newNode = node.children[0];
node.parentNode.replaceChild(newNode, node);
node = newNode;
elementsToScore.push(node);
} else if (!this._hasChildBlockElement(node)) {
node = this._setNodeTag(node, "P");
elementsToScore.push(node);
- } else {
- // EXPERIMENTAL
- this._forEachNode(node.childNodes, function(childNode) {
- if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
- var p = doc.createElement('p');
- p.textContent = childNode.textContent;
- p.style.display = 'inline';
- p.className = 'readability-styled';
- node.replaceChild(p, childNode);
- }
- });
}
}
node = this._getNextNode(node);
}
/**
* Loop through all paragraphs, and assign a score to them based on how content-y they look.
* Then add their score to their parent node.
@@ -841,17 +881,17 @@ Readability.prototype = {
// Add points for any commas within this paragraph.
contentScore += innerText.split(',').length;
// For every 100 characters in this paragraph, add another point. Up to 3 points.
contentScore += Math.min(Math.floor(innerText.length / 100), 3);
// Initialize and score ancestors.
this._forEachNode(ancestors, function(ancestor, level) {
- if (!ancestor.tagName)
+ if (!ancestor.tagName || !ancestor.parentNode || typeof(ancestor.parentNode.tagName) === 'undefined')
return;
if (typeof(ancestor.readability) === 'undefined') {
this._initializeNode(ancestor);
candidates.push(ancestor);
}
// Node score divider:
@@ -1080,17 +1120,17 @@ Readability.prototype = {
var parseSuccessful = true;
// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
var textLength = this._getInnerText(articleContent, true).length;
- if (textLength < this._wordThreshold) {
+ if (textLength < this._charThreshold) {
parseSuccessful = false;
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
@@ -1228,37 +1268,38 @@ Readability.prototype = {
scriptNode.nodeValue = "";
scriptNode.removeAttribute('src');
return true;
});
this._removeNodes(doc.getElementsByTagName('noscript'));
},
/**
- * Check if this node has only whitespace and a single P element
+ * Check if this node has only whitespace and a single child element with the given tag.
* Returns false if the DIV node contains non-empty text nodes
- * or if it contains no P or more than 1 element.
+ * or if it contains no element with the given tag or more than 1 element.
*
* @param Element
+ * @param string tag Tag name of the only child, matched exactly against tagName (e.g. "P", "TD")
**/
- _hasSinglePInsideElement: function(element) {
- // There should be exactly 1 element child which is a P:
- if (element.children.length != 1 || element.children[0].tagName !== "P") {
+ _hasSingleTagInsideElement: function(element, tag) {
+ // There must be exactly 1 element child, and it must have the given tag
+ if (element.children.length != 1 || element.children[0].tagName !== tag) {
return false;
}
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
- return node.nodeType === Node.TEXT_NODE &&
+ return node.nodeType === this.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
_isElementWithoutContent: function(node) {
- return node.nodeType === Node.ELEMENT_NODE &&
+ return node.nodeType === this.ELEMENT_NODE &&
node.textContent.trim().length == 0 &&
(node.children.length == 0 ||
node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
},
/**
* Determine whether element has any children block level elements.
*
@@ -1266,16 +1307,31 @@ Readability.prototype = {
*/
_hasChildBlockElement: function (element) {
return this._someNode(element.childNodes, function(node) {
return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
this._hasChildBlockElement(node);
});
},
+ /**
+ * Tell whether a node is phrasing content: a text node, a tag listed in PHRASING_ELEMS, or an A/DEL/INS whose child nodes are all phrasing content.
+ * https://developer.mozilla.org/en-US/docs/Web/Guide/HTML/Content_categories#Phrasing_content
+ */
+ _isPhrasingContent: function(node) {
+ if (node.nodeType === this.TEXT_NODE || this.PHRASING_ELEMS.indexOf(node.tagName) !== -1) return true;
+ var dependsOnChildren = node.tagName === "A" || node.tagName === "DEL" || node.tagName === "INS";
+ return dependsOnChildren && this._everyNode(node.childNodes, this._isPhrasingContent);
+ },
+
+ _isWhitespace: function(node) { // whitespace means a <br> element or a text node that is empty once trimmed
+ if (node.nodeType === this.ELEMENT_NODE) return node.tagName === "BR";
+ return node.nodeType === this.TEXT_NODE && node.textContent.trim().length === 0;
+ },
+
/**
* Get the inner text of a node - cross browser compatibly.
* This also strips out any excess whitespace to be found.
*
* @param Element
* @param Boolean normalizeSpaces (default: true)
* @return string
**/
@@ -1307,26 +1363,24 @@ Readability.prototype = {
*
* @param Element
* @return void
**/
_cleanStyles: function(e) {
if (!e || e.tagName.toLowerCase() === 'svg')
return;
- if (e.className !== 'readability-styled') {
- // Remove `style` and deprecated presentational attributes
- for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
- e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
- }
+ // Remove `style` and deprecated presentational attributes
+ for (var i = 0; i < this.PRESENTATIONAL_ATTRIBUTES.length; i++) {
+ e.removeAttribute(this.PRESENTATIONAL_ATTRIBUTES[i]);
+ }
- if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
- e.removeAttribute('width');
- e.removeAttribute('height');
- }
+ if (this.DEPRECATED_SIZE_ATTRIBUTE_ELEMS.indexOf(e.tagName) !== -1) {
+ e.removeAttribute('width');
+ e.removeAttribute('height');
}
var cur = e.firstElementChild;
while (cur !== null) {
this._cleanStyles(cur);
cur = cur.nextElementSibling;
}
},
@@ -1634,16 +1688,20 @@ Readability.prototype = {
_flagIsActive: function(flag) {
return (this._flags & flag) > 0;
},
_removeFlag: function(flag) {
this._flags = this._flags & ~flag;
},
+ _isProbablyVisible: function(node) { // true unless the node is hidden via inline display:none or carries the `hidden` attribute; must not rely on `this` (it is also invoked as a bare callback)
+ return (!node.style || node.style.display != "none") && !node.hasAttribute("hidden"); // a node without a style object cannot be hidden by inline display, so only `hidden` is checked instead of throwing
+ },
+
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
*/
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
@@ -1658,19 +1716,19 @@ Readability.prototype = {
if (brNodes.length) {
var set = new Set();
[].forEach.call(brNodes, function(node) {
set.add(node.parentNode);
});
nodes = [].concat.apply(Array.from(set), nodes);
}
- // FIXME we should have a fallback for helperIsVisible, but this is
- // problematic because of jsdom's elem.style handling - see
- // https://github.com/mozilla/readability/pull/186 for context.
+ if (!helperIsVisible) {
+ helperIsVisible = this._isProbablyVisible;
+ }
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return this._someNode(nodes, function(node) {
if (helperIsVisible && !helperIsVisible(node))
return false;
var matchString = node.className + " " + node.id;
@@ -1714,19 +1772,16 @@ Readability.prototype = {
// Avoid parsing too large documents, as per configuration option
if (this._maxElemsToParse > 0) {
var numTags = this._doc.getElementsByTagName("*").length;
if (numTags > this._maxElemsToParse) {
throw new Error("Aborting parsing document; " + numTags + " elements found");
}
}
- if (typeof this._doc.documentElement.firstElementChild === "undefined") {
- this._getNextNode = this._getNextNodeNoElementProperties;
- }
// Remove script tags from the document.
this._removeScripts(this._doc);
this._prepDocument();
var metadata = this._getArticleMetadata();
this._articleTitle = metadata.title;
@@ -1745,17 +1800,16 @@ Readability.prototype = {
var paragraphs = articleContent.getElementsByTagName("p");
if (paragraphs.length > 0) {
metadata.excerpt = paragraphs[0].textContent.trim();
}
}
var textContent = articleContent.textContent;
return {
- uri: this._uri,
title: this._articleTitle,
byline: metadata.byline || this._articleByline,
dir: this._articleDir,
content: articleContent.innerHTML,
textContent: textContent,
length: textContent.length,
excerpt: metadata.excerpt,
};