--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -1012,56 +1012,16 @@
while ((child = this.readNode())) {
// Don't keep Comment nodes
if (child.nodeType !== 8) {
node.appendChild(child);
}
}
},
- readScript: function (node) {
- while (this.currentChar < this.html.length) {
- var c = this.nextChar();
- var nextC = this.peekNext();
- if (c === "<") {
- if (nextC === "!" || nextC === "?") {
- // We're still before the ! or ? that is starting this comment:
- this.currentChar++;
- node.appendChild(this.discardNextComment());
- continue;
- }
- if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
- // Go back before the '<' so we find the end tag.
- this.currentChar--;
- // Done with this script tag, the caller will close:
- return;
- }
- }
- // Either c wasn't a '<' or it was but we couldn't find either a comment
- // or a closing script tag, so we should just parse as text until the next one
- // comes along:
-
- var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
- var textNode = haveTextNode ? node.lastChild : new Text();
- var n = this.html.indexOf("<", this.currentChar);
- // Decrement this to include the current character *afterwards* so we don't get stuck
- // looking for the same < all the time.
- this.currentChar--;
- if (n === -1) {
- textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
- this.currentChar = this.html.length;
- } else {
- textNode.innerHTML += this.html.substring(this.currentChar, n);
- this.currentChar = n;
- }
- if (!haveTextNode)
- node.appendChild(textNode);
- }
- },
-
discardNextComment: function() {
if (this.match("--")) {
this.discardTo("-->");
} else {
var c = this.nextChar();
while (c !== ">") {
if (c === undefined)
return null;
@@ -1126,21 +1086,17 @@
return null;
var node = this.retPair[0];
var closed = this.retPair[1];
var localName = node.localName;
// If this isn't a void Element, read its child nodes
if (!closed) {
- if (localName == "script") {
- this.readScript(node);
- } else {
- this.readChildren(node);
- }
+ this.readChildren(node);
var closingTag = "</" + localName + ">";
if (!this.match(closingTag)) {
this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
return null;
}
}
// Only use the first title, because SVG might have other
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -471,16 +471,21 @@ Readability.prototype = {
* iframes, forms, strip extraneous <p> tags, etc.
*
* @param Element
* @return void
**/
_prepArticle: function(articleContent) {
this._cleanStyles(articleContent);
+ // Check for data tables before we continue, to avoid removing items in
+ // those tables, which will often be isolated even though they're
+ // visually linked to other content-ful elements (text, images, etc.).
+ this._markDataTables(articleContent);
+
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
@@ -718,21 +723,21 @@ Readability.prototype = {
node.tagName !== "BODY" &&
node.tagName !== "A") {
this.log("Removing unlikely candidate - " + matchString);
node = this._removeAndGetNext(node);
continue;
}
}
- // Remove empty DIV, SECTION, and HEADER nodes
+ // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
- this._isEmptyElement(node)) {
+ this._isElementWithoutContent(node)) {
node = this._removeAndGetNext(node);
continue;
}
if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
elementsToScore.push(node);
}
@@ -1181,20 +1186,21 @@ Readability.prototype = {
// And there should be no text nodes with real content
return !this._someNode(element.childNodes, function(node) {
return node.nodeType === Node.TEXT_NODE &&
this.REGEXPS.hasContent.test(node.textContent);
});
},
- _isEmptyElement: function(node) {
+ _isElementWithoutContent: function(node) {
return node.nodeType === Node.ELEMENT_NODE &&
- node.children.length == 0 &&
- node.textContent.trim().length == 0;
+ node.textContent.trim().length == 0 &&
+ (node.children.length == 0 ||
+ node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
},
/**
* Determine whether element has any children block level elements.
*
* @param Element
*/
_hasChildBlockElement: function (element) {
@@ -1710,34 +1716,122 @@ Readability.prototype = {
},
/**
* Check if a given node has one of its ancestor tag name matching the
* provided one.
* @param HTMLElement node
* @param String tagName
* @param Number maxDepth
+ * @param Function filterFn a filter to invoke to determine whether this node 'counts'
* @return Boolean
*/
- _hasAncestorTag: function(node, tagName, maxDepth) {
+ _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
maxDepth = maxDepth || 3;
tagName = tagName.toUpperCase();
var depth = 0;
while (node.parentNode) {
- if (depth > maxDepth)
+ if (maxDepth > 0 && depth > maxDepth)
return false;
- if (node.parentNode.tagName === tagName)
+ if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
return true;
node = node.parentNode;
depth++;
}
return false;
},
/**
+ * Return an object indicating how many rows and columns this table has.
+ */
+ _getRowAndColumnCount: function(table) {
+ var rows = 0;
+ var columns = 0;
+ var trs = table.getElementsByTagName("tr");
+ for (var i = 0; i < trs.length; i++) {
+ var rowspan = trs[i].getAttribute("rowspan") || 0;
+ if (rowspan) {
+ rowspan = parseInt(rowspan, 10);
+ }
+ rows += (rowspan || 1);
+
+ // Now look for column-related info
+ var columnsInThisRow = 0;
+ var cells = trs[i].getElementsByTagName("td");
+ for (var j = 0; j < cells.length; j++) {
+ var colspan = cells[j].getAttribute("colspan") || 0;
+ if (colspan) {
+ colspan = parseInt(colspan, 10);
+ }
+ columnsInThisRow += (colspan || 1);
+ }
+ columns = Math.max(columns, columnsInThisRow);
+ }
+ return {rows: rows, columns: columns};
+ },
+
+ /**
+ * Look for 'data' (as opposed to 'layout') tables, for which we use
+ * similar checks as
+ * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+ */
+ _markDataTables: function(root) {
+ var tables = root.getElementsByTagName("table");
+ for (var i = 0; i < tables.length; i++) {
+ var table = tables[i];
+ var role = table.getAttribute("role");
+ if (role == "presentation") {
+ table._readabilityDataTable = false;
+ continue;
+ }
+ var datatable = table.getAttribute("datatable");
+ if (datatable == "0") {
+ table._readabilityDataTable = false;
+ continue;
+ }
+ var summary = table.getAttribute("summary");
+ if (summary) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ var caption = table.getElementsByTagName("caption")[0];
+ if (caption && caption.childNodes.length > 0) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ // If the table has a descendant with any of these tags, consider a data table:
+ var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
+ var descendantExists = function(tag) {
+ return !!table.getElementsByTagName(tag)[0];
+ };
+ if (dataTableDescendants.some(descendantExists)) {
+ this.log("Data table because found data-y descendant");
+ table._readabilityDataTable = true;
+ continue;
+ }
+
+ // Nested tables indicate a layout table:
+ if (table.getElementsByTagName("table")[0]) {
+ table._readabilityDataTable = false;
+ continue;
+ }
+
+ var sizeInfo = this._getRowAndColumnCount(table);
+ if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
+ table._readabilityDataTable = true;
+ continue;
+ }
+ // Now just go by size entirely:
+ table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
+ }
+ },
+
+ /**
* Clean an element of all tags of type "tag" if they look fishy.
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
*
* @return void
**/
_cleanConditionally: function(e, tag) {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return;
@@ -1745,32 +1839,41 @@ Readability.prototype = {
var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within.
// Traverse backwards so we can remove nodes at the same time
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
this._removeNodes(e.getElementsByTagName(tag), function(node) {
+ // First check if we're in a data table, in which case don't remove us.
+ var isDataTable = function(t) {
+ return t._readabilityDataTable;
+ };
+
+ if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
+ return false;
+ }
+
var weight = this._getClassWeight(node);
var contentScore = 0;
this.log("Cleaning Conditionally", node);
if (weight + contentScore < 0) {
return true;
}
if (this._getCharCount(node, ',') < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
var p = node.getElementsByTagName("p").length;
var img = node.getElementsByTagName("img").length;
- var li = node.getElementsByTagName("li").length-100;
+ var li = node.getElementsByTagName("li").length - 100;
var input = node.getElementsByTagName("input").length;
var embedCount = 0;
var embeds = node.getElementsByTagName("embed");
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}