Bug 1323861, Bug 1322674, Bug 1217007 - Update readability from github repo, r=Gijs draft
authorEvan Tseng <evan@tseng.io>
Thu, 23 Feb 2017 16:35:06 +0800
changeset 488531 a18e1a9093c59b6f42b9d04839ffa81d50f16b7c
parent 488302 32dcdde1fc64fc39a9065dc4218265dbc727673f
child 546760 90e97ed74fa71ab99ac0885abb22a1ac0217727a
push id46562
push userbmo:evan@tseng.io
push dateThu, 23 Feb 2017 08:37:14 +0000
reviewersGijs
bugs1323861, 1322674, 1217007
milestone54.0a1
Bug 1323861, Bug 1322674, Bug 1217007 - Update readability from github repo, r=Gijs MozReview-Commit-ID: K0VcAPMaqBV
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -1012,56 +1012,16 @@
       while ((child = this.readNode())) {
         // Don't keep Comment nodes
         if (child.nodeType !== 8) {
           node.appendChild(child);
         }
       }
     },
 
-    readScript: function (node) {
-      while (this.currentChar < this.html.length) {
-        var c = this.nextChar();
-        var nextC = this.peekNext();
-        if (c === "<") {
-          if (nextC === "!" || nextC === "?") {
-            // We're still before the ! or ? that is starting this comment:
-            this.currentChar++;
-            node.appendChild(this.discardNextComment());
-            continue;
-          }
-          if (nextC === "/" && this.html.substr(this.currentChar, 8 /*"/script>".length */).toLowerCase() == "/script>") {
-            // Go back before the '<' so we find the end tag.
-            this.currentChar--;
-            // Done with this script tag, the caller will close:
-            return;
-          }
-        }
-        // Either c wasn't a '<' or it was but we couldn't find either a comment
-        // or a closing script tag, so we should just parse as text until the next one
-        // comes along:
-
-        var haveTextNode = node.lastChild && node.lastChild.nodeType === Node.TEXT_NODE;
-        var textNode = haveTextNode ? node.lastChild : new Text();
-        var n = this.html.indexOf("<", this.currentChar);
-        // Decrement this to include the current character *afterwards* so we don't get stuck
-        // looking for the same < all the time.
-        this.currentChar--;
-        if (n === -1) {
-          textNode.innerHTML += this.html.substring(this.currentChar, this.html.length);
-          this.currentChar = this.html.length;
-        } else {
-          textNode.innerHTML += this.html.substring(this.currentChar, n);
-          this.currentChar = n;
-        }
-        if (!haveTextNode)
-          node.appendChild(textNode);
-      }
-    },
-
     discardNextComment: function() {
       if (this.match("--")) {
         this.discardTo("-->");
       } else {
         var c = this.nextChar();
         while (c !== ">") {
           if (c === undefined)
             return null;
@@ -1126,21 +1086,17 @@
         return null;
 
       var node = this.retPair[0];
       var closed = this.retPair[1];
       var localName = node.localName;
 
       // If this isn't a void Element, read its child nodes
       if (!closed) {
-        if (localName == "script") {
-          this.readScript(node);
-        } else {
-          this.readChildren(node);
-        }
+        this.readChildren(node);
         var closingTag = "</" + localName + ">";
         if (!this.match(closingTag)) {
           this.error("expected '" + closingTag + "' and got " + this.html.substr(this.currentChar, closingTag.length));
           return null;
         }
       }
 
       // Only use the first title, because SVG might have other
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -471,16 +471,21 @@ Readability.prototype = {
    * iframes, forms, strip extraneous <p> tags, etc.
    *
    * @param Element
    * @return void
    **/
   _prepArticle: function(articleContent) {
     this._cleanStyles(articleContent);
 
+    // Check for data tables before we continue, to avoid removing items in
+    // those tables, which will often be isolated even though they're
+    // visually linked to other content-ful elements (text, images, etc.).
+    this._markDataTables(articleContent);
+
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._cleanConditionally(articleContent, "fieldset");
     this._clean(articleContent, "object");
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
     this._clean(articleContent, "footer");
 
@@ -718,21 +723,21 @@ Readability.prototype = {
               node.tagName !== "BODY" &&
               node.tagName !== "A") {
             this.log("Removing unlikely candidate - " + matchString);
             node = this._removeAndGetNext(node);
             continue;
           }
         }
 
-        // Remove empty DIV, SECTION, and HEADER nodes
+        // Remove DIV, SECTION, and HEADER nodes without any content(e.g. text, image, video, or iframe).
         if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
              node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
              node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
-            this._isEmptyElement(node)) {
+            this._isElementWithoutContent(node)) {
           node = this._removeAndGetNext(node);
           continue;
         }
 
         if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
           elementsToScore.push(node);
         }
 
@@ -1181,20 +1186,21 @@ Readability.prototype = {
 
     // And there should be no text nodes with real content
     return !this._someNode(element.childNodes, function(node) {
       return node.nodeType === Node.TEXT_NODE &&
              this.REGEXPS.hasContent.test(node.textContent);
     });
   },
 
-  _isEmptyElement: function(node) {
+  _isElementWithoutContent: function(node) {
     return node.nodeType === Node.ELEMENT_NODE &&
-      node.children.length == 0 &&
-      node.textContent.trim().length == 0;
+      node.textContent.trim().length == 0 &&
+      (node.children.length == 0 ||
+       node.children.length == node.getElementsByTagName("br").length + node.getElementsByTagName("hr").length);
   },
 
   /**
    * Determine whether element has any children block level elements.
    *
    * @param Element
    */
   _hasChildBlockElement: function (element) {
@@ -1710,34 +1716,122 @@ Readability.prototype = {
   },
 
   /**
    * Check if a given node has one of its ancestor tag name matching the
    * provided one.
    * @param  HTMLElement node
    * @param  String      tagName
    * @param  Number      maxDepth
+   * @param  Function    filterFn a filter to invoke to determine whether this node 'counts'
    * @return Boolean
    */
-  _hasAncestorTag: function(node, tagName, maxDepth) {
+  _hasAncestorTag: function(node, tagName, maxDepth, filterFn) {
     maxDepth = maxDepth || 3;
     tagName = tagName.toUpperCase();
     var depth = 0;
     while (node.parentNode) {
-      if (depth > maxDepth)
+      if (maxDepth > 0 && depth > maxDepth)
         return false;
-      if (node.parentNode.tagName === tagName)
+      if (node.parentNode.tagName === tagName && (!filterFn || filterFn(node.parentNode)))
         return true;
       node = node.parentNode;
       depth++;
     }
     return false;
   },
 
   /**
+   * Return an object indicating how many rows and columns this table has.
+   */
+  _getRowAndColumnCount: function(table) {
+    var rows = 0;
+    var columns = 0;
+    var trs = table.getElementsByTagName("tr");
+    for (var i = 0; i < trs.length; i++) {
+      var rowspan = trs[i].getAttribute("rowspan") || 0;
+      if (rowspan) {
+        rowspan = parseInt(rowspan, 10);
+      }
+      rows += (rowspan || 1);
+
+      // Now look for column-related info
+      var columnsInThisRow = 0;
+      var cells = trs[i].getElementsByTagName("td");
+      for (var j = 0; j < cells.length; j++) {
+        var colspan = cells[j].getAttribute("colspan") || 0;
+        if (colspan) {
+          colspan = parseInt(colspan, 10);
+        }
+        columnsInThisRow += (colspan || 1);
+      }
+      columns = Math.max(columns, columnsInThisRow);
+    }
+    return {rows: rows, columns: columns};
+  },
+
+  /**
+   * Look for 'data' (as opposed to 'layout') tables, for which we use
+   * similar checks as
+   * https://dxr.mozilla.org/mozilla-central/rev/71224049c0b52ab190564d3ea0eab089a159a4cf/accessible/html/HTMLTableAccessible.cpp#920
+   */
+  _markDataTables: function(root) {
+    var tables = root.getElementsByTagName("table");
+    for (var i = 0; i < tables.length; i++) {
+      var table = tables[i];
+      var role = table.getAttribute("role");
+      if (role == "presentation") {
+        table._readabilityDataTable = false;
+        continue;
+      }
+      var datatable = table.getAttribute("datatable");
+      if (datatable == "0") {
+        table._readabilityDataTable = false;
+        continue;
+      }
+      var summary = table.getAttribute("summary");
+      if (summary) {
+        table._readabilityDataTable = true;
+        continue;
+      }
+
+      var caption = table.getElementsByTagName("caption")[0];
+      if (caption && caption.childNodes.length > 0) {
+        table._readabilityDataTable = true;
+        continue;
+      }
+
+      // If the table has a descendant with any of these tags, consider a data table:
+      var dataTableDescendants = ["col", "colgroup", "tfoot", "thead", "th"];
+      var descendantExists = function(tag) {
+        return !!table.getElementsByTagName(tag)[0];
+      };
+      if (dataTableDescendants.some(descendantExists)) {
+        this.log("Data table because found data-y descendant");
+        table._readabilityDataTable = true;
+        continue;
+      }
+
+      // Nested tables indicate a layout table:
+      if (table.getElementsByTagName("table")[0]) {
+        table._readabilityDataTable = false;
+        continue;
+      }
+
+      var sizeInfo = this._getRowAndColumnCount(table);
+      if (sizeInfo.rows >= 10 || sizeInfo.columns > 4) {
+        table._readabilityDataTable = true;
+        continue;
+      }
+      // Now just go by size entirely:
+      table._readabilityDataTable = sizeInfo.rows * sizeInfo.columns > 10;
+    }
+  },
+
+  /**
    * Clean an element of all tags of type "tag" if they look fishy.
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
    *
    * @return void
    **/
   _cleanConditionally: function(e, tag) {
     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
       return;
@@ -1745,32 +1839,41 @@ Readability.prototype = {
     var isList = tag === "ul" || tag === "ol";
 
     // Gather counts for other typical elements embedded within.
     // Traverse backwards so we can remove nodes at the same time
     // without effecting the traversal.
     //
     // TODO: Consider taking into account original contentScore here.
     this._removeNodes(e.getElementsByTagName(tag), function(node) {
+      // First check if we're in a data table, in which case don't remove us.
+      var isDataTable = function(t) {
+        return t._readabilityDataTable;
+      };
+
+      if (this._hasAncestorTag(node, "table", -1, isDataTable)) {
+        return false;
+      }
+
       var weight = this._getClassWeight(node);
       var contentScore = 0;
 
       this.log("Cleaning Conditionally", node);
 
       if (weight + contentScore < 0) {
         return true;
       }
 
       if (this._getCharCount(node, ',') < 10) {
         // If there are not very many commas, and the number of
         // non-paragraph elements is more than paragraphs or other
         // ominous signs, remove the element.
         var p = node.getElementsByTagName("p").length;
         var img = node.getElementsByTagName("img").length;
-        var li = node.getElementsByTagName("li").length-100;
+        var li = node.getElementsByTagName("li").length - 100;
         var input = node.getElementsByTagName("input").length;
 
         var embedCount = 0;
         var embeds = node.getElementsByTagName("embed");
         for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
           if (!this.REGEXPS.videos.test(embeds[ei].src))
             embedCount += 1;
         }