author Evan Tseng <evan@tseng.io>

Tue, 01 Nov 2016 18:54:06 +0800

changeset 432530 b61753cb1abfadf8947809abc4b92b148c77d4fc

parent 432230 3e73fd638e687a4d7f46613586e5156b8e2af846

child 535689 565910436cae26439c62023b6638fb33aa73bc6c

push id 34354

push user bmo:evan@tseng.io

push date Wed, 02 Nov 2016 09:35:57 +0000

reviewers Gijs

bugs 1177619

milestone 52.0a1

toolkit/components/reader/.eslintrc.js file | annotate | diff | comparison | revisions

toolkit/components/reader/JSDOMParser.js file | annotate | diff | comparison | revisions

toolkit/components/reader/Readability.js file | annotate | diff | comparison | revisions
new file mode 100644
--- /dev/null
+++ b/toolkit/components/reader/.eslintrc.js
@@ -0,0 +1,199 @@
+"use strict";
+
+module.exports = {
+  "rules": {
+    // Braces only needed for multi-line arrow function blocks
+    // "arrow-body-style": [2, "as-needed"],
+
+    // Require spacing around =>
+    // "arrow-spacing": 2,
+
+    // Always require spacing around a single line block
+    // "block-spacing": 1,
+
+    // No newline before open brace for a block
+    "brace-style": 2,
+
+    // No space before, always a space after a comma
+    "comma-spacing": [2, {"before": false, "after": true}],
+
+    // Commas at the end of the line not the start
+    // "comma-style": 2,
+
+    // Don't require spaces around computed properties
+    // "computed-property-spacing": [2, "never"],
+
+    // Functions must always return something or nothing
+    "consistent-return": 2,
+
+    // Require braces around blocks that start a new line
+    // Note that this rule is likely to be overridden on a per-directory basis
+    // very frequently.
+    // "curly": [2, "multi-line"],
+
+    // Always require a trailing EOL
+    "eol-last": 2,
+
+    // Require function* name()
+    // "generator-star-spacing": [2, {"before": false, "after": true}],
+
+    // Two space indent
+    "indent": [2, 2, { "SwitchCase": 1 }],
+
+    // Space after colon not before in property declarations
+    "key-spacing": [2, { "beforeColon": false, "afterColon": true, "mode": "minimum" }],
+
+    // Unix linebreaks
+    "linebreak-style": [2, "unix"],
+
+    // Always require parenthesis for new calls
+    "new-parens": 2,
+
+    // Use [] instead of Array()
+    // "no-array-constructor": 2,
+
+    // No duplicate arguments in function declarations
+    "no-dupe-args": 2,
+
+    // No duplicate keys in object declarations
+    "no-dupe-keys": 2,
+
+    // No duplicate cases in switch statements
+    "no-duplicate-case": 2,
+
+    // No labels
+    "no-labels": 2,
+
+    // If an if block ends with a return no need for an else block
+    "no-else-return": 2,
+
+    // No empty statements
+    "no-empty": 2,
+
+    // No empty character classes in regex
+    "no-empty-character-class": 2,
+
+    // Disallow empty destructuring
+    "no-empty-pattern": 2,
+
+    // No assigning to exception variable
+    // "no-ex-assign": 2,
+
+    // No using !! where casting to boolean is already happening
+    // "no-extra-boolean-cast": 2,
+
+    // No double semicolon
+    "no-extra-semi": 2,
+
+    // No overwriting defined functions
+    "no-func-assign": 2,
+
+    // Declarations in Program or Function Body
+    "no-inner-declarations": 2,
+
+    // No invalid regular expressions
+    "no-invalid-regexp": 2,
+
+    // No odd whitespace characters
+    "no-irregular-whitespace": 2,
+
+    // No single if block inside an else block
+    "no-lonely-if": 2,
+
+    // No mixing spaces and tabs in indent
+    "no-mixed-spaces-and-tabs": [2, "smart-tabs"],
+
+    // No unnecessary spacing
+    "no-multi-spaces": [2, { exceptions: { "AssignmentExpression": true, "VariableDeclarator": true, "ArrayExpression": true, "ObjectExpression": true } }],
+
+    // No reassigning native JS objects
+    "no-native-reassign": 2,
+
+    // No (!foo in bar)
+    "no-negated-in-lhs": 2,
+
+    // Nested ternary statements are confusing
+    "no-nested-ternary": 2,
+
+    // Use {} instead of new Object()
+    // "no-new-object": 2,
+
+    // No Math() or JSON()
+    "no-obj-calls": 2,
+
+    // No octal literals
+    "no-octal": 2,
+
+    // No redeclaring variables
+    "no-redeclare": 2,
+
+    // No unnecessary comparisons
+    "no-self-compare": 2,
+
+    // No declaring variables from an outer scope
+    "no-shadow": 2,
+
+    // No declaring variables that hide things like arguments
+    "no-shadow-restricted-names": 2,
+
+    // No spaces between function name and parentheses
+    "no-spaced-func": 2,
+
+    // No trailing whitespace
+    "no-trailing-spaces": 2,
+
+    // No using undeclared variables
+    // "no-undef": 2,
+
+    // Error on newline where a semicolon is needed
+    "no-unexpected-multiline": 2,
+
+    // No unreachable statements
+    "no-unreachable": 2,
+
+    // No expressions where a statement is expected
+    // "no-unused-expressions": 2,
+
+    // No declaring variables that are never used
+    "no-unused-vars": [2, {"vars": "all", "args": "none"}],
+
+    // No using variables before defined
+    // "no-use-before-define": [2, "nofunc"],
+
+    // No using with
+    "no-with": 2,
+
+    // Always require semicolon at end of statement
+    "semi": [2, "always"],
+
+    // Require space after keywords
+    "keyword-spacing": 2,
+
+    // Require space before blocks
+    "space-before-blocks": 2,
+
+    // Never use spaces before function parentheses
+    // "space-before-function-paren": [2, { "anonymous": "always", "named": "never" }],
+
+    // Require spaces before finally, catch, etc.
+    // "space-before-keywords": [2, "always"],
+
+    // No space padding in parentheses
+    // "space-in-parens": [2, "never"],
+
+    // Require spaces around operators
+    // "space-infix-ops": 2,
+
+    // Require spaces after return, throw and case
+    // "space-return-throw-case": 2,
+
+    // ++ and -- should not need spacing
+    // "space-unary-ops": [2, { "words": true, "nonwords": false }],
+
+    // No comparisons to NaN
+    "use-isnan": 2,
+
+    // Only check typeof against valid results
+    "valid-typeof": 2,
+  },
+}
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -455,26 +455,25 @@
             if (newNode.nextElementSibling)
               newNode.nextElementSibling.previousElementSibling = newNode;
 
             if (newNode.nextElementSibling)
               this.children.splice(this.children.indexOf(newNode.nextElementSibling), 0, newNode);
             else
               this.children.push(newNode);
           }
-        } else {
+        } else if (oldNode.nodeType === Node.ELEMENT_NODE) {
           // new node is not an element node.
           // if the old one was, update its element siblings:
-          if (oldNode.nodeType === Node.ELEMENT_NODE) {
-            if (oldNode.previousElementSibling)
-              oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
-            if (oldNode.nextElementSibling)
-              oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
-            this.children.splice(this.children.indexOf(oldNode), 1);
-          }
+          if (oldNode.previousElementSibling)
+            oldNode.previousElementSibling.nextElementSibling = oldNode.nextElementSibling;
+          if (oldNode.nextElementSibling)
+            oldNode.nextElementSibling.previousElementSibling = oldNode.previousElementSibling;
+          this.children.splice(this.children.indexOf(oldNode), 1);
+
           // If the old node wasn't an element, neither the new nor the old node was an element,
           // and the children array and its members shouldn't need any updating.
         }
 
 
         oldNode.parentNode = null;
         oldNode.previousSibling = null;
         oldNode.nextSibling = null;
@@ -484,18 +483,18 @@
         }
         return oldNode;
       }
     },
 
     __JSDOMParser__: true,
   };
 
-  for (var i in nodeTypes) {
-    Node[i] = Node.prototype[i] = nodeTypes[i];
+  for (var nodeType in nodeTypes) {
+    Node[nodeType] = Node.prototype[nodeType] = nodeTypes[nodeType];
   }
 
   var Attribute = function (name, value) {
     this.name = name;
     this._value = value;
   };
 
   Attribute.prototype = {
@@ -554,17 +553,17 @@
     set innerHTML(newHTML) {
       this._innerHTML = newHTML;
       delete this._textContent;
     },
     set textContent(newText) {
       this._textContent = newText;
       delete this._innerHTML;
     },
-  }
+  };
 
   var Document = function () {
     this.styleSheets = [];
     this.childNodes = [];
     this.children = [];
   };
 
   Document.prototype = {
@@ -824,17 +823,17 @@
   for (var jsName in styleMap) {
     (function (cssName) {
       Style.prototype.__defineGetter__(jsName, function () {
         return this.getStyle(cssName);
       });
       Style.prototype.__defineSetter__(jsName, function (value) {
         this.setStyle(cssName, value);
       });
-    }) (styleMap[jsName]);
+    })(styleMap[jsName]);
   }
 
   var JSDOMParser = function () {
     this.currentChar = 0;
 
     // In makeElementNode() we build up many strings one char at a time. Using
     // += for this results in lots of short-lived intermediate strings. It's
     // better to build an array of single-char strings and then join() them
@@ -971,17 +970,17 @@
         if (c !== ">") {
           this.error("expected '>' to close " + tag);
           return false;
         }
       }
 
       retPair[0] = node;
       retPair[1] = closed;
-      return true
+      return true;
     },
 
     /**
      * If the current input matches this string, advance the input index;
      * otherwise, do nothing.
      *
      * @returns whether input matched string
      */
@@ -1188,9 +1187,9 @@
   global.Comment = Comment;
   global.Document = Document;
   global.Element = Element;
   global.Text = Text;
 
   // Attach JSDOMParser to the global scope
   global.JSDOMParser = JSDOMParser;
 
-}) (this);
+})(this);
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -21,25 +21,24 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 
 /*
  * This code is heavily based on Arc90's readability.js (1.7.1) script
  * available at: http://code.google.com/p/arc90labs-readability
  */
-var root = this;
 
 /**
  * Public constructor.
  * @param {Object}       uri     The URI descriptor object.
  * @param {HTMLDocument} doc     The document to parse.
  * @param {Object}       options The options object.
  */
-var Readability = function(uri, doc, options) {
+function Readability(uri, doc, options) {
   options = options || {};
 
   this._uri = uri;
   this._doc = doc;
   this._biggestFrame = false;
   this._articleByline = null;
   this._articleDir = null;
 
@@ -78,22 +77,22 @@ var Readability = function(uri, doc, opt
       var elDesc = "";
       if (e.id)
         elDesc = "(#" + e.id + classDesc + ")";
       else if (classDesc)
         elDesc = "(" + classDesc + ")";
       return rv + elDesc;
     };
     this.log = function () {
-      if ("dump" in root) {
+      if (typeof dump !== "undefined") { // typeof yields a string, so compare against "undefined"
         var msg = Array.prototype.map.call(arguments, function(x) {
           return (x && x.nodeName) ? logEl(x) : x;
         }).join(" ");
         dump("Reader: (Readability) " + msg + "\n");
-      } else if ("console" in root) {
+      } else if (typeof console !== "undefined") { // fall back to console only when dump is unavailable
         var args = ["Reader: (Readability) "].concat(arguments);
         console.log.apply(console, args);
       }
     };
   } else {
     this.log = function () {};
   }
 }
@@ -117,20 +116,20 @@ Readability.prototype = {
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
     unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
-    positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+    positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
-    byline: /byline|author|dateline|writtenby/i,
+    byline: /byline|author|dateline|writtenby|p-author/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
     nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
     prevLink: /(prev|earl|old|new|<|«)/i,
     whitespace: /^\s*$/,
     hasContent: /\S$/,
   },
@@ -146,28 +145,51 @@ Readability.prototype = {
    * @return void
   **/
   _postProcessContent: function(articleContent) {
     // Readability cannot open relative uris so we convert them to absolute uris.
     this._fixRelativeUris(articleContent);
   },
 
   /**
+   * Iterates over a NodeList, calls `filterFn` for each node and removes node
+   * if function returned `true`.
+   *
+   * If function is not passed, removes all the nodes in node list.
+   *
+   * @param NodeList nodeList The nodes to operate on
+   * @param Function filterFn Optional test, called with (node, index, nodeList)
+   * @return void
+   */
+  _removeNodes: function(nodeList, filterFn) {
+    for (var i = nodeList.length - 1; i >= 0; i--) {
+      var node = nodeList[i];
+      var parentNode = node.parentNode;
+      if (parentNode) {
+        if (!filterFn || filterFn.call(this, node, i, nodeList)) {
+          parentNode.removeChild(node);
+        }
+      }
+    }
+  },
+
+  /**
    * Iterate over a NodeList, which doesn't natively fully implement the Array
    * interface.
    *
    * For convenience, the current object context is applied to the provided
    * iterate function.
    *
    * @param  NodeList nodeList The NodeList.
    * @param  Function fn       The iterate function.
+   * @param  Boolean  backward Whether to use backward iteration.
    * @return void
    */
-  _forEachNode: function(nodeList, fn) {
-    return Array.prototype.forEach.call(nodeList, fn, this);
+  _forEachNode: function(nodeList, fn, backward) {
+    Array.prototype.forEach.call(nodeList, fn, this);
   },
 
   /**
    * Iterate over a NodeList, return true if any of the provided iterate
    * function calls returns true, false otherwise.
    *
    * For convenience, the current object context is applied to the
    * provided iterate function.
@@ -278,23 +300,23 @@ Readability.prototype = {
     var origTitle = "";
 
     try {
       curTitle = origTitle = doc.title;
 
       // If they had an element with id "title" in their HTML
       if (typeof curTitle !== "string")
         curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
-    } catch(e) {}
+    } catch (e) {/* ignore exceptions setting the title. */}
 
     if (curTitle.match(/ [\|\-] /)) {
-      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
+      curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
 
       if (curTitle.split(' ').length < 3)
-        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
+        curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
     } else if (curTitle.indexOf(': ') !== -1) {
       // Check if we have an heading containing this exact string, so we
       // could assume it's the full title.
       var headings = this._concatNodeLists(
         doc.getElementsByTagName('h1'),
         doc.getElementsByTagName('h2')
       );
       var match = this._someNode(headings, function(heading) {
@@ -329,19 +351,17 @@ Readability.prototype = {
    * This includes things like stripping javascript, CSS, and handling terrible markup.
    *
    * @return void
    **/
   _prepDocument: function() {
     var doc = this._doc;
 
     // Remove all style tags in head
-    this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
-      styleNode.parentNode.removeChild(styleNode);
-    });
+    this._removeNodes(doc.getElementsByTagName("style"));
 
     if (doc.body) {
       this._replaceBrs(doc.body);
     }
 
     this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
       this._setNodeTag(fontNode, "SPAN");
     });
@@ -365,17 +385,17 @@ Readability.prototype = {
   /**
    * Replaces 2 or more successive <br> elements with a single <p>.
    * Whitespace between <br> elements are ignored. For example:
    *   <div>foo<br>bar<br> <br><br>abc</div>
    * will become:
    *   <div>foo<br>bar<p>abc</p></div>
    */
   _replaceBrs: function (elem) {
-    this._forEachNode(elem.getElementsByTagName("br"), function(br) {
+    this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
       var next = br.nextSibling;
 
       // Whether 2 or more <br> elements have been found and replaced with a
       // <p> block.
       var replaced = false;
 
       // If we find a <br> chain, remove the <br>s until we hit another element
       // or non-whitespace. This leaves behind the first <br> in the chain
@@ -461,46 +481,45 @@ Readability.prototype = {
 
     // Do these last as the previous stuff may have removed junk
     // that will affect these
     this._cleanConditionally(articleContent, "table");
     this._cleanConditionally(articleContent, "ul");
     this._cleanConditionally(articleContent, "div");
 
     // Remove extra paragraphs
-    this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
+    this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
       var imgCount = paragraph.getElementsByTagName('img').length;
       var embedCount = paragraph.getElementsByTagName('embed').length;
       var objectCount = paragraph.getElementsByTagName('object').length;
       // At this point, nasty iframes have been removed, only remain embedded video ones.
       var iframeCount = paragraph.getElementsByTagName('iframe').length;
       var totalCount = imgCount + embedCount + objectCount + iframeCount;
 
-      if (totalCount === 0 && !this._getInnerText(paragraph, false))
-        paragraph.parentNode.removeChild(paragraph);
+      return totalCount === 0 && !this._getInnerText(paragraph, false);
     });
 
-    this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
+    this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
       var next = this._nextElement(br.nextSibling);
       if (next && next.tagName == "P")
         br.parentNode.removeChild(br);
     });
   },
 
   /**
    * Initialize a node with the readability object. Also checks the
    * className/id for special names to add to its score.
    *
    * @param Element
    * @return void
   **/
   _initializeNode: function(node) {
     node.readability = {"contentScore": 0};
 
-    switch(node.tagName) {
+    switch (node.tagName) {
       case 'DIV':
         node.readability.contentScore += 5;
         break;
 
       case 'PRE':
       case 'TD':
       case 'BLOCKQUOTE':
         node.readability.contentScore += 3;
@@ -609,17 +628,17 @@ Readability.prototype = {
 
     return false;
   },
 
   _getNodeAncestors: function(node, maxDepth) {
     maxDepth = maxDepth || 0;
     var i = 0, ancestors = [];
     while (node.parentNode) {
-      ancestors.push(node.parentNode)
+      ancestors.push(node.parentNode);
       if (maxDepth && ++i === maxDepth)
         break;
       node = node.parentNode;
     }
     return ancestors;
   },
 
   /***
@@ -1037,27 +1056,22 @@ Readability.prototype = {
   },
 
   /**
    * Removes script tags from the document.
    *
    * @param Element
   **/
   _removeScripts: function(doc) {
-    this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
+    this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
       scriptNode.nodeValue = "";
       scriptNode.removeAttribute('src');
-
-      if (scriptNode.parentNode)
-        scriptNode.parentNode.removeChild(scriptNode);
+      return true;
     });
-    this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
-      if (noscriptNode.parentNode)
-        noscriptNode.parentNode.removeChild(noscriptNode);
-    });
+    this._removeNodes(doc.getElementsByTagName('noscript'));
   },
 
   /**
    * Check if this node has only whitespace and a single P element
    * Returns false if the DIV node contains non-empty text nodes
    * or if it contains no P or more than 1 element.
    *
    * @param Element
@@ -1096,29 +1110,28 @@ Readability.prototype = {
    * @return string
   **/
   _getInnerText: function(e, normalizeSpaces) {
     normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
     var textContent = e.textContent.trim();
 
     if (normalizeSpaces) {
       return textContent.replace(this.REGEXPS.normalize, " ");
-    } else {
-      return textContent;
     }
+    return textContent;
   },
 
   /**
    * Get the number of times a string s appears in the node e.
    *
    * @param Element
    * @param string - what to split on. Default is ","
    * @return number (integer)
   **/
-  _getCharCount: function(e,s) {
+  _getCharCount: function(e, s) {
     s = s || ",";
     return this._getInnerText(e).split(s).length - 1;
   },
 
   /**
    * Remove the style attribute on every e and under.
    * TODO: Test if getElementsByTagName(*) is faster.
    *
@@ -1377,44 +1390,42 @@ Readability.prototype = {
     for (var page in possiblePages) {
       if (possiblePages.hasOwnProperty(page)) {
         if (possiblePages[page].score >= 50 &&
           (!topPage || topPage.score < possiblePages[page].score))
           topPage = possiblePages[page];
       }
     }
 
+    var nextHref = null;
     if (topPage) {
-      var nextHref = topPage.href.replace(/\/$/,'');
+      nextHref = topPage.href.replace(/\/$/, '');
 
       this.log('NEXT PAGE IS ' + nextHref);
       this._parsedPages[nextHref] = true;
-      return nextHref;
-    } else {
-      return null;
     }
+    return nextHref;
   },
 
   _successfulRequest: function(request) {
     return (request.status >= 200 && request.status < 300) ||
         request.status === 304 ||
          (request.status === 0 && request.responseText);
   },
 
   _ajax: function(url, options) {
     var request = new XMLHttpRequest();
 
     function respondToReadyState(readyState) {
       if (request.readyState === 4) {
         if (this._successfulRequest(request)) {
           if (options.success)
             options.success(request);
-        } else {
-          if (options.error)
-            options.error(request);
+        } else if (options.error) {
+          options.error(request);
         }
       }
     }
 
     if (typeof options === 'undefined')
       options = {};
 
     request.onreadystatechange = respondToReadyState;
@@ -1457,45 +1468,44 @@ Readability.prototype = {
 
           // First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
           var eTag = r.getResponseHeader('ETag');
           if (eTag) {
             if (eTag in this._pageETags) {
               this.log("Exact duplicate page found via ETag. Aborting.");
               articlePage.style.display = 'none';
               return;
-            } else {
-              this._pageETags[eTag] = 1;
             }
+            this._pageETags[eTag] = 1;
           }
 
           // TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
           var page = doc.createElement("DIV");
 
           // Do some preprocessing to our HTML to make it ready for appending.
           // - Remove any script tags. Swap and reswap newlines with a unicode
           //   character because multiline regex doesn't work in javascript.
           // - Turn any noscript tags into divs so that we can parse them. This
           //   allows us to find any next page links hidden via javascript.
           // - Turn all double br's into p's - was handled by prepDocument in the original view.
           //   Maybe in the future abstract out prepDocument to work for both the original document
           //   and AJAX-added pages.
-          var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
-          responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
+          var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
+          responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
+          responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
           responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
 
           page.innerHTML = responseHtml;
           this._replaceBrs(page);
 
           // Reset all flags for the next page, as they will search through it and
           // disable as necessary at the end of grabArticle.
           this._flags = 0x1 | 0x2 | 0x4;
 
-          var nextPageLink = this._findNextPageLink(page);
+          var secondNextPageLink = this._findNextPageLink(page);
 
           // NOTE: if we end up supporting _appendNextPage(), we'll need to
           // change this call to be async
           var content = this._grabArticle(page);
 
           if (!content) {
             this.log("No content found in page to append. Aborting.");
             return;
@@ -1524,18 +1534,18 @@ Readability.prototype = {
           // After the page has rendered, post process the content. This delay is necessary because,
           // in webkit at least, offsetWidth is not set in time to determine image width. We have to
           // wait a little bit for reflow to finish before we can fix floating images.
           setTimeout((function() {
             this._postProcessContent(thisPage);
           }).bind(this), 500);
 
 
-          if (nextPageLink)
-            this._appendNextPage(nextPageLink);
+          if (secondNextPageLink)
+            this._appendNextPage(secondNextPageLink);
         }
       });
     }).bind(this)(nextPageLink, articlePage);
   },
 
   /**
    * Get an elements class/id weight. Uses regular expressions to tell if this
    * element looks good or bad.
@@ -1576,33 +1586,33 @@ Readability.prototype = {
    *
    * @param Element
    * @param string tag to clean
    * @return void
    **/
   _clean: function(e, tag) {
     var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
 
-    this._forEachNode(e.getElementsByTagName(tag), function(element) {
+    this._removeNodes(e.getElementsByTagName(tag), function(element) {
       // Allow youtube and vimeo videos through as people usually want to see those.
       if (isEmbed) {
         var attributeValues = [].map.call(element.attributes, function(attr) {
           return attr.value;
         }).join("|");
 
         // First, check the elements attributes to see if any of them contain youtube or vimeo
         if (this.REGEXPS.videos.test(attributeValues))
-          return;
+          return false;
 
         // Then check the elements inside this element for the same.
         if (this.REGEXPS.videos.test(element.innerHTML))
-          return;
+          return false;
       }
 
-      element.parentNode.removeChild(element);
+      return true;
     });
   },
 
   /**
    * Check if a given node has one of its ancestor tag name matching the
    * provided one.
    * @param  HTMLElement node
    * @param  String      tagName
@@ -1629,88 +1639,78 @@ Readability.prototype = {
    * "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
    *
    * @return void
    **/
   _cleanConditionally: function(e, tag) {
     if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
       return;
 
-    var tagsList = e.getElementsByTagName(tag);
-    var curTagsLength = tagsList.length;
     var isList = tag === "ul" || tag === "ol";
 
     // Gather counts for other typical elements embedded within.
     // Traverse backwards so we can remove nodes at the same time
     // without effecting the traversal.
     //
     // TODO: Consider taking into account original contentScore here.
-    for (var i = curTagsLength-1; i >= 0; i -= 1) {
-      var weight = this._getClassWeight(tagsList[i]);
+    this._removeNodes(e.getElementsByTagName(tag), function(node) {
+      var weight = this._getClassWeight(node);
       var contentScore = 0;
 
-      this.log("Cleaning Conditionally", tagsList[i]);
+      this.log("Cleaning Conditionally", node);
 
       if (weight + contentScore < 0) {
-        tagsList[i].parentNode.removeChild(tagsList[i]);
-      } else if (this._getCharCount(tagsList[i],',') < 10) {
+        return true;
+      }
+
+      if (this._getCharCount(node, ',') < 10) {
         // If there are not very many commas, and the number of
         // non-paragraph elements is more than paragraphs or other
         // ominous signs, remove the element.
-        var p = tagsList[i].getElementsByTagName("p").length;
-        var img = tagsList[i].getElementsByTagName("img").length;
-        var li = tagsList[i].getElementsByTagName("li").length-100;
-        var input = tagsList[i].getElementsByTagName("input").length;
+        var p = node.getElementsByTagName("p").length;
+        var img = node.getElementsByTagName("img").length;
+        var li = node.getElementsByTagName("li").length-100;
+        var input = node.getElementsByTagName("input").length;
 
         var embedCount = 0;
-        var embeds = tagsList[i].getElementsByTagName("embed");
+        var embeds = node.getElementsByTagName("embed");
         for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
           if (!this.REGEXPS.videos.test(embeds[ei].src))
             embedCount += 1;
         }
 
-        var linkDensity = this._getLinkDensity(tagsList[i]);
-        var contentLength = this._getInnerText(tagsList[i]).length;
-        var toRemove = false;
-        if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
-          toRemove = true;
-        } else if (!isList && li > p) {
-          toRemove = true;
-        } else if (input > Math.floor(p/3)) {
-          toRemove = true;
-        } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
-          toRemove = true;
-        } else if (!isList && weight < 25 && linkDensity > 0.2) {
-          toRemove = true;
-        } else if (weight >= 25 && linkDensity > 0.5) {
-          toRemove = true;
-        } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
-          toRemove = true;
-        }
+        var linkDensity = this._getLinkDensity(node);
+        var contentLength = this._getInnerText(node).length;
 
-        if (toRemove) {
-          tagsList[i].parentNode.removeChild(tagsList[i]);
-        }
+        var haveToRemove =
+          // Make an exception for elements with no p's and exactly 1 img.
+          (img > p && !this._hasAncestorTag(node, "figure")) ||
+          (!isList && li > p) ||
+          (input > Math.floor(p/3)) ||
+          (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+          (!isList && weight < 25 && linkDensity > 0.2) ||
+          (weight >= 25 && linkDensity > 0.5) ||
+          ((embedCount === 1 && contentLength < 75) || embedCount > 1);
+        return haveToRemove;
       }
-    }
+      return false;
+    });
   },
 
   /**
    * Clean out spurious headers from an Element. Checks things like classnames and link density.
    *
    * @param Element
    * @return void
   **/
   _cleanHeaders: function(e) {
     for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
-      var headers = e.getElementsByTagName('h' + headerIndex);
-      for (var i = headers.length - 1; i >= 0; i -= 1) {
-        if (this._getClassWeight(headers[i]) < 0)
-          headers[i].parentNode.removeChild(headers[i]);
-      }
+      this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
+        return this._getClassWeight(header) < 0;
+      });
     }
   },
 
   _flagIsActive: function(flag) {
     return (this._flags & flag) > 0;
   },
 
   _addFlag: function(flag) {
@@ -1724,16 +1724,32 @@ Readability.prototype = {
   /**
    * Decides whether or not the document is reader-able without parsing the whole thing.
    *
    * @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
    */
   isProbablyReaderable: function(helperIsVisible) {
     var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
 
+    // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
+    // Some articles' DOM structures might look like
+    // <div>
+    //   Sentences<br>
+    //   <br>
+    //   Sentences<br>
+    // </div>
+    var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
+    if (brNodes.length) {
+      var set = new Set();
+      [].forEach.call(brNodes, function(node) {
+        set.add(node.parentNode);
+      });
+      nodes = [].concat.apply(Array.from(set), nodes);
+    }
+
     // FIXME we should have a fallback for helperIsVisible, but this is
     // problematic because of jsdom's elem.style handling - see
     // https://github.com/mozilla/readability/pull/186 for context.
 
     var score = 0;
     // This is a little cheeky, we use the accumulator 'score' to decide what to return from
     // this callback:
     return this._someNode(nodes, function(node) {
@@ -1827,17 +1843,21 @@ Readability.prototype = {
     // the article's content.
     if (!metadata.excerpt) {
       var paragraphs = articleContent.getElementsByTagName("p");
       if (paragraphs.length > 0) {
         metadata.excerpt = paragraphs[0].textContent.trim();
       }
     }
 
-    return { uri: this._uri,
-             title: articleTitle,
-             byline: metadata.byline || this._articleByline,
-             dir: this._articleDir,
-             content: articleContent.innerHTML,
-             length: articleContent.textContent.length,
-             excerpt: metadata.excerpt };
+    var textContent = articleContent.textContent;
+    return {
+      uri: this._uri,
+      title: articleTitle,
+      byline: metadata.byline || this._articleByline,
+      dir: this._articleDir,
+      content: articleContent.innerHTML,
+      textContent: textContent,
+      length: textContent.length,
+      excerpt: metadata.excerpt,
+    };
   }
 };
author	Evan Tseng <evan@tseng.io>
	Tue, 01 Nov 2016 18:54:06 +0800
changeset 432530	b61753cb1abfadf8947809abc4b92b148c77d4fc
parent 432230	3e73fd638e687a4d7f46613586e5156b8e2af846
child 535689	565910436cae26439c62023b6638fb33aa73bc6c
push id	34354
push user	bmo:evan@tseng.io
push date	Wed, 02 Nov 2016 09:35:57 +0000
reviewers	Gijs
bugs	1177619
milestone	52.0a1
toolkit/components/reader/.eslintrc.js		file \| annotate \| diff \| comparison \| revisions
toolkit/components/reader/JSDOMParser.js		file \| annotate \| diff \| comparison \| revisions
toolkit/components/reader/Readability.js		file \| annotate \| diff \| comparison \| revisions