Bug 1444082 - sync reader mode to github tip ( 8525c6af36d3badbe27c4672a6f2dd99ddb4097f ), r?johannh draft
authorGijs Kruitbosch <gijskruitbosch@gmail.com>
Thu, 08 Mar 2018 14:35:02 +0000
changeset 765883 3e0882632df464cd63d6b8eb5ddf84567aba7f5f
parent 764977 55d91695f4bb951c224005155baef054a786c1f7
push id102171
push usergijskruitbosch@gmail.com
push dateSat, 10 Mar 2018 14:46:30 +0000
reviewersjohannh
bugs1444082
milestone60.0a1
Bug 1444082 - sync reader mode to github tip ( 8525c6af36d3badbe27c4672a6f2dd99ddb4097f ), r?johannh MozReview-Commit-ID: LZLFf9kyUR5
mobile/android/tests/browser/robocop/reader_mode_pages/not_an_article.html
toolkit/components/reader/JSDOMParser.js
toolkit/components/reader/Readability.js
toolkit/components/reader/ReaderWorker.js
toolkit/components/reader/test/readerModeNonArticle.html
--- a/mobile/android/tests/browser/robocop/reader_mode_pages/not_an_article.html
+++ b/mobile/android/tests/browser/robocop/reader_mode_pages/not_an_article.html
@@ -29,104 +29,15 @@
 <body class=""
       data-readonly="false"
       data-static-url="//support.cdn.mozilla.net/static/"
       data-orientation="right"
       data-ga-push="[]"
             data-usernames-api="/en-US/users/api/usernames"
 >
 
-<nav class="scrollable">
-  <div id="search-bar">
-      <form id="search" action="/en-US/search">
-                  <input type="hidden" name="product" value="mobile" />
-              <input name="q" placeholder="Search Mozilla Support" required="required" type="search" value="">
-    <button class="icon-sprite" type="submit">Search</button>
-  </form>
-
-  </div>
-
-  <a href="/en-US/products">Home</a>
-          <a href="/en-US/questions/new">Ask a question</a>
-  <a href="/en-US/questions">Support Forum</a>
-
-  <header>Navigation</header>
-  <a href="/en-US/get-involved">Help other users</a>
-  <a href="?&amp;mobile=0">Switch to desktop site</a>
-
-  <header>Profile</header>
-      <a href="/en-US/users/login">Sign in</a>
-  
-      <header>Languages</header>
-            <a href="/en-US/locales" class="locale-picker">Switch language</a>
-  </nav>
-
-<header class="slide-on-exposed">
-  <div id="menu-button" class="icon-sprite"></div>
-            <h1>
-              Firefox for Android
-          </h1>
-  </header>
-
-
-<div class="wrapper slide-on-exposed">
-  <section id="content">
-      <ul id="topics">
-          <li>
-        <a href="/en-US/products/mobile/get-started" class="cf">
-          <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-get-started" alt="">
-          Learn the Basics: get started
-        </a>
-      </li>
-                <li>
-        <a href="/en-US/products/mobile/download-and-install" class="cf">
-          <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-download-and-install" alt="">
-          Download, install and migration
-        </a>
-      </li>
-                <li>
-        <a href="/en-US/products/mobile/privacy-and-security" class="cf">
-          <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-privacy-and-security" alt="">
-          Privacy and security settings
-        </a>
-      </li>
-                <li>
-        <a href="/en-US/products/mobile/customize" class="cf">
-          <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-customize" alt="">
-          Customize controls, options and add-ons
-        </a>
-      </li>
-                <li>
-        <a href="/en-US/products/mobile/sync" class="cf">
-          <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-sync" alt="">
-          Firefox Sync settings
-        </a>
-      </li>
-                <li>
-        <a href="/en-US/products/mobile/fix-problems" class="cf">
-          <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-fix-problems" alt="">
-          Fix slowness, crashing, error messages and other problems
-        </a>
-      </li>
-                <li>
-            <a href="/en-US/kb/get-community-support" class="cf">
-              <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-get-community-support" alt="">
-              Get community support
-            </a>
-          </li>
-            </ul>
-
-      </section>
-
-  <footer>
-      </footer>
-
-  <ul id="notifications">
-          </ul>
-</div>
-
 
 <script src="//support.cdn.mozilla.net/static/jsi18n/en-us/javascript.js?beb7c1e"></script>
 
 <script src="//support.cdn.mozilla.net/static/js/mobile/common-min.js?build=beb7c1e"></script>
 
 </body>
 </html>
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -555,17 +555,18 @@
       delete this._textContent;
     },
     set textContent(newText) {
       this._textContent = newText;
       delete this._innerHTML;
     },
   };
 
-  var Document = function () {
+  var Document = function (url) {
+    this.documentURI = url;
     this.styleSheets = [];
     this.childNodes = [];
     this.children = [];
   };
 
   Document.prototype = {
     __proto__: Node.prototype,
 
@@ -595,16 +596,30 @@
       return node;
     },
 
     createTextNode: function (text) {
       var node = new Text();
       node.textContent = text;
       return node;
     },
+
+    get baseURI() {
+      if (!this.hasOwnProperty("_baseURI")) {
+        this._baseURI = this.documentURI;
+        var baseElements = this.getElementsByTagName("base");
+        var href = baseElements[0] && baseElements[0].getAttribute("href");
+        if (href) {
+          try {
+            this._baseURI = (new URL(href, this._baseURI)).href;
+          } catch (ex) {/* Just fall back to documentURI */}
+        }
+      }
+      return this._baseURI;
+    },
   };
 
   var Element = function (tag) {
     this.attributes = [];
     this.childNodes = [];
     this.children = [];
     this.nextElementSibling = this.previousElementSibling = null;
     this.localName = tag.toLowerCase();
@@ -1113,19 +1128,19 @@
       }
 
       return node;
     },
 
     /**
      * Parses an HTML string and returns a JS implementation of the Document.
      */
-    parse: function (html) {
+    parse: function (html, url) {
       this.html = html;
-      var doc = this.doc = new Document();
+      var doc = this.doc = new Document(url);
       this.readChildren(doc);
 
       // If this is an HTML document, remove root-level children except for the
       // <html> node
       if (doc.documentElement) {
         for (var i = doc.childNodes.length; --i >= 0;) {
           var child = doc.childNodes[i];
           if (child !== doc.documentElement) {
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -36,16 +36,17 @@
 function Readability(uri, doc, options) {
   options = options || {};
 
   this._uri = uri;
   this._doc = doc;
   this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
+  this._attempts = [];
 
   // Configurable options
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
   this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
   this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
 
@@ -270,44 +271,30 @@ Readability.prototype = {
   /**
    * Converts each <a> and <img> uri in the given element to an absolute URI,
    * ignoring #ref URIs.
    *
    * @param Element
    * @return void
    */
   _fixRelativeUris: function(articleContent) {
-    var scheme = this._uri.scheme;
-    var prePath = this._uri.prePath;
-    var pathBase = this._uri.pathBase;
-
+    var baseURI = this._doc.baseURI;
+    var documentURI = this._doc.documentURI;
     function toAbsoluteURI(uri) {
-      // If this is already an absolute URI, return it.
-      if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
+      // Leave hash links alone if the base URI matches the document URI:
+      if (baseURI == documentURI && uri.charAt(0) == "#") {
         return uri;
-
-      // Scheme-rooted relative URI.
-      if (uri.substr(0, 2) == "//")
-        return scheme + "://" + uri.substr(2);
-
-      // Prepath-rooted relative URI.
-      if (uri[0] == "/")
-        return prePath + uri;
-
-      // Dotslash relative URI.
-      if (uri.indexOf("./") === 0)
-        return pathBase + uri.slice(2);
-
-      // Ignore hash URIs:
-      if (uri[0] == "#")
-        return uri;
-
-      // Standard relative URI; add entire path. pathBase already includes a
-      // trailing "/".
-      return pathBase + uri;
+      }
+      // Otherwise, resolve against base URI:
+      try {
+        return new URL(uri, baseURI).href;
+      } catch (ex) {
+        // Something went wrong, just return the original:
+      }
+      return uri;
     }
 
     var links = articleContent.getElementsByTagName("a");
     this._forEachNode(links, function(link) {
       var href = link.getAttribute("href");
       if (href) {
         // Replace links with javascript: URIs with text content, since
         // they won't work after scripts have been removed from the page.
@@ -530,16 +517,17 @@ Readability.prototype = {
 
     // Clean out junk from the article content
     this._cleanConditionally(articleContent, "form");
     this._cleanConditionally(articleContent, "fieldset");
     this._clean(articleContent, "object");
     this._clean(articleContent, "embed");
     this._clean(articleContent, "h1");
     this._clean(articleContent, "footer");
+    this._clean(articleContent, "link");
 
     // Clean out elements have "share" in their id/class combinations from final top candidates,
     // which means we don't remove the top candidates even they have "share".
     this._forEachNode(articleContent.children, function(topCandidate) {
       this._cleanMatchedNodes(topCandidate, /share/);
     });
 
     // If there is only one h2 and its text content substantially equals article title,
@@ -1084,34 +1072,55 @@ Readability.prototype = {
           div.appendChild(children[0]);
         }
         articleContent.appendChild(div);
       }
 
       if (this._debug)
         this.log("Article content after paging: " + articleContent.innerHTML);
 
+      var parseSuccessful = true;
+
       // Now that we've gone through the full algorithm, check to see if
       // we got any meaningful content. If we didn't, we may need to re-run
       // grabArticle with different flags set. This gives us a higher likelihood of
       // finding the content, and the sieve approach gives us a higher likelihood of
       // finding the -right- content.
-      if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
+      var textLength = this._getInnerText(articleContent, true).length;
+      if (textLength < this._wordThreshold) {
+        parseSuccessful = false;
         page.innerHTML = pageCacheHtml;
 
         if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
           this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
           this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
           this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
         } else {
-          return null;
+          this._attempts.push({articleContent: articleContent, textLength: textLength});
+          // No luck after removing flags, just return the longest text we found during the different loops
+          this._attempts.sort(function (a, b) {
+            return a.textLength < b.textLength;
+          });
+
+          // But first check if we actually have something
+          if (!this._attempts[0].textLength) {
+            return null;
+          }
+
+          articleContent = this._attempts[0].articleContent;
+          parseSuccessful = true;
         }
-      } else {
+      }
+
+      if (parseSuccessful) {
         // Find out text direction from ancestors of final top candidate.
         var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
         this._someNode(ancestors, function(ancestor) {
           if (!ancestor.tagName)
             return false;
           var articleDir = ancestor.getAttribute("dir");
           if (articleDir) {
             this._articleDir = articleDir;
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -42,12 +42,12 @@ var Agent = {
    *
    * @param {object} uri URI data for the document.
    * @param {string} serializedDoc The serialized document.
    * @param {object} options Options object to pass to Readability.
    *
    * @return {object} Article object returned from Readability.
    */
   parseDocument(uri, serializedDoc, options) {
-    let doc = new JSDOMParser().parse(serializedDoc);
+    let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
     return new Readability(uri, doc, options).parse();
   },
 };
--- a/toolkit/components/reader/test/readerModeNonArticle.html
+++ b/toolkit/components/reader/test/readerModeNonArticle.html
@@ -1,14 +1,9 @@
 <!DOCTYPE html>
 <html>
 <head>
 <title>Non article title</title>
 <meta name="description" content="This is the non-article description." />
 </head>
 <body>
-<header>Site header</header>
-<div>
-<h1>Non article title</h1>
-<p>Woot!</p>
-</div>
 </body>
 </html>