author Evan Tseng <evan@tseng.io>

Mon, 06 Feb 2017 12:03:06 +0800

changeset 479713 6d6fb32bb36414b3b164c4111e861b6314c3e64d

parent 479141 20a8536b0bfac74389d3a57bd8dd957d98779ce1

child 488512 5a83a5ccc7971ddd80d34f6acecb462d1e1e7658

child 488514 6dc329f43c0eeea94ab905d567d213728f31b009

child 488532 fa28d4fef366279b5116a8c7de96f33d47e009cb

child 488733 8dfa9b0b4987769492e4d927cedc704c73f58e66

child 488779 3f9b1b7a18a38f4278e396a66ac69f863e87e7c4

push id 44328

push user bmo:evan@tseng.io

push date Tue, 07 Feb 2017 03:32:26 +0000

reviewers Gijs

bugs 1300697, 1259763, 1167568

milestone 54.0a1

toolkit/components/reader/Readability.js file | annotate | diff | comparison | revisions
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -34,16 +34,17 @@
  * @param {Object}       options The options object.
  */
 function Readability(uri, doc, options) {
   options = options || {};
 
   this._uri = uri;
   this._doc = doc;
   this._biggestFrame = false;
+  this._articleTitle = null;
   this._articleByline = null;
   this._articleDir = null;
 
   // Configureable options
   this._debug = !!options.debug;
   this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
   this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
   this._maxPages = options.maxPages || this.DEFAULT_MAX_PAGES;
@@ -114,17 +115,17 @@ Readability.prototype = {
   DEFAULT_MAX_PAGES: 5,
 
   // Element tags to score by default.
   DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
 
   // All of the regular expressions in use within readability.
   // Defined up here so we don't instantiate them repeatedly in loops.
   REGEXPS: {
-    unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|ad-break|agegate|pagination|pager|popup|yom-remote/i,
+    unlikelyCandidates: /banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote/i,
     okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
     positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
     negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
     extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
     byline: /byline|author|dateline|writtenby|p-author/i,
     replaceFonts: /<(\/?)font[^>]*>/gi,
     normalize: /\s{2,}/g,
     videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
@@ -484,20 +485,28 @@ Readability.prototype = {
     this._clean(articleContent, "footer");
 
     // Clean out elements have "share" in their id/class combinations from final top candidates,
     // which means we don't remove the top candidates even they have "share".
     this._forEachNode(articleContent.children, function(topCandidate) {
       this._cleanMatchedNodes(topCandidate, /share/);
     });
 
-    // If there is only one h2, they are probably using it as a header
-    // and not a subheader, so remove it since we already have a header.
-    if (articleContent.getElementsByTagName('h2').length === 1)
-      this._clean(articleContent, "h2");
+    // If there is only one h2 and its text content substantially equals article title,
+    // they are probably using it as a header and not a subheader,
+    // so remove it since we already extract the title separately.
+    var h2 = articleContent.getElementsByTagName('h2');
+    if (h2.length === 1) {
+      var lengthSimilarRate = (h2[0].textContent.length - this._articleTitle.length) / this._articleTitle.length;
+      if (Math.abs(lengthSimilarRate) < 0.5 &&
+          (lengthSimilarRate > 0 ? h2[0].textContent.includes(this._articleTitle) :
+                                   this._articleTitle.includes(h2[0].textContent))) {
+        this._clean(articleContent, "h2");
+      }
+    }
 
     this._clean(articleContent, "iframe");
     this._clean(articleContent, "input");
     this._clean(articleContent, "textarea");
     this._clean(articleContent, "select");
     this._clean(articleContent, "button");
     this._cleanHeaders(articleContent);
 
@@ -709,16 +718,25 @@ Readability.prototype = {
               node.tagName !== "BODY" &&
               node.tagName !== "A") {
             this.log("Removing unlikely candidate - " + matchString);
             node = this._removeAndGetNext(node);
             continue;
           }
         }
 
+        // Remove empty DIV, SECTION, HEADER, and H1-H6 nodes
+        if ((node.tagName === "DIV" || node.tagName === "SECTION" || node.tagName === "HEADER" ||
+             node.tagName === "H1" || node.tagName === "H2" || node.tagName === "H3" ||
+             node.tagName === "H4" || node.tagName === "H5" || node.tagName === "H6") &&
+            this._isEmptyElement(node)) {
+          node = this._removeAndGetNext(node);
+          continue;
+        }
+
         if (this.DEFAULT_TAGS_TO_SCORE.indexOf(node.tagName) !== -1) {
           elementsToScore.push(node);
         }
 
         // Turn all divs that don't have children block level elements into p's
         if (node.tagName === "DIV") {
           // Sites like http://mobile.slate.com encloses each paragraph with a DIV
           // element. DIVs with only a P element inside and no text content can be
@@ -729,17 +747,17 @@ Readability.prototype = {
             node.parentNode.replaceChild(newNode, node);
             node = newNode;
           } else if (!this._hasChildBlockElement(node)) {
             node = this._setNodeTag(node, "P");
             elementsToScore.push(node);
           } else {
             // EXPERIMENTAL
             this._forEachNode(node.childNodes, function(childNode) {
-              if (childNode.nodeType === Node.TEXT_NODE) {
+              if (childNode.nodeType === Node.TEXT_NODE && childNode.textContent.trim().length > 0) {
                 var p = doc.createElement('p');
                 p.textContent = childNode.textContent;
                 p.style.display = 'inline';
                 p.className = 'readability-styled';
                 node.replaceChild(p, childNode);
               }
             });
           }
@@ -900,16 +918,27 @@ Readability.prototype = {
           if (parentScore > lastScore) {
             // Alright! We found a better parent to use.
             topCandidate = parentOfTopCandidate;
             break;
           }
           lastScore = parentOfTopCandidate.readability.contentScore;
           parentOfTopCandidate = parentOfTopCandidate.parentNode;
         }
+
+        // If the top candidate is the only child, use parent instead. This will help sibling
+        // joining logic when adjacent content is actually located in parent's sibling node.
+        parentOfTopCandidate = topCandidate.parentNode;
+        while (parentOfTopCandidate.tagName != "BODY" && parentOfTopCandidate.children.length == 1) {
+          topCandidate = parentOfTopCandidate;
+          parentOfTopCandidate = topCandidate.parentNode;
+        }
+        if (!topCandidate.readability) {
+          this._initializeNode(topCandidate);
+        }
       }
 
       // Now that we have the top candidate, look through its siblings for content
       // that might also be related. Things like preambles, content split by ads
       // that we removed, etc.
       var articleContent = doc.createElement("DIV");
       if (isPaging)
         articleContent.id = "readability-content";
@@ -1152,16 +1181,22 @@ Readability.prototype = {
 
     // And there should be no text nodes with real content
     return !this._someNode(element.childNodes, function(node) {
       return node.nodeType === Node.TEXT_NODE &&
              this.REGEXPS.hasContent.test(node.textContent);
     });
   },
 
+  _isEmptyElement: function(node) { // true when node is an element with no child elements and only whitespace text
+    return node.nodeType === Node.ELEMENT_NODE &&
+      node.children.length == 0 && // no element children (text/comment nodes are not counted by .children)
+      node.textContent.trim().length == 0; // nothing left after trimming whitespace
+  },
+
   /**
    * Determine whether element has any children block level elements.
    *
    * @param Element
    */
   _hasChildBlockElement: function (element) {
     return this._someNode(element.childNodes, function(node) {
       return this.DIV_TO_P_ELEMS.indexOf(node.tagName) !== -1 ||
@@ -1739,17 +1774,17 @@ Readability.prototype = {
           if (!this.REGEXPS.videos.test(embeds[ei].src))
             embedCount += 1;
         }
 
         var linkDensity = this._getLinkDensity(node);
         var contentLength = this._getInnerText(node).length;
 
         var haveToRemove =
-          (img > 1 && img > p && !this._hasAncestorTag(node, "figure")) ||
+          (img > 1 && p / img < 0.5 && !this._hasAncestorTag(node, "figure")) ||
           (!isList && li > p) ||
           (input > Math.floor(p/3)) ||
           (!isList && contentLength < 25 && (img === 0 || img > 2) && !this._hasAncestorTag(node, "figure")) ||
           (!isList && weight < 25 && linkDensity > 0.2) ||
           (weight >= 25 && linkDensity > 0.5) ||
           ((embedCount === 1 && contentLength < 75) || embedCount > 1);
         return haveToRemove;
       }
@@ -1896,17 +1931,17 @@ Readability.prototype = {
     // this._parsedPages[uri.spec.replace(/\/$/, '')] = true;
 
     // Pull out any possible next page link first.
     // var nextPageLink = this._findNextPageLink(doc.body);
 
     this._prepDocument();
 
     var metadata = this._getArticleMetadata();
-    var articleTitle = metadata.title;
+    this._articleTitle = metadata.title;
 
     var articleContent = this._grabArticle();
     if (!articleContent)
       return null;
 
     this.log("Grabbed: " + articleContent.innerHTML);
 
     this._postProcessContent(articleContent);
@@ -1927,17 +1962,17 @@ Readability.prototype = {
       if (paragraphs.length > 0) {
         metadata.excerpt = paragraphs[0].textContent.trim();
       }
     }
 
     var textContent = articleContent.textContent;
     return {
       uri: this._uri,
-      title: articleTitle,
+      title: this._articleTitle,
       byline: metadata.byline || this._articleByline,
       dir: this._articleDir,
       content: articleContent.innerHTML,
       textContent: textContent,
       length: textContent.length,
       excerpt: metadata.excerpt,
     };
   }
author	Evan Tseng <evan@tseng.io>
	Mon, 06 Feb 2017 12:03:06 +0800
changeset 479713	6d6fb32bb36414b3b164c4111e861b6314c3e64d
parent 479141	20a8536b0bfac74389d3a57bd8dd957d98779ce1
child 488512	5a83a5ccc7971ddd80d34f6acecb462d1e1e7658
child 488514	6dc329f43c0eeea94ab905d567d213728f31b009
child 488532	fa28d4fef366279b5116a8c7de96f33d47e009cb
child 488733	8dfa9b0b4987769492e4d927cedc704c73f58e66
child 488779	3f9b1b7a18a38f4278e396a66ac69f863e87e7c4
push id	44328
push user	bmo:evan@tseng.io
push date	Tue, 07 Feb 2017 03:32:26 +0000
reviewers	Gijs
bugs	1300697, 1259763, 1167568
milestone	54.0a1