Bug 1444082 - sync reader mode to github tip ( 8525c6af36d3badbe27c4672a6f2dd99ddb4097f ), r?johannh
MozReview-Commit-ID: LZLFf9kyUR5
--- a/mobile/android/tests/browser/robocop/reader_mode_pages/not_an_article.html
+++ b/mobile/android/tests/browser/robocop/reader_mode_pages/not_an_article.html
@@ -29,104 +29,15 @@
<body class=""
data-readonly="false"
data-static-url="//support.cdn.mozilla.net/static/"
data-orientation="right"
data-ga-push="[]"
data-usernames-api="/en-US/users/api/usernames"
>
-<nav class="scrollable">
- <div id="search-bar">
- <form id="search" action="/en-US/search">
- <input type="hidden" name="product" value="mobile" />
- <input name="q" placeholder="Search Mozilla Support" required="required" type="search" value="">
- <button class="icon-sprite" type="submit">Search</button>
- </form>
-
- </div>
-
- <a href="/en-US/products">Home</a>
- <a href="/en-US/questions/new">Ask a question</a>
- <a href="/en-US/questions">Support Forum</a>
-
- <header>Navigation</header>
- <a href="/en-US/get-involved">Help other users</a>
- <a href="?&mobile=0">Switch to desktop site</a>
-
- <header>Profile</header>
- <a href="/en-US/users/login">Sign in</a>
-
- <header>Languages</header>
- <a href="/en-US/locales" class="locale-picker">Switch language</a>
- </nav>
-
-<header class="slide-on-exposed">
- <div id="menu-button" class="icon-sprite"></div>
- <h1>
- Firefox for Android
- </h1>
- </header>
-
-
-<div class="wrapper slide-on-exposed">
- <section id="content">
- <ul id="topics">
- <li>
- <a href="/en-US/products/mobile/get-started" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-get-started" alt="">
- Learn the Basics: get started
- </a>
- </li>
- <li>
- <a href="/en-US/products/mobile/download-and-install" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-download-and-install" alt="">
- Download, install and migration
- </a>
- </li>
- <li>
- <a href="/en-US/products/mobile/privacy-and-security" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-privacy-and-security" alt="">
- Privacy and security settings
- </a>
- </li>
- <li>
- <a href="/en-US/products/mobile/customize" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-customize" alt="">
- Customize controls, options and add-ons
- </a>
- </li>
- <li>
- <a href="/en-US/products/mobile/sync" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-sync" alt="">
- Firefox Sync settings
- </a>
- </li>
- <li>
- <a href="/en-US/products/mobile/fix-problems" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-fix-problems" alt="">
- Fix slowness, crashing, error messages and other problems
- </a>
- </li>
- <li>
- <a href="/en-US/kb/get-community-support" class="cf">
- <img src="//support.cdn.mozilla.net/static/img/blank.png" class="topic-sprite topic-get-community-support" alt="">
- Get community support
- </a>
- </li>
- </ul>
-
- </section>
-
- <footer>
- </footer>
-
- <ul id="notifications">
- </ul>
-</div>
-
<script src="//support.cdn.mozilla.net/static/jsi18n/en-us/javascript.js?beb7c1e"></script>
<script src="//support.cdn.mozilla.net/static/js/mobile/common-min.js?build=beb7c1e"></script>
</body>
</html>
--- a/toolkit/components/reader/JSDOMParser.js
+++ b/toolkit/components/reader/JSDOMParser.js
@@ -555,17 +555,18 @@
delete this._textContent;
},
set textContent(newText) {
this._textContent = newText;
delete this._innerHTML;
},
};
- var Document = function () {
+ var Document = function (url) {
+ this.documentURI = url;
this.styleSheets = [];
this.childNodes = [];
this.children = [];
};
Document.prototype = {
__proto__: Node.prototype,
@@ -595,16 +596,30 @@
return node;
},
createTextNode: function (text) {
var node = new Text();
node.textContent = text;
return node;
},
+
+ get baseURI() {
+ if (!this.hasOwnProperty("_baseURI")) {
+ this._baseURI = this.documentURI;
+ var baseElements = this.getElementsByTagName("base");
+ var href = baseElements[0] && baseElements[0].getAttribute("href");
+ if (href) {
+ try {
+ this._baseURI = (new URL(href, this._baseURI)).href;
+ } catch (ex) {/* Just fall back to documentURI */}
+ }
+ }
+ return this._baseURI;
+ },
};
var Element = function (tag) {
this.attributes = [];
this.childNodes = [];
this.children = [];
this.nextElementSibling = this.previousElementSibling = null;
this.localName = tag.toLowerCase();
@@ -1113,19 +1128,19 @@
}
return node;
},
/**
* Parses an HTML string and returns a JS implementation of the Document.
*/
- parse: function (html) {
+ parse: function (html, url) {
this.html = html;
- var doc = this.doc = new Document();
+ var doc = this.doc = new Document(url);
this.readChildren(doc);
// If this is an HTML document, remove root-level children except for the
// <html> node
if (doc.documentElement) {
for (var i = doc.childNodes.length; --i >= 0;) {
var child = doc.childNodes[i];
if (child !== doc.documentElement) {
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -36,16 +36,17 @@
function Readability(uri, doc, options) {
options = options || {};
this._uri = uri;
this._doc = doc;
this._articleTitle = null;
this._articleByline = null;
this._articleDir = null;
+ this._attempts = [];
// Configurable options
this._debug = !!options.debug;
this._maxElemsToParse = options.maxElemsToParse || this.DEFAULT_MAX_ELEMS_TO_PARSE;
this._nbTopCandidates = options.nbTopCandidates || this.DEFAULT_N_TOP_CANDIDATES;
this._wordThreshold = options.wordThreshold || this.DEFAULT_WORD_THRESHOLD;
this._classesToPreserve = this.CLASSES_TO_PRESERVE.concat(options.classesToPreserve || []);
@@ -270,44 +271,30 @@ Readability.prototype = {
/**
* Converts each <a> and <img> uri in the given element to an absolute URI,
* ignoring #ref URIs.
*
* @param Element
* @return void
*/
_fixRelativeUris: function(articleContent) {
- var scheme = this._uri.scheme;
- var prePath = this._uri.prePath;
- var pathBase = this._uri.pathBase;
-
+ var baseURI = this._doc.baseURI;
+ var documentURI = this._doc.documentURI;
function toAbsoluteURI(uri) {
- // If this is already an absolute URI, return it.
- if (/^[a-zA-Z][a-zA-Z0-9\+\-\.]*:/.test(uri))
+ // Leave hash links alone if the base URI matches the document URI:
+ if (baseURI == documentURI && uri.charAt(0) == "#") {
return uri;
-
- // Scheme-rooted relative URI.
- if (uri.substr(0, 2) == "//")
- return scheme + "://" + uri.substr(2);
-
- // Prepath-rooted relative URI.
- if (uri[0] == "/")
- return prePath + uri;
-
- // Dotslash relative URI.
- if (uri.indexOf("./") === 0)
- return pathBase + uri.slice(2);
-
- // Ignore hash URIs:
- if (uri[0] == "#")
- return uri;
-
- // Standard relative URI; add entire path. pathBase already includes a
- // trailing "/".
- return pathBase + uri;
+ }
+ // Otherwise, resolve against base URI:
+ try {
+ return new URL(uri, baseURI).href;
+ } catch (ex) {
+ // Something went wrong, just return the original:
+ }
+ return uri;
}
var links = articleContent.getElementsByTagName("a");
this._forEachNode(links, function(link) {
var href = link.getAttribute("href");
if (href) {
// Replace links with javascript: URIs with text content, since
// they won't work after scripts have been removed from the page.
@@ -530,16 +517,17 @@ Readability.prototype = {
// Clean out junk from the article content
this._cleanConditionally(articleContent, "form");
this._cleanConditionally(articleContent, "fieldset");
this._clean(articleContent, "object");
this._clean(articleContent, "embed");
this._clean(articleContent, "h1");
this._clean(articleContent, "footer");
+ this._clean(articleContent, "link");
// Clean out elements have "share" in their id/class combinations from final top candidates,
// which means we don't remove the top candidates even they have "share".
this._forEachNode(articleContent.children, function(topCandidate) {
this._cleanMatchedNodes(topCandidate, /share/);
});
// If there is only one h2 and its text content substantially equals article title,
@@ -1084,34 +1072,55 @@ Readability.prototype = {
div.appendChild(children[0]);
}
articleContent.appendChild(div);
}
if (this._debug)
this.log("Article content after paging: " + articleContent.innerHTML);
+ var parseSuccessful = true;
+
// Now that we've gone through the full algorithm, check to see if
// we got any meaningful content. If we didn't, we may need to re-run
// grabArticle with different flags set. This gives us a higher likelihood of
// finding the content, and the sieve approach gives us a higher likelihood of
// finding the -right- content.
- if (this._getInnerText(articleContent, true).length < this._wordThreshold) {
+ var textLength = this._getInnerText(articleContent, true).length;
+ if (textLength < this._wordThreshold) {
+ parseSuccessful = false;
page.innerHTML = pageCacheHtml;
if (this._flagIsActive(this.FLAG_STRIP_UNLIKELYS)) {
this._removeFlag(this.FLAG_STRIP_UNLIKELYS);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_WEIGHT_CLASSES)) {
this._removeFlag(this.FLAG_WEIGHT_CLASSES);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else if (this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY)) {
this._removeFlag(this.FLAG_CLEAN_CONDITIONALLY);
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
} else {
- return null;
+ this._attempts.push({articleContent: articleContent, textLength: textLength});
+ // No luck after removing flags, just return the longest text we found during the different loops
+ this._attempts.sort(function (a, b) {
+ return a.textLength < b.textLength;
+ });
+
+ // But first check if we actually have something
+ if (!this._attempts[0].textLength) {
+ return null;
+ }
+
+ articleContent = this._attempts[0].articleContent;
+ parseSuccessful = true;
}
- } else {
+ }
+
+ if (parseSuccessful) {
// Find out text direction from ancestors of final top candidate.
var ancestors = [parentOfTopCandidate, topCandidate].concat(this._getNodeAncestors(parentOfTopCandidate));
this._someNode(ancestors, function(ancestor) {
if (!ancestor.tagName)
return false;
var articleDir = ancestor.getAttribute("dir");
if (articleDir) {
this._articleDir = articleDir;
--- a/toolkit/components/reader/ReaderWorker.js
+++ b/toolkit/components/reader/ReaderWorker.js
@@ -42,12 +42,12 @@ var Agent = {
*
* @param {object} uri URI data for the document.
* @param {string} serializedDoc The serialized document.
* @param {object} options Options object to pass to Readability.
*
* @return {object} Article object returned from Readability.
*/
parseDocument(uri, serializedDoc, options) {
- let doc = new JSDOMParser().parse(serializedDoc);
+ let doc = new JSDOMParser().parse(serializedDoc, uri.spec);
return new Readability(uri, doc, options).parse();
},
};
--- a/toolkit/components/reader/test/readerModeNonArticle.html
+++ b/toolkit/components/reader/test/readerModeNonArticle.html
@@ -1,14 +1,9 @@
<!DOCTYPE html>
<html>
<head>
<title>Non article title</title>
<meta name="description" content="This is the non-article description." />
</head>
<body>
-<header>Site header</header>
-<div>
-<h1>Non article title</h1>
-<p>Woot!</p>
-</div>
</body>
</html>