new file mode 100644
--- /dev/null
+++ b/toolkit/components/reader/.eslintrc.js
@@ -0,0 +1,199 @@
+"use strict";
+
+module.exports = {
+ "rules": {
+ // Braces only needed for multi-line arrow function blocks
+ // "arrow-body-style": [2, "as-needed"],
+
+ // Require spacing around =>
+ // "arrow-spacing": 2,
+
+ // Always require spacing around a single line block
+ // "block-spacing": 1,
+
+ // No newline before open brace for a block
+ "brace-style": 2,
+
+ // No space before, always a space after a comma
+ "comma-spacing": [2, {"before": false, "after": true}],
+
+ // Commas at the end of the line not the start
+ // "comma-style": 2,
+
+ // Don't require spaces around computed properties
+ // "computed-property-spacing": [2, "never"],
+
+ // Functions must always return something or nothing
+ "consistent-return": 2,
+
+ // Require braces around blocks that start a new line
+ // Note that this rule is likely to be overridden on a per-directory basis
+ // very frequently.
+ // "curly": [2, "multi-line"],
+
+ // Always require a trailing EOL
+ "eol-last": 2,
+
+ // Require function* name()
+ // "generator-star-spacing": [2, {"before": false, "after": true}],
+
+ // Two space indent
+ "indent": [2, 2, { "SwitchCase": 1 }],
+
+ // Space after colon not before in property declarations
+ "key-spacing": [2, { "beforeColon": false, "afterColon": true, "mode": "minimum" }],
+
+ // Unix linebreaks
+ "linebreak-style": [2, "unix"],
+
+ // Always require parenthesis for new calls
+ "new-parens": 2,
+
+ // Use [] instead of Array()
+ // "no-array-constructor": 2,
+
+ // No duplicate arguments in function declarations
+ "no-dupe-args": 2,
+
+ // No duplicate keys in object declarations
+ "no-dupe-keys": 2,
+
+ // No duplicate cases in switch statements
+ "no-duplicate-case": 2,
+
+ // No labels
+ "no-labels": 2,
+
+ // If an if block ends with a return no need for an else block
+ "no-else-return": 2,
+
+ // No empty statements
+ "no-empty": 2,
+
+ // No empty character classes in regex
+ "no-empty-character-class": 2,
+
+ // Disallow empty destructuring
+ "no-empty-pattern": 2,
+
+ // No assigning to exception variable
+ // "no-ex-assign": 2,
+
+ // No using !! where casting to boolean is already happening
+ // "no-extra-boolean-cast": 2,
+
+ // No double semicolon
+ "no-extra-semi": 2,
+
+ // No overwriting defined functions
+ "no-func-assign": 2,
+
+ // Declarations in Program or Function Body
+ "no-inner-declarations": 2,
+
+ // No invalid regular expressions
+ "no-invalid-regexp": 2,
+
+ // No odd whitespace characters
+ "no-irregular-whitespace": 2,
+
+ // No single if block inside an else block
+ "no-lonely-if": 2,
+
+ // No mixing spaces and tabs in indent
+ "no-mixed-spaces-and-tabs": [2, "smart-tabs"],
+
+ // No unnecessary spacing
+ "no-multi-spaces": [2, { exceptions: { "AssignmentExpression": true, "VariableDeclarator": true, "ArrayExpression": true, "ObjectExpression": true } }],
+
+ // No reassigning native JS objects
+ "no-native-reassign": 2,
+
+ // No (!foo in bar)
+ "no-negated-in-lhs": 2,
+
+ // Nested ternary statements are confusing
+ "no-nested-ternary": 2,
+
+ // Use {} instead of new Object()
+ // "no-new-object": 2,
+
+ // No Math() or JSON()
+ "no-obj-calls": 2,
+
+ // No octal literals
+ "no-octal": 2,
+
+ // No redeclaring variables
+ "no-redeclare": 2,
+
+ // No unnecessary comparisons
+ "no-self-compare": 2,
+
+ // No declaring variables from an outer scope
+ "no-shadow": 2,
+
+ // No declaring variables that hide things like arguments
+ "no-shadow-restricted-names": 2,
+
+ // No spaces between function name and parentheses
+ "no-spaced-func": 2,
+
+ // No trailing whitespace
+ "no-trailing-spaces": 2,
+
+ // No using undeclared variables
+ // "no-undef": 2,
+
+ // Error on newline where a semicolon is needed
+ "no-unexpected-multiline": 2,
+
+ // No unreachable statements
+ "no-unreachable": 2,
+
+ // No expressions where a statement is expected
+ // "no-unused-expressions": 2,
+
+ // No declaring variables that are never used
+ "no-unused-vars": [2, {"vars": "all", "args": "none"}],
+
+ // No using variables before defined
+ // "no-use-before-define": [2, "nofunc"],
+
+ // No using with
+ "no-with": 2,
+
+ // Always require semicolon at end of statement
+ "semi": [2, "always"],
+
+ // Require spaces before and after keywords
+ "keyword-spacing": 2,
+
+ // Require space before blocks
+ "space-before-blocks": 2,
+
+ // Never use spaces before function parentheses
+ // "space-before-function-paren": [2, { "anonymous": "always", "named": "never" }],
+
+ // Require spaces before finally, catch, etc.
+ // "space-before-keywords": [2, "always"],
+
+ // No space padding in parentheses
+ // "space-in-parens": [2, "never"],
+
+ // Require spaces around operators
+ // "space-infix-ops": 2,
+
+ // Require spaces after return, throw and case
+ // "space-return-throw-case": 2,
+
+ // ++ and -- should not need spacing
+ // "space-unary-ops": [2, { "words": true, "nonwords": false }],
+
+ // No comparisons to NaN
+ "use-isnan": 2,
+
+ // Only check typeof against valid results
+ "valid-typeof": 2,
+ },
+}
--- a/toolkit/components/reader/Readability.js
+++ b/toolkit/components/reader/Readability.js
@@ -21,25 +21,24 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* This code is heavily based on Arc90's readability.js (1.7.1) script
* available at: http://code.google.com/p/arc90labs-readability
*/
-var root = this;
/**
* Public constructor.
* @param {Object} uri The URI descriptor object.
* @param {HTMLDocument} doc The document to parse.
* @param {Object} options The options object.
*/
-var Readability = function(uri, doc, options) {
+function Readability(uri, doc, options) {
options = options || {};
this._uri = uri;
this._doc = doc;
this._biggestFrame = false;
this._articleByline = null;
this._articleDir = null;
@@ -78,22 +77,22 @@ var Readability = function(uri, doc, opt
var elDesc = "";
if (e.id)
elDesc = "(#" + e.id + classDesc + ")";
else if (classDesc)
elDesc = "(" + classDesc + ")";
return rv + elDesc;
};
this.log = function () {
- if ("dump" in root) {
+ if (typeof dump !== "undefined") { // typeof yields a string, so compare with "undefined"
var msg = Array.prototype.map.call(arguments, function(x) {
return (x && x.nodeName) ? logEl(x) : x;
}).join(" ");
dump("Reader: (Readability) " + msg + "\n");
- } else if ("console" in root) {
+ } else if (typeof console !== "undefined") {
var args = ["Reader: (Readability) "].concat(arguments);
console.log.apply(console, args);
}
};
} else {
this.log = function () {};
}
}
@@ -117,20 +116,20 @@ Readability.prototype = {
// Element tags to score by default.
DEFAULT_TAGS_TO_SCORE: "section,h2,h3,h4,h5,h6,p,td,pre".toUpperCase().split(","),
// All of the regular expressions in use within readability.
// Defined up here so we don't instantiate them repeatedly in loops.
REGEXPS: {
unlikelyCandidates: /banner|combx|comment|community|disqus|extra|foot|header|menu|modal|related|remark|rss|share|shoutbox|sidebar|skyscraper|sponsor|ad-break|agegate|pagination|pager|popup/i,
okMaybeItsACandidate: /and|article|body|column|main|shadow/i,
- positive: /article|body|content|entry|hentry|main|page|pagination|post|text|blog|story/i,
+ positive: /article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story/i,
negative: /hidden|^hid$| hid$| hid |^hid |banner|combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget/i,
extraneous: /print|archive|comment|discuss|e[\-]?mail|share|reply|all|login|sign|single|utility/i,
- byline: /byline|author|dateline|writtenby/i,
+ byline: /byline|author|dateline|writtenby|p-author/i,
replaceFonts: /<(\/?)font[^>]*>/gi,
normalize: /\s{2,}/g,
videos: /\/\/(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com/i,
nextLink: /(next|weiter|continue|>([^\|]|$)|»([^\|]|$))/i,
prevLink: /(prev|earl|old|new|<|«)/i,
whitespace: /^\s*$/,
hasContent: /\S$/,
},
@@ -146,28 +145,51 @@ Readability.prototype = {
* @return void
**/
_postProcessContent: function(articleContent) {
// Readability cannot open relative uris so we convert them to absolute uris.
this._fixRelativeUris(articleContent);
},
/**
+ * Iterates over a NodeList, calls `filterFn` for each node and removes the
+ * node from its parent if the function returned a truthy value.
+ *
+ * If no function is passed, removes all the nodes in the node list.
+ *
+ * @param NodeList nodeList The nodes to inspect; walked from last to first.
+ * @param Function filterFn Optional; called as (node, index, nodeList) with `this` bound to this object.
+ * @return void
+ */
+ _removeNodes: function(nodeList, filterFn) {
+ for (var i = nodeList.length - 1; i >= 0; i--) {
+ var node = nodeList[i];
+ var parentNode = node.parentNode;
+ if (parentNode) {
+ if (!filterFn || filterFn.call(this, node, i, nodeList)) {
+ parentNode.removeChild(node);
+ }
+ }
+ }
+ },
+
+ /**
* Iterate over a NodeList, which doesn't natively fully implement the Array
* interface.
*
* For convenience, the current object context is applied to the provided
* iterate function.
*
* @param NodeList nodeList The NodeList.
* @param Function fn The iterate function.
+ * @param Boolean backward Accepted but currently ignored; iteration is always forward.
* @return void
*/
- _forEachNode: function(nodeList, fn) {
- return Array.prototype.forEach.call(nodeList, fn, this);
+ _forEachNode: function(nodeList, fn, backward) {
+ Array.prototype.forEach.call(nodeList, fn, this);
},
/**
* Iterate over a NodeList, return true if any of the provided iterate
* function calls returns true, false otherwise.
*
* For convenience, the current object context is applied to the
* provided iterate function.
@@ -278,23 +300,23 @@ Readability.prototype = {
var origTitle = "";
try {
curTitle = origTitle = doc.title;
// If they had an element with id "title" in their HTML
if (typeof curTitle !== "string")
curTitle = origTitle = this._getInnerText(doc.getElementsByTagName('title')[0]);
- } catch(e) {}
+ } catch (e) {/* ignore exceptions setting the title. */}
if (curTitle.match(/ [\|\-] /)) {
- curTitle = origTitle.replace(/(.*)[\|\-] .*/gi,'$1');
+ curTitle = origTitle.replace(/(.*)[\|\-] .*/gi, '$1');
if (curTitle.split(' ').length < 3)
- curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi,'$1');
+ curTitle = origTitle.replace(/[^\|\-]*[\|\-](.*)/gi, '$1');
} else if (curTitle.indexOf(': ') !== -1) {
// Check if we have an heading containing this exact string, so we
// could assume it's the full title.
var headings = this._concatNodeLists(
doc.getElementsByTagName('h1'),
doc.getElementsByTagName('h2')
);
var match = this._someNode(headings, function(heading) {
@@ -329,19 +351,17 @@ Readability.prototype = {
* This includes things like stripping javascript, CSS, and handling terrible markup.
*
* @return void
**/
_prepDocument: function() {
var doc = this._doc;
// Remove all style tags in head
- this._forEachNode(doc.getElementsByTagName("style"), function(styleNode) {
- styleNode.parentNode.removeChild(styleNode);
- });
+ this._removeNodes(doc.getElementsByTagName("style"));
if (doc.body) {
this._replaceBrs(doc.body);
}
this._forEachNode(doc.getElementsByTagName("font"), function(fontNode) {
this._setNodeTag(fontNode, "SPAN");
});
@@ -365,17 +385,17 @@ Readability.prototype = {
/**
* Replaces 2 or more successive <br> elements with a single <p>.
* Whitespace between <br> elements are ignored. For example:
* <div>foo<br>bar<br> <br><br>abc</div>
* will become:
* <div>foo<br>bar<p>abc</p></div>
*/
_replaceBrs: function (elem) {
- this._forEachNode(elem.getElementsByTagName("br"), function(br) {
+ this._forEachNode(this._getAllNodesWithTag(elem, ["br"]), function(br) {
var next = br.nextSibling;
// Whether 2 or more <br> elements have been found and replaced with a
// <p> block.
var replaced = false;
// If we find a <br> chain, remove the <br>s until we hit another element
// or non-whitespace. This leaves behind the first <br> in the chain
@@ -461,46 +481,45 @@ Readability.prototype = {
// Do these last as the previous stuff may have removed junk
// that will affect these
this._cleanConditionally(articleContent, "table");
this._cleanConditionally(articleContent, "ul");
this._cleanConditionally(articleContent, "div");
// Remove extra paragraphs
- this._forEachNode(articleContent.getElementsByTagName('p'), function(paragraph) {
+ this._removeNodes(articleContent.getElementsByTagName('p'), function (paragraph) {
var imgCount = paragraph.getElementsByTagName('img').length;
var embedCount = paragraph.getElementsByTagName('embed').length;
var objectCount = paragraph.getElementsByTagName('object').length;
// At this point, nasty iframes have been removed, only remain embedded video ones.
var iframeCount = paragraph.getElementsByTagName('iframe').length;
var totalCount = imgCount + embedCount + objectCount + iframeCount;
- if (totalCount === 0 && !this._getInnerText(paragraph, false))
- paragraph.parentNode.removeChild(paragraph);
+ return totalCount === 0 && !this._getInnerText(paragraph, false);
});
- this._forEachNode(articleContent.getElementsByTagName("br"), function(br) {
+ this._forEachNode(this._getAllNodesWithTag(articleContent, ["br"]), function(br) {
var next = this._nextElement(br.nextSibling);
if (next && next.tagName == "P")
br.parentNode.removeChild(br);
});
},
/**
* Initialize a node with the readability object. Also checks the
* className/id for special names to add to its score.
*
* @param Element
* @return void
**/
_initializeNode: function(node) {
node.readability = {"contentScore": 0};
- switch(node.tagName) {
+ switch (node.tagName) {
case 'DIV':
node.readability.contentScore += 5;
break;
case 'PRE':
case 'TD':
case 'BLOCKQUOTE':
node.readability.contentScore += 3;
@@ -609,17 +628,17 @@ Readability.prototype = {
return false;
},
_getNodeAncestors: function(node, maxDepth) {
maxDepth = maxDepth || 0;
var i = 0, ancestors = [];
while (node.parentNode) {
- ancestors.push(node.parentNode)
+ ancestors.push(node.parentNode);
if (maxDepth && ++i === maxDepth)
break;
node = node.parentNode;
}
return ancestors;
},
/***
@@ -1037,27 +1056,22 @@ Readability.prototype = {
},
/**
* Removes script tags from the document.
*
* @param Element
**/
_removeScripts: function(doc) {
- this._forEachNode(doc.getElementsByTagName('script'), function(scriptNode) {
+ this._removeNodes(doc.getElementsByTagName('script'), function(scriptNode) {
scriptNode.nodeValue = "";
scriptNode.removeAttribute('src');
-
- if (scriptNode.parentNode)
- scriptNode.parentNode.removeChild(scriptNode);
+ return true;
});
- this._forEachNode(doc.getElementsByTagName('noscript'), function(noscriptNode) {
- if (noscriptNode.parentNode)
- noscriptNode.parentNode.removeChild(noscriptNode);
- });
+ this._removeNodes(doc.getElementsByTagName('noscript'));
},
/**
* Check if this node has only whitespace and a single P element
* Returns false if the DIV node contains non-empty text nodes
* or if it contains no P or more than 1 element.
*
* @param Element
@@ -1096,29 +1110,28 @@ Readability.prototype = {
* @return string
**/
_getInnerText: function(e, normalizeSpaces) {
normalizeSpaces = (typeof normalizeSpaces === 'undefined') ? true : normalizeSpaces;
var textContent = e.textContent.trim();
if (normalizeSpaces) {
return textContent.replace(this.REGEXPS.normalize, " ");
- } else {
- return textContent;
}
+ return textContent;
},
/**
* Get the number of times a string s appears in the node e.
*
* @param Element
* @param string - what to split on. Default is ","
* @return number (integer)
**/
- _getCharCount: function(e,s) {
+ _getCharCount: function(e, s) {
s = s || ",";
return this._getInnerText(e).split(s).length - 1;
},
/**
* Remove the style attribute on every e and under.
* TODO: Test if getElementsByTagName(*) is faster.
*
@@ -1377,44 +1390,42 @@ Readability.prototype = {
for (var page in possiblePages) {
if (possiblePages.hasOwnProperty(page)) {
if (possiblePages[page].score >= 50 &&
(!topPage || topPage.score < possiblePages[page].score))
topPage = possiblePages[page];
}
}
+ var nextHref = null;
if (topPage) {
- var nextHref = topPage.href.replace(/\/$/,'');
+ nextHref = topPage.href.replace(/\/$/, '');
this.log('NEXT PAGE IS ' + nextHref);
this._parsedPages[nextHref] = true;
- return nextHref;
- } else {
- return null;
}
+ return nextHref;
},
_successfulRequest: function(request) {
return (request.status >= 200 && request.status < 300) ||
request.status === 304 ||
(request.status === 0 && request.responseText);
},
_ajax: function(url, options) {
var request = new XMLHttpRequest();
function respondToReadyState(readyState) {
if (request.readyState === 4) {
if (this._successfulRequest(request)) {
if (options.success)
options.success(request);
- } else {
- if (options.error)
- options.error(request);
+ } else if (options.error) {
+ options.error(request);
}
}
}
if (typeof options === 'undefined')
options = {};
request.onreadystatechange = respondToReadyState;
@@ -1457,45 +1468,44 @@ Readability.prototype = {
// First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page.
var eTag = r.getResponseHeader('ETag');
if (eTag) {
if (eTag in this._pageETags) {
this.log("Exact duplicate page found via ETag. Aborting.");
articlePage.style.display = 'none';
return;
- } else {
- this._pageETags[eTag] = 1;
}
+ this._pageETags[eTag] = 1;
}
// TODO: this ends up doubling up page numbers on NYTimes articles. Need to generically parse those away.
var page = doc.createElement("DIV");
// Do some preprocessing to our HTML to make it ready for appending.
// - Remove any script tags. Swap and reswap newlines with a unicode
// character because multiline regex doesn't work in javascript.
// - Turn any noscript tags into divs so that we can parse them. This
// allows us to find any next page links hidden via javascript.
// - Turn all double br's into p's - was handled by prepDocument in the original view.
// Maybe in the future abstract out prepDocument to work for both the original document
// and AJAX-added pages.
- var responseHtml = r.responseText.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\n/g,'\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
- responseHtml = responseHtml.replace(/\uffff/g,'\n').replace(/<(\/?)noscript/gi, '<$1div');
+ var responseHtml = r.responseText.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
+ responseHtml = responseHtml.replace(/\n/g, '\uffff').replace(/<script.*?>.*?<\/script>/gi, '');
+ responseHtml = responseHtml.replace(/\uffff/g, '\n').replace(/<(\/?)noscript/gi, '<$1div');
responseHtml = responseHtml.replace(this.REGEXPS.replaceFonts, '<$1span>');
page.innerHTML = responseHtml;
this._replaceBrs(page);
// Reset all flags for the next page, as they will search through it and
// disable as necessary at the end of grabArticle.
this._flags = 0x1 | 0x2 | 0x4;
- var nextPageLink = this._findNextPageLink(page);
+ var secondNextPageLink = this._findNextPageLink(page);
// NOTE: if we end up supporting _appendNextPage(), we'll need to
// change this call to be async
var content = this._grabArticle(page);
if (!content) {
this.log("No content found in page to append. Aborting.");
return;
@@ -1524,18 +1534,18 @@ Readability.prototype = {
// After the page has rendered, post process the content. This delay is necessary because,
// in webkit at least, offsetWidth is not set in time to determine image width. We have to
// wait a little bit for reflow to finish before we can fix floating images.
setTimeout((function() {
this._postProcessContent(thisPage);
}).bind(this), 500);
- if (nextPageLink)
- this._appendNextPage(nextPageLink);
+ if (secondNextPageLink)
+ this._appendNextPage(secondNextPageLink);
}
});
}).bind(this)(nextPageLink, articlePage);
},
/**
* Get an elements class/id weight. Uses regular expressions to tell if this
* element looks good or bad.
@@ -1576,33 +1586,33 @@ Readability.prototype = {
*
* @param Element
* @param string tag to clean
* @return void
**/
_clean: function(e, tag) {
var isEmbed = ["object", "embed", "iframe"].indexOf(tag) !== -1;
- this._forEachNode(e.getElementsByTagName(tag), function(element) {
+ this._removeNodes(e.getElementsByTagName(tag), function(element) {
// Allow youtube and vimeo videos through as people usually want to see those.
if (isEmbed) {
var attributeValues = [].map.call(element.attributes, function(attr) {
return attr.value;
}).join("|");
// First, check the elements attributes to see if any of them contain youtube or vimeo
if (this.REGEXPS.videos.test(attributeValues))
- return;
+ return false;
// Then check the elements inside this element for the same.
if (this.REGEXPS.videos.test(element.innerHTML))
- return;
+ return false;
}
- element.parentNode.removeChild(element);
+ return true;
});
},
/**
* Check if a given node has one of its ancestor tag name matching the
* provided one.
* @param HTMLElement node
* @param String tagName
@@ -1629,88 +1639,78 @@ Readability.prototype = {
* "Fishy" is an algorithm based on content length, classnames, link density, number of images & embeds, etc.
*
* @return void
**/
_cleanConditionally: function(e, tag) {
if (!this._flagIsActive(this.FLAG_CLEAN_CONDITIONALLY))
return;
- var tagsList = e.getElementsByTagName(tag);
- var curTagsLength = tagsList.length;
var isList = tag === "ul" || tag === "ol";
// Gather counts for other typical elements embedded within.
// Traverse backwards so we can remove nodes at the same time
// without effecting the traversal.
//
// TODO: Consider taking into account original contentScore here.
- for (var i = curTagsLength-1; i >= 0; i -= 1) {
- var weight = this._getClassWeight(tagsList[i]);
+ this._removeNodes(e.getElementsByTagName(tag), function(node) {
+ var weight = this._getClassWeight(node);
var contentScore = 0;
- this.log("Cleaning Conditionally", tagsList[i]);
+ this.log("Cleaning Conditionally", node);
if (weight + contentScore < 0) {
- tagsList[i].parentNode.removeChild(tagsList[i]);
- } else if (this._getCharCount(tagsList[i],',') < 10) {
+ return true;
+ }
+
+ if (this._getCharCount(node, ',') < 10) {
// If there are not very many commas, and the number of
// non-paragraph elements is more than paragraphs or other
// ominous signs, remove the element.
- var p = tagsList[i].getElementsByTagName("p").length;
- var img = tagsList[i].getElementsByTagName("img").length;
- var li = tagsList[i].getElementsByTagName("li").length-100;
- var input = tagsList[i].getElementsByTagName("input").length;
+ var p = node.getElementsByTagName("p").length;
+ var img = node.getElementsByTagName("img").length;
+ var li = node.getElementsByTagName("li").length-100;
+ var input = node.getElementsByTagName("input").length;
var embedCount = 0;
- var embeds = tagsList[i].getElementsByTagName("embed");
+ var embeds = node.getElementsByTagName("embed");
for (var ei = 0, il = embeds.length; ei < il; ei += 1) {
if (!this.REGEXPS.videos.test(embeds[ei].src))
embedCount += 1;
}
- var linkDensity = this._getLinkDensity(tagsList[i]);
- var contentLength = this._getInnerText(tagsList[i]).length;
- var toRemove = false;
- if (img > p && !this._hasAncestorTag(tagsList[i], "figure")) {
- toRemove = true;
- } else if (!isList && li > p) {
- toRemove = true;
- } else if (input > Math.floor(p/3)) {
- toRemove = true;
- } else if (!isList && contentLength < 25 && (img === 0 || img > 2)) {
- toRemove = true;
- } else if (!isList && weight < 25 && linkDensity > 0.2) {
- toRemove = true;
- } else if (weight >= 25 && linkDensity > 0.5) {
- toRemove = true;
- } else if ((embedCount === 1 && contentLength < 75) || embedCount > 1) {
- toRemove = true;
- }
+ var linkDensity = this._getLinkDensity(node);
+ var contentLength = this._getInnerText(node).length;
- if (toRemove) {
- tagsList[i].parentNode.removeChild(tagsList[i]);
- }
+ var haveToRemove =
+ // Image-heavy: more <img> than <p>, unless the node has a <figure> ancestor.
+ (img > p && !this._hasAncestorTag(node, "figure")) ||
+ (!isList && li > p) ||
+ (input > Math.floor(p/3)) ||
+ (!isList && contentLength < 25 && (img === 0 || img > 2)) ||
+ (!isList && weight < 25 && linkDensity > 0.2) ||
+ (weight >= 25 && linkDensity > 0.5) ||
+ ((embedCount === 1 && contentLength < 75) || embedCount > 1);
+ return haveToRemove;
}
- }
+ return false;
+ });
},
/**
* Clean out spurious headers from an Element. Checks things like classnames and link density.
*
* @param Element
* @return void
**/
_cleanHeaders: function(e) {
for (var headerIndex = 1; headerIndex < 3; headerIndex += 1) {
- var headers = e.getElementsByTagName('h' + headerIndex);
- for (var i = headers.length - 1; i >= 0; i -= 1) {
- if (this._getClassWeight(headers[i]) < 0)
- headers[i].parentNode.removeChild(headers[i]);
- }
+ this._removeNodes(e.getElementsByTagName('h' + headerIndex), function (header) {
+ return this._getClassWeight(header) < 0;
+ });
}
},
_flagIsActive: function(flag) {
return (this._flags & flag) > 0;
},
_addFlag: function(flag) {
@@ -1724,16 +1724,32 @@ Readability.prototype = {
/**
* Decides whether or not the document is reader-able without parsing the whole thing.
*
* @return boolean Whether or not we suspect parse() will suceeed at returning an article object.
*/
isProbablyReaderable: function(helperIsVisible) {
var nodes = this._getAllNodesWithTag(this._doc, ["p", "pre"]);
+ // Get <div> nodes which have <br> node(s) and append them into the `nodes` variable.
+ // Some articles' DOM structures might look like
+ // <div>
+ // Sentences<br>
+ // <br>
+ // Sentences<br>
+ // </div>
+ var brNodes = this._getAllNodesWithTag(this._doc, ["div > br"]);
+ if (brNodes.length) {
+ var set = new Set();
+ [].forEach.call(brNodes, function(node) {
+ set.add(node.parentNode);
+ });
+ nodes = [].concat.apply(Array.from(set), nodes);
+ }
+
// FIXME we should have a fallback for helperIsVisible, but this is
// problematic because of jsdom's elem.style handling - see
// https://github.com/mozilla/readability/pull/186 for context.
var score = 0;
// This is a little cheeky, we use the accumulator 'score' to decide what to return from
// this callback:
return this._someNode(nodes, function(node) {
@@ -1827,17 +1843,21 @@ Readability.prototype = {
// the article's content.
if (!metadata.excerpt) {
var paragraphs = articleContent.getElementsByTagName("p");
if (paragraphs.length > 0) {
metadata.excerpt = paragraphs[0].textContent.trim();
}
}
- return { uri: this._uri,
- title: articleTitle,
- byline: metadata.byline || this._articleByline,
- dir: this._articleDir,
- content: articleContent.innerHTML,
- length: articleContent.textContent.length,
- excerpt: metadata.excerpt };
+ var textContent = articleContent.textContent;
+ return {
+ uri: this._uri,
+ title: articleTitle,
+ byline: metadata.byline || this._articleByline,
+ dir: this._articleDir,
+ content: articleContent.innerHTML,
+ textContent: textContent,
+ length: textContent.length,
+ excerpt: metadata.excerpt,
+ };
}
};