Bug 1301715 - Extract website metadata and make it available. r?ahunt draft
author Sebastian Kaspari <s.kaspari@gmail.com>
Thu, 08 Sep 2016 10:20:24 +0200
changeset 417166 00519e7502ff7930eb9b5efd2ef73679509ab7fd
parent 416562 f0e6cc6360213ba21fd98c887b55fce5c680df68
child 417167 66f1fa26ab65c2e973f6f9d21824a8c5e3aa24dd
push id 30349
push user s.kaspari@gmail.com
push date Fri, 23 Sep 2016 17:59:39 +0000
reviewers ahunt
bugs 1301715
milestone 52.0a1
Bug 1301715 - Extract website metadata and make it available. r?ahunt This patch introduces WebsiteMetadata.jsm which imports fathom and page-metadata-parser. The code has been slightly modified to not depend on more node libraries. On DOMContentLoaded the module will extract the metadata asynchronously and send it with a 'Website:Metadata' event. MozReview-Commit-ID: LxhYOTvvdsF
mobile/android/chrome/content/browser.js
mobile/android/modules/WebsiteMetadata.jsm
mobile/android/modules/moz.build
--- a/mobile/android/chrome/content/browser.js
+++ b/mobile/android/chrome/content/browser.js
@@ -111,16 +111,18 @@ XPCOMUtils.defineLazyModuleGetter(this, 
                                   "resource://gre/modules/Notifications.jsm");
 
 XPCOMUtils.defineLazyModuleGetter(this, "ReaderMode", "resource://gre/modules/ReaderMode.jsm");
 
 XPCOMUtils.defineLazyModuleGetter(this, "Snackbars", "resource://gre/modules/Snackbars.jsm");
 
 XPCOMUtils.defineLazyModuleGetter(this, "RuntimePermissions", "resource://gre/modules/RuntimePermissions.jsm");
 
+XPCOMUtils.defineLazyModuleGetter(this, "WebsiteMetadata", "resource://gre/modules/WebsiteMetadata.jsm");
+
 XPCOMUtils.defineLazyServiceGetter(this, "FontEnumerator",
   "@mozilla.org/gfx/fontenumerator;1",
   "nsIFontEnumerator");
 
 var lazilyLoadedBrowserScripts = [
   ["SelectHelper", "chrome://browser/content/SelectHelper.js"],
   ["InputWidgetHelper", "chrome://browser/content/InputWidgetHelper.js"],
   ["MasterPassword", "chrome://browser/content/MasterPassword.js"],
@@ -3935,16 +3937,19 @@ Tab.prototype = {
           this.browser.addEventListener("click", ErrorPageEventHandler, true);
           let listener = function() {
             this.browser.removeEventListener("click", ErrorPageEventHandler, true);
             this.browser.removeEventListener("pagehide", listener, true);
           }.bind(this);
 
           this.browser.addEventListener("pagehide", listener, true);
         }
+
+        WebsiteMetadata.parseAsynchronously(this.browser.contentDocument);
+
         break;
       }
 
       case "DOMFormHasPassword": {
         LoginManagerContent.onDOMFormHasPassword(aEvent,
                                                  this.browser.contentWindow);
 
         // Send logins for this hostname to Java.
new file mode 100644
--- /dev/null
+++ b/mobile/android/modules/WebsiteMetadata.jsm
@@ -0,0 +1,468 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+'use strict';
+
+const { classes: Cc, interfaces: Ci, utils: Cu } = Components;
+
+this.EXPORTED_SYMBOLS = ["WebsiteMetadata"];
+
+Cu.import("resource://gre/modules/XPCOMUtils.jsm");
+
+XPCOMUtils.defineLazyModuleGetter(this, "Messaging", "resource://gre/modules/Messaging.jsm");
+XPCOMUtils.defineLazyModuleGetter(this, "Task", "resource://gre/modules/Task.jsm");
+
+var WebsiteMetadata = {
+  /**
+   * Extract the metadata of the given document and send it, together with the document's
+   * location, as a 'Website:Metadata' message. The work is handed to Task.spawn().
+   *
+   * @param doc the document to extract the metadata from.
+   */
+  parseAsynchronously: function(doc) {
+    Task.spawn(function() {
+      const url = doc.location.href;
+      const metadata = getMetadata(doc, url);
+      Messaging.sendRequest({
+        type: 'Website:Metadata',
+        location: url,
+        metadata,
+      });
+    });
+  }
+};
+
+// #################################################################################################
+// # Modified version of makeUrlAbsolute() to not import url parser library (and dependencies)
+// #################################################################################################
+
+function makeUrlAbsolute(context, relative) {
+  const anchor = context.doc.createElement('a');
+  anchor.href = relative;  // reading href back below yields the resolved URL
+  return anchor.href;
+}
+
+// #################################################################################################
+// # page-metadata-parser
+// # https://github.com/mozilla/page-metadata-parser/
+// # 61c58cbd0f0bf2153df832a388a79c66b288b98c
+// #################################################################################################
+
+function buildRuleset(name, rules, processors) {
+  // Rules are listed best-first, so after reversing, a rule's index doubles as its score.
+  const reversedRules = Array.from(rules).reverse();
+  const builtRuleset = ruleset(...reversedRules.map(([query, handler], order) => rule(
+    dom(query),
+    node => [{
+      score: order,
+      flavor: name,
+      notes: handler(node),
+    }]
+  )));
+
+  // Returns the extracted value for `doc`, or undefined when nothing usable was found.
+  return (doc, context) => {
+    const maxNode = builtRuleset.score(doc).max(name);
+    let value = maxNode ? maxNode.flavors.get(name) : undefined;
+
+    // Handlers return null for a missing attribute (and '' for an empty one). Bail out so
+    // processors never see such values: keywords.split() would throw on null and
+    // makeUrlAbsolute() would turn null into a bogus ".../null" URL.
+    if (!value) {
+      return undefined;
+    }
+    if (processors) {
+      processors.forEach(processor => {
+        value = processor(value, context);
+      });
+    }
+    if (value) {
+      return value.trim ? value.trim() : value;
+    }
+  };
+}
+
+// Handlers shared by the rules below: read one attribute off the matched element.
+const contentOf = node => node.element.getAttribute('content');
+const hrefOf = node => node.element.getAttribute('href');
+
+// Per-field extraction rules: [selector, handler] pairs, highest priority first, plus optional
+// processors that are applied in order to the value produced by the winning rule.
+const metadataRules = {
+  description: {
+    rules: [
+      ['meta[property="og:description"]', contentOf],
+      ['meta[name="description"]', contentOf],
+    ],
+  },
+
+  icon_url: {
+    rules: [
+      ['link[rel="apple-touch-icon"]', hrefOf],
+      ['link[rel="apple-touch-icon-precomposed"]', hrefOf],
+      ['link[rel="icon"]', hrefOf],
+      ['link[rel="fluid-icon"]', hrefOf],
+      ['link[rel="shortcut icon"]', hrefOf],
+      ['link[rel="Shortcut Icon"]', hrefOf],
+      ['link[rel="mask-icon"]', hrefOf],
+    ],
+    processors: [(iconUrl, context) => makeUrlAbsolute(context, iconUrl)],
+  },
+
+  image_url: {
+    rules: [
+      ['meta[property="og:image:secure_url"]', contentOf],
+      ['meta[property="og:image:url"]', contentOf],
+      ['meta[property="og:image"]', contentOf],
+      ['meta[property="twitter:image"]', contentOf],
+      ['meta[name="thumbnail"]', contentOf],
+    ],
+    processors: [(imageUrl, context) => makeUrlAbsolute(context, imageUrl)],
+  },
+
+  keywords: {
+    rules: [
+      ['meta[name="keywords"]', contentOf],
+    ],
+    processors: [(list) => list.split(',').map((entry) => entry.trim())],
+  },
+
+  title: {
+    rules: [
+      ['meta[property="og:title"]', contentOf],
+      ['meta[property="twitter:title"]', contentOf],
+      ['meta[name="hdl"]', contentOf],
+      ['title', node => node.element.text],
+    ],
+  },
+
+  type: {
+    rules: [
+      ['meta[property="og:type"]', contentOf],
+    ],
+  },
+
+  url: {
+    rules: [
+      ['meta[property="og:url"]', contentOf],
+      ['link[rel="canonical"]', hrefOf],
+    ],
+  },
+};
+
+// Extract every field described by `rules` (default: metadataRules) from `doc`. Entries with a
+// `rules` array become a single value; any other entry is treated as a nested group of fields.
+function getMetadata(doc, url, rules) {
+  const metadata = {};
+  const context = {url, doc};
+  const ruleSet = rules || metadataRules;
+
+  for (const metadataKey of Object.keys(ruleSet)) {
+    const metadataRule = ruleSet[metadataKey];
+    if (Array.isArray(metadataRule.rules)) {
+      const extract = buildRuleset(metadataKey, metadataRule.rules, metadataRule.processors);
+      metadata[metadataKey] = extract(doc, context);
+    } else {
+      metadata[metadataKey] = getMetadata(doc, url, metadataRule);
+    }
+  }
+  return metadata;
+}
+
+// #################################################################################################
+// # Fathom dependencies resolved
+// #################################################################################################
+
+// Minimal replacement for wu's forEach(): call `fn` once with each value produced by `obj`.
+function forEach(fn, obj) {
+    for (const value of obj) {
+        fn(value);
+    }
+}
+
+// Scan `iterable`, keeping an item whenever isBetter(by(item), bestKeySoFar) holds (the first
+// item is always kept), and return the last item kept. Throws if `iterable` is empty.
+function best(iterable, by, isBetter) {
+    let winner, winningKey;
+    let seenAny = false;
+    for (const candidate of iterable) {
+        const candidateKey = by(candidate);
+        if (isBetter(candidateKey, winningKey) || !seenAny) {
+            winner = candidate;
+            winningKey = candidateKey;
+            seenAny = true;
+        }
+    }
+    if (!seenAny) {
+        throw new Error('Tried to call best() on empty iterable');
+    }
+    return winner;
+}
+
+// Replaces `const {max} = require('./utils')`: the item of `iterable` with the greatest by(item).
+function max(iterable, by = item => item) {  // no `identity` helper is defined in this module
+    return best(iterable, by, (a, b) => a > b);
+}
+
+// #################################################################################################
+// # Fathom
+// # https://github.com/mozilla/fathom
+// # cac59e470816f17fc1efd4a34437b585e3e451cd
+// #################################################################################################
+
+// Look up `key` in `map`. When it is missing, call defaultMaker(), store the result under `key`
+// and return it, so later lookups see the same value. defaultMaker() only runs on a miss.
+function getDefault(map, key, defaultMaker) {
+    if (!map.has(key)) {
+        map.set(key, defaultMaker());
+    }
+
+    return map.get(key);
+}
+
+
+// Construct a filtration network of rules (as built by rule()); the result exposes score().
+function ruleset(...rules) {
+    const rulesByInputFlavor = new Map();  // [someInputFlavor: [rule, ...]]
+
+    // File each rule under its input flavor:
+    forEach(rule => getDefault(rulesByInputFlavor, rule.source.inputFlavor, () => []).push(rule),
+            rules);
+
+    return {
+        // Iterate over a DOM tree or subtree, building up a knowledgebase, a
+        // data structure holding scores and annotations for interesting
+        // elements. Return the knowledgebase.
+        //
+        // This is the "rank" portion of the rank-and-yank algorithm.
+        score: function (tree) {
+            const kb = knowledgebase();
+
+            // Introduce the whole DOM into the KB as flavor 'dom' to get
+            // things started:
+            const nonterminals = [[{tree}, 'dom']];  // [[node, flavor], [node, flavor], ...]
+
+            // While there are new facts, run the applicable rules over them to
+            // generate even newer facts. Repeat until everything's fully
+            // digested. Rules run in no particular guaranteed order.
+            while (nonterminals.length) {
+                const [inNode, inFlavor] = nonterminals.pop();  // most recently added fact first
+                for (let rule of getDefault(rulesByInputFlavor, inFlavor, () => [])) {
+                    const outFacts = resultsOf(rule, inNode, inFlavor, kb);
+                    for (let fact of outFacts) {
+                        const outNode = kb.nodeForElement(fact.element);
+
+                        // No matter whether or not this flavor has been
+                        // emitted before for this node, we multiply the score.
+                        // We want to be able to add rules that refine the
+                        // scoring of a node, without having to rewire the path
+                        // of flavors that winds through the ruleset.
+                        //
+                        // 1 score per Node is plenty. That simplifies our
+                        // data, our rankers, our flavor system (since we don't
+                        // need to represent score axes), and our engine. If
+                        // somebody wants more score axes, they can fake it
+                        // themselves with notes, thus paying only for what
+                        // they eat. (We can even provide functions that help
+                        // with that.) Most rulesets will probably be concerned
+                        // with scoring only 1 thing at a time anyway. So,
+                        // rankers return a score multiplier + 0 or more new
+                        // flavors with optional notes. Facts can never be
+                        // deleted from the KB by rankers (or order would start
+                        // to matter); after all, they're *facts*.
+                        outNode.score *= fact.score;
+
+                        // Add a new annotation to a node--but only if there
+                        // wasn't one of the given flavor there already;
+                        // otherwise there's no point.
+                        //
+                        // You might argue that we might want to modify an
+                        // existing note here, but that would be a bad
+                        // idea. Notes of a given flavor should be
+                        // considered immutable once laid down. Otherwise, the
+                        // order of execution of same-flavored rules could
+                        // matter, hurting pluggability. Emit a new flavor and
+                        // a new note if you want to do that.
+                        //
+                        // Also, choosing not to add a new fact to nonterminals
+                        // when we're not adding a new flavor saves the work of
+                        // running the rules against it, which would be
+                        // entirely redundant and perform no new work (unless
+                        // the rankers were nondeterministic, but don't do
+                        // that).
+                        if (!outNode.flavors.has(fact.flavor)) {
+                            outNode.flavors.set(fact.flavor, fact.notes);
+                            kb.indexNodeByFlavor(outNode, fact.flavor);  // TODO: better encapsulation rather than indexing explicitly
+                            nonterminals.push([outNode, fact.flavor]);
+                        }
+                    }
+                }
+            }
+            return kb;
+        }
+    };
+}
+
+
+// Construct a container for storing and querying facts, where a fact has a flavor (used to
+// dispatch further rules upon), a corresponding DOM element, a score, and some other
+// arbitrary notes opaque to fathom.
+//
+// Each element is represented by one "node" (our own data structure that we control):
+//
+//     {element: <someElement>,
+//      // Global nodewide score. Add custom ones with notes if you want.
+//      score: 8,
+//      // Map of flavor names to notes; an empty note is stored as undefined.
+//      flavors: Map{'texty' -> {ownText: 'blah', someOtherNote: 'foo', someCustomScore: 10},
+//                   'fluffy' -> undefined}}
+function knowledgebase() {
+    // Flavor name -> nodes of that flavor, e.g. Map{'spiffy' -> [NodeA, NodeB]}
+    const nodesByFlavor = new Map();
+    // DOM element -> its node.
+    const nodesByElement = new Map();
+
+    return {
+        // Return the node that corresponds to a given DOM element, creating one (score 1,
+        // no flavors) if necessary.
+        nodeForElement(element) {
+            const makeNode = () => ({element, score: 1, flavors: new Map()});
+            return getDefault(nodesByElement, element, makeNode);
+        },
+
+        // Return the highest-scored node of the given flavor, undefined if there is none.
+        // An empty list (as left behind by nodesOfFlavor() for an unknown flavor) counts as
+        // "none"; without the length check the module-level max() would throw on it.
+        max(flavor) {
+            const nodes = nodesByFlavor.get(flavor);
+            if (nodes === undefined || nodes.length === 0) {
+                return undefined;
+            }
+            return max(nodes, node => node.score);
+        },
+
+        // Let the KB know that a new flavor has been added to an element.
+        indexNodeByFlavor(node, flavor) {
+            getDefault(nodesByFlavor, flavor, () => []).push(node);
+        },
+
+        // Return the list of nodes of the given flavor; an empty list is created (and kept)
+        // when the flavor has not been seen yet.
+        nodesOfFlavor(flavor) {
+            return getDefault(nodesByFlavor, flavor, () => []);
+        }
+    };
+}
+
+
+// Dispatch on the kind of the rule's source: 'flavor' rules run against the given fact alone,
+// every other kind is treated as a dom() rule. Returns the iterator of resulting facts.
+function resultsOf(rule, node, flavor, kb) {
+    if (rule.source.flavor === 'flavor') { return resultsOfFlavorRule(rule, node, flavor); }
+    return resultsOfDomRule(rule, node, kb);
+}
+
+
+// Run a dom() rule: query the tree carried by the special root "dom" fact with the rule's
+// selector and hand each match (wrapped in its knowledgebase node) to the rule's ranker.
+// Facts without an element get the matched element; facts without a flavor are an error.
+function *resultsOfDomRule(rule, specialDomNode, kb) {
+    const found = specialDomNode.tree.querySelectorAll(rule.source.selector);
+
+    // Indexed access on purpose: do not rely on the NodeList being iterable.
+    for (let index = 0; index < found.length; index += 1) {
+        const matched = found[index];
+        for (const fact of explicitFacts(rule.ranker(kb.nodeForElement(matched)))) {
+            if (fact.element === undefined) {
+                fact.element = matched;
+            }
+            if (fact.flavor === undefined) {
+                throw new Error('Rankers of dom() rules must return a flavor in each fact. Otherwise, there is no way for that fact to be used later.');
+            }
+            yield fact;
+        }
+    }
+}
+
+
+// Run a flavor() rule against one fact: pass `node` to the rule's ranker and yield each
+// resulting fact, filling in defaults for whatever the ranker left out.
+function *resultsOfFlavorRule(rule, node, flavor) {
+    for (const fact of explicitFacts(rule.ranker(node))) {
+        // A ranker may point the fact at another element; otherwise it is about `node`'s.
+        if (fact.element === undefined) {
+            fact.element = node.element;
+        }
+        // Likewise, a fact without a flavor keeps the flavor that was passed in.
+        if (fact.flavor === undefined) {
+            fact.flavor = flavor;
+        }
+        yield fact;
+    }
+}
+
+
+// Normalize a ranker's possibly abbreviated result into an iterable of facts: undefined means no
+// facts, an array is a list of facts, anything else is one fact. Missing scores are set to 1.
+function *explicitFacts(rankerResult) {
+    let facts = [];
+    if (rankerResult !== undefined) {
+        facts = Array.isArray(rankerResult) ? rankerResult : [rankerResult];
+    }
+    for (const fact of facts) {
+        if (fact.score === undefined) {
+            fact.score = 1;
+        }
+        yield fact;
+    }
+}
+
+
+// TODO: For the moment, a lot of responsibility is on the rankers to return a
+// pretty big data structure of up to 4 properties. This is a bit verbose for
+// an arrow function (as I hope we can use most of the time) and the usual case
+// will probably be returning just a score multiplier. Make that case more
+// concise.
+
+// TODO: It is likely that rankers should receive the notes of their input type
+// as a 2nd arg, for brevity.
+
+
+// Return a condition that uses a DOM selector to find its matches from the
+// original DOM tree.
+//
+// For consistency, Nodes will still be delivered to the transformers, but
+// they'll have empty flavors and score = 1.
+//
+// Condition constructors like dom() and flavor() build stupid, introspectable
+// objects that the query engine can read. They don't actually do the query
+// themselves. That way, the query planner can be smarter than them, figuring
+// out which indices to use based on all of them. (We'll probably keep a heap
+// by each dimension's score and a hash by flavor name, for starters.) Someday,
+// fancy things like this may be possible: rule(and(tag('p'), klass('snork')),
+// ...)
+function dom(selector) {
+    // Both `flavor` (which resultsOf() dispatches on) and `inputFlavor` (which ruleset()
+    // files the rule under) are 'dom'; the selector is carried along untouched.
+    const kind = 'dom';
+    const condition = {flavor: kind, inputFlavor: kind, selector: selector};
+    return condition;
+}
+
+
+// Return a condition that discriminates on nodes of the knowledgebase by flavor.
+function flavor(inputFlavor) {
+    // `flavor` marks the kind of condition; `inputFlavor` is the flavor of facts it consumes.
+    const condition = {flavor: 'flavor'};
+    condition.inputFlavor = inputFlavor;
+    return condition;
+}
+
+
+function rule(source, ranker) {
+    // Pair a condition (as built by dom() or flavor()) with the ranker function that is run
+    // on whatever the condition matches.
+    const pairing = {source: source, ranker: ranker};
+    return pairing;
+}
--- a/mobile/android/modules/moz.build
+++ b/mobile/android/modules/moz.build
@@ -24,9 +24,10 @@ EXTRA_JS_MODULES += [
     'PageActions.jsm',
     'Prompt.jsm',
     'RuntimePermissions.jsm',
     'Sanitizer.jsm',
     'SharedPreferences.jsm',
     'Snackbars.jsm',
     'SSLExceptions.jsm',
     'TabMirror.jsm',
+    'WebsiteMetadata.jsm'
 ]