pushlog: API to cache pushlog lookups (bug 1303904); r?glob draft
authorGregory Szorc <gps@mozilla.com>
Thu, 08 Jun 2017 13:32:37 -0700
changeset 11237 bb140762d07e7164199f302892a6ca058f4da014
parent 11236 6504363eb1aaf7d66a163e6df855fbd7735bcdc2
child 11238 10b4978a9f5f3e58beafc66b159620962abf89cc
push id1708
push usergszorc@mozilla.com
push dateMon, 19 Jun 2017 18:36:36 +0000
reviewersglob
bugs1303904
pushlog: API to cache pushlog lookups (bug 1303904); r?glob The pushlog API is not conducive to performance because various methods result in a SQLite database handle being opened for every lookup. When doing things like looking up pushlog entries for every node in a list, the overhead can really add up. This commit introduces a context manager for populating a cache to facilitate rapid node -> push lookups. This commit is somewhat hacky. There are a number of alternatives I'd prefer, such as a bulk lookup API or automatic caching. A bulk lookup API isn't usable in the case of the automationrelevance web command because the function that calls the pushlog API is per changeset, not per collection. And I'm not yet willing to implement automatic caching because caching is hard and I don't want to invite bugs. A context manager, however, allows us to create caches with scoped lifetimes thus limiting the caching (and bugs related to it) to consumers who opt into the cache. Yes, it's hacky. But it gets the job done. To prove the cache works, the automationrelevance web command has been updated to use it. This drops time to first byte on json-automationrelevance/5dddbefdf759f09b1411f33fa0920835b919fc81 HTTP requests against the mozilla-aurora repo from ~80s to ~1.5s. However, there is still a major source of clownshoes for that request... MozReview-Commit-ID: 37bP49sHxKZ
hgext/hgmo/__init__.py
hgext/pushlog/__init__.py
--- a/hgext/hgmo/__init__.py
+++ b/hgext/hgmo/__init__.py
@@ -439,38 +439,46 @@ def automationrelevancewebcommand(web, r
 
     csets = []
     # Query an unfiltered repo because sometimes automation wants to run against
     # changesets that have since become hidden. The response exposes whether the
     # requested node is visible, so consumers can make intelligent decisions
     # about what to do if the changeset isn't visible.
     urepo = repo.unfiltered()
     revs = list(urepo.revs('automationrelevant(%r)', req.form['node'][0]))
-    for rev in revs:
-        ctx = urepo[rev]
-        entry = webutil.changelistentry(web, ctx, tmpl)
-        # Some items in changelistentry are generators, which json.dumps()
-        # can't handle. So we expand them.
-        for k, v in entry.items():
-            # "files" is a generator that attempts to call a template.
-            # Don't even bother and just repopulate it.
-            if k == 'files':
-                entry['files'] = sorted(ctx.files())
-            elif k == 'allparents':
-                entry['parents'] = [p['node'] for p in v()]
-                del entry['allparents']
-            # These aren't interesting to us, so prune them. The
-            # original impetus for this was because "changelogtag"
-            # isn't part of the json template and adding it is non-trivial.
-            elif k in deletefields:
-                del entry[k]
-            elif isinstance(v, types.GeneratorType):
-                entry[k] = list(v)
+
+    # The pushlog extension wraps webutil.commonentry and the way it is called
+    # means pushlog opens a SQLite connection on every call. This is inefficient.
+    # So we preload and cache data for pushlog entries we care about.
+    cl = urepo.changelog
+    nodes = [cl.node(rev) for rev in revs]
 
-        csets.append(entry)
+    with repo.unfiltered().pushlog.cache_data_for_nodes(nodes):
+        for rev in revs:
+            ctx = urepo[rev]
+            entry = webutil.changelistentry(web, ctx, tmpl)
+            # Some items in changelistentry are generators, which json.dumps()
+            # can't handle. So we expand them.
+            for k, v in entry.items():
+                # "files" is a generator that attempts to call a template.
+                # Don't even bother and just repopulate it.
+                if k == 'files':
+                    entry['files'] = sorted(ctx.files())
+                elif k == 'allparents':
+                    entry['parents'] = [p['node'] for p in v()]
+                    del entry['allparents']
+                # These aren't interesting to us, so prune them. The
+                # original impetus for this was because "changelogtag"
+                # isn't part of the json template and adding it is non-trivial.
+                elif k in deletefields:
+                    del entry[k]
+                elif isinstance(v, types.GeneratorType):
+                    entry[k] = list(v)
+
+            csets.append(entry)
 
     # Advertise whether the requested revision is visible (non-obsolete).
     if csets:
         visible = csets[-1]['node'] in repo
     else:
         visible = None
 
     data = {
--- a/hgext/pushlog/__init__.py
+++ b/hgext/pushlog/__init__.py
@@ -185,16 +185,22 @@ class pushlog(object):
     '''An interface to pushlog data.'''
 
     def __init__(self, repo):
         '''Create an instance bound to a sqlite database in a path.'''
         # Use a weak ref to avoid cycle. This object's lifetime should be no
         # greater than the repo's.
         self.repo = repo
 
+        # Caches of pushlog data to avoid database I/O.
+        # int -> Push
+        self._push_by_id = {}
+        # bin node -> int
+        self._push_id_by_node = {}
+
     def _getconn(self, readonly=False, tr=None):
         """Get a SQLite connection to the pushlog.
 
         In normal operation, this will return a ``sqlite3.Connection``.
         If the database does not exist, it will be created. If the database
         schema is not present or up to date, it will be updated. An error will
         be raised if any of this could not be performed.
 
@@ -383,16 +389,20 @@ class pushlog(object):
     def pushfromnode(self, node):
         """Obtain info about a push that added the specified changeset.
 
         Returns a Push namedtuple of (pushid, who, when, [nodes]) or None if
         there is no pushlog info for this node.
 
         Argument is specified as binary node.
         """
+        if node in self._push_id_by_node:
+            pushid = self._push_id_by_node[node]
+            return self._push_by_id[pushid] if pushid is not None else None
+
         with self.conn(readonly=True) as c:
             if not c:
                 return None
 
             return self._push_from_node(c, node)
 
     def _push_from_node(self, conn, node):
         res = conn.execute('SELECT pushid from changesets WHERE node=?',
@@ -406,29 +416,62 @@ class pushlog(object):
         return self.pushfromnode(ctx.node())
 
     def pushfromid(self, conn, pushid):
         """Obtain a push from its numeric push id.
 
         Returns a Push namedtuple or None if there is no push with this push
         id.
         """
+        push = self._push_by_id.get(pushid)
+        if push:
+            return push
+
         res = conn.execute('SELECT id, user, date, node from pushlog '
                            'LEFT JOIN changesets on id=pushid '
                            'WHERE id=? ORDER BY rev ASC', (pushid,))
         nodes = []
         for pushid, who, when, node in res:
             who = who.encode('utf-8')
             nodes.append(node.encode('ascii'))
 
         if not nodes:
             return None
 
         return Push(pushid, who, when, nodes)
 
+    @contextlib.contextmanager
+    def cache_data_for_nodes(self, nodes):
+        """Given an iterable of nodes, cache pushlog data for them.
+
+        Due to current API design, many pushlog methods open a SQLite
+        database, perform a query, then close the database. Calling these
+        within tight loops can be slow.
+
+        This context manager can be used to pre-load pushlog data to
+        avoid inefficient SQLite access patterns.
+        """
+        # A @contextmanager generator must yield exactly once; returning
+        # before the yield raises RuntimeError("generator didn't yield").
+        with self.conn(readonly=True) as c:
+            try:
+                if c:
+                    for node in nodes:
+                        push = self._push_from_node(c, node)
+                        if push:
+                            self._push_id_by_node[node] = push.pushid
+                            self._push_by_id[push.pushid] = push
+                        else:
+                            self._push_id_by_node[node] = None
+
+                yield
+            finally:
+                self._push_id_by_node = {}
+                self._push_by_id = {}
+
     def verify(self):
         # Attempt to create database (since .pushes below won't).
         with self.conn():
             pass
 
         repo = self.repo
         ui = self.repo.ui