pushlog: API to cache pushlog lookups (
bug 1303904); r?glob
The pushlog API is not conducive to performance because various
methods result in a SQLite database handle being opened for every
lookup. When doing things like looking up pushlog entries for
every node in a list, the overhead can really add up.
This commit introduces a context manager for populating a cache
to facilitate rapid node -> push lookups.
This commit is somewhat hacky. There are a number of alternatives
I'd prefer, such as a bulk lookup API or automatic caching. A bulk
lookup API isn't usable in the case of the automationrelevance web
command because the function that calls the pushlog API is per
changeset, not per collection. And I'm not yet willing to implement
automatic caching because caching is hard and I don't want to
invite bugs.
A context manager, however, allows us to create caches with scoped
lifetimes thus limiting the caching (and bugs related to it) to
consumers who opt into the cache. Yes, it's hacky. But it gets the
job done.
To prove the cache works, the automationrelevance web command has
been updated to use it.
This drops time to first byte on
json-automationrelevance/5dddbefdf759f09b1411f33fa0920835b919fc81 HTTP
requests against the mozilla-aurora repo from ~80s to ~1.5s. However,
there is still a major source of clownshoes for that request...
MozReview-Commit-ID: 37bP49sHxKZ
--- a/hgext/hgmo/__init__.py
+++ b/hgext/hgmo/__init__.py
@@ -439,38 +439,46 @@ def automationrelevancewebcommand(web, r
csets = []
# Query an unfiltered repo because sometimes automation wants to run against
# changesets that have since become hidden. The response exposes whether the
# requested node is visible, so consumers can make intelligent decisions
# about what to do if the changeset isn't visible.
urepo = repo.unfiltered()
revs = list(urepo.revs('automationrelevant(%r)', req.form['node'][0]))
- for rev in revs:
- ctx = urepo[rev]
- entry = webutil.changelistentry(web, ctx, tmpl)
- # Some items in changelistentry are generators, which json.dumps()
- # can't handle. So we expand them.
- for k, v in entry.items():
- # "files" is a generator that attempts to call a template.
- # Don't even bother and just repopulate it.
- if k == 'files':
- entry['files'] = sorted(ctx.files())
- elif k == 'allparents':
- entry['parents'] = [p['node'] for p in v()]
- del entry['allparents']
- # These aren't interesting to us, so prune them. The
- # original impetus for this was because "changelogtag"
- # isn't part of the json template and adding it is non-trivial.
- elif k in deletefields:
- del entry[k]
- elif isinstance(v, types.GeneratorType):
- entry[k] = list(v)
+
+    # The pushlog extension wraps webutil.commonentry and the way it is called
+ # means pushlog opens a SQLite connection on every call. This is inefficient.
+    # So we pre-load and cache data for pushlog entries we care about.
+ cl = urepo.changelog
+ nodes = [cl.node(rev) for rev in revs]
- csets.append(entry)
+ with repo.unfiltered().pushlog.cache_data_for_nodes(nodes):
+ for rev in revs:
+ ctx = urepo[rev]
+ entry = webutil.changelistentry(web, ctx, tmpl)
+ # Some items in changelistentry are generators, which json.dumps()
+ # can't handle. So we expand them.
+ for k, v in entry.items():
+ # "files" is a generator that attempts to call a template.
+ # Don't even bother and just repopulate it.
+ if k == 'files':
+ entry['files'] = sorted(ctx.files())
+ elif k == 'allparents':
+ entry['parents'] = [p['node'] for p in v()]
+ del entry['allparents']
+ # These aren't interesting to us, so prune them. The
+ # original impetus for this was because "changelogtag"
+ # isn't part of the json template and adding it is non-trivial.
+ elif k in deletefields:
+ del entry[k]
+ elif isinstance(v, types.GeneratorType):
+ entry[k] = list(v)
+
+ csets.append(entry)
# Advertise whether the requested revision is visible (non-obsolete).
if csets:
visible = csets[-1]['node'] in repo
else:
visible = None
data = {
--- a/hgext/pushlog/__init__.py
+++ b/hgext/pushlog/__init__.py
@@ -185,16 +185,22 @@ class pushlog(object):
'''An interface to pushlog data.'''
def __init__(self, repo):
'''Create an instance bound to a sqlite database in a path.'''
# Use a weak ref to avoid cycle. This object's lifetime should be no
# greater than the repo's.
self.repo = repo
+ # Caches of pushlog data to avoid database I/O.
+ # int -> Push
+ self._push_by_id = {}
+ # bin node -> int
+ self._push_id_by_node = {}
+
def _getconn(self, readonly=False, tr=None):
"""Get a SQLite connection to the pushlog.
In normal operation, this will return a ``sqlite3.Connection``.
If the database does not exist, it will be created. If the database
schema is not present or up to date, it will be updated. An error will
be raised if any of this could not be performed.
@@ -383,16 +389,20 @@ class pushlog(object):
def pushfromnode(self, node):
"""Obtain info about a push that added the specified changeset.
Returns a Push namedtuple of (pushid, who, when, [nodes]) or None if
there is no pushlog info for this node.
Argument is specified as binary node.
"""
+ if node in self._push_id_by_node:
+ pushid = self._push_id_by_node[node]
+ return self._push_by_id[pushid] if pushid is not None else None
+
with self.conn(readonly=True) as c:
if not c:
return None
return self._push_from_node(c, node)
def _push_from_node(self, conn, node):
res = conn.execute('SELECT pushid from changesets WHERE node=?',
@@ -406,29 +416,62 @@ class pushlog(object):
return self.pushfromnode(ctx.node())
def pushfromid(self, conn, pushid):
"""Obtain a push from its numeric push id.
Returns a Push namedtuple or None if there is no push with this push
id.
"""
+ push = self._push_by_id.get(pushid)
+ if push:
+ return push
+
res = conn.execute('SELECT id, user, date, node from pushlog '
'LEFT JOIN changesets on id=pushid '
'WHERE id=? ORDER BY rev ASC', (pushid,))
nodes = []
for pushid, who, when, node in res:
who = who.encode('utf-8')
nodes.append(node.encode('ascii'))
if not nodes:
return None
return Push(pushid, who, when, nodes)
+    @contextlib.contextmanager
+    def cache_data_for_nodes(self, nodes):
+        """Given an iterable of nodes, cache pushlog data for them.
+
+        Due to current API design, many pushlog methods open a SQLite
+        database, perform a query, then close the database. Calling these
+        within tight loops can be slow.
+
+        This context manager can be used to pre-load pushlog data to
+        avoid inefficient SQLite access patterns.
+        """
+        with self.conn(readonly=True) as c:
+            if not c:
+                # Must yield once even without a DB: a contextmanager
+                # generator that returns before yielding raises RuntimeError.
+                yield
+                return
+            try:
+                for node in nodes:
+                    push = self._push_from_node(c, node)
+                    if push:
+                        self._push_id_by_node[node] = push.pushid
+                        self._push_by_id[push.pushid] = push
+                    else:
+                        self._push_id_by_node[node] = None
+                yield
+            finally:
+                self._push_id_by_node = {}
+                self._push_by_id = {}
+
def verify(self):
# Attempt to create database (since .pushes below won't).
with self.conn():
pass
repo = self.repo
ui = self.repo.ui