Bug 1289514 - Change API and semantics of pushlog parsing; r=catlee
authorGregory Szorc <gps@mozilla.com>
Tue, 26 Jul 2016 11:12:22 -0700
changeset 4876 528c39fe4a5aa67a63e8e66db83595a5545ca7e6
parent 4875 f543c1e5f30a59c1a3e41b454780f33b25194a69
child 4877 93f15cf0f724e3df24c7e411b7ed0673e682d923
push id3643
push userbmo:gps@mozilla.com
push dateTue, 26 Jul 2016 22:50:34 +0000
reviewerscatlee
bugs1289514
Bug 1289514 - Change API and semantics of pushlog parsing; r=catlee Instead of returning the list of pushes, we now return a slightly normalized version of the raw JSON. This allows us to expose things like "lastpushid" to consumers. We also sort the pushes by push id because that is the most appropriate value to sort on since it is monotonically increasing (unlike date, which can have clock skew and some pushlog may contain out-of-order insertions with decreasing data values). We want automation to process pushes in the order they were pushed, so sorting by push ID makes sense. MozReview-Commit-ID: 6cFynKvSpe4
changes/hgpoller.py
test/test_hgpoller.py
--- a/changes/hgpoller.py
+++ b/changes/hgpoller.py
@@ -103,21 +103,39 @@ from twisted.python import log
 from twisted.internet import defer, reactor
 from twisted.internet.task import LoopingCall
 from twisted.web.client import getPage
 
 from buildbot.changes import base, changes
 from buildbot.util import json
 
 
-def _parse_changes(data):
-    pushes = json.loads(data)['pushes'].values()
-    # Sort by push date
-    pushes.sort(key=lambda p: p['date'])
-    return pushes
+def parse_pushlog_json(data):
+    """Parse pushlog data version 2.
+
+    Returns a normalized version of the data. The "pushes" key
+    is a list of pushlog entries in order instead of a dict.
+    """
+    data = json.loads(data)
+
+    pushes = []
+    for pushid, push in data['pushes'].items():
+        # Keys are strings in the JSON. We convert to int so the sort
+        # below works appropriately. (A string sort may not always
+        # have appropriate ordering.)
+        push['pushid'] = int(pushid)
+        pushes.append(push)
+
+    # Sort by the push id because that is the monotonically incrementing
+    # key.
+    pushes.sort(key=lambda p: p['pushid'])
+
+    data['pushes'] = pushes
+
+    return data
 
 
 class Pluggable(object):
     '''The Pluggable class implements a forward for Deferred's that
     can be thrown away.
 
     This is in particular useful when a network request doesn't really
     error in a reasonable time, and you want to make sure that if it
@@ -262,35 +280,35 @@ class BaseHgPoller(BasePoller):
             ## emptyRepo to True so we can trigger builds for new changes there
             # if self.verbose:
                 # log.msg("%s has been reset" % self.baseURL)
             # self.lastChangeset = None
             # self.emptyRepo = True
         return self.super_class.dataFailed(self, res)
 
     def processData(self, query):
-        pushes = _parse_changes(query)
-        if len(pushes) == 0:
+        push_data = parse_pushlog_json(query)
+        if not push_data['pushes']:
             if self.lastChangeset is None:
                 # We don't have a lastChangeset, and there are no changes.  Assume
                 # the repository is empty.
                 self.emptyRepo = True
                 if self.verbose:
                     log.msg("%s is empty" % self.baseURL)
             # Nothing else to do
             return
 
         # We want to add at most self.maxChanges changes per push. If
         # mergePushChanges is True, then we'll get up to maxChanges pushes,
         # each with up to maxChanges changes.
         # Go through the list of pushes backwards, since we want to keep the
         # latest ones and possibly discard earlier ones.
         change_list = []
         too_many = False
-        for push in reversed(pushes):
+        for push in reversed(push_data['pushes']):
             # Used for merging push changes
             c = dict(
                 user=push['user'],
                 date=push['date'],
                 files=[],
                 desc="",
                 node=None,
                 commit_titles=[],
@@ -398,17 +416,17 @@ class BaseHgPoller(BasePoller):
                 self.changeHook(c)
                 self.parent.addChange(c)
 
         # The repository isn't empty any more!
         self.emptyRepo = False
         # Use the last change found by the poller, regardless of if it's on our
         # branch or not. This is so we don't have to constantly ignore it in
         # future polls.
-        self.lastChangeset = pushes[-1]["changesets"][-1]["node"]
+        self.lastChangeset = push_data['pushes'][-1]['changesets'][-1]['node']
         if self.verbose:
             log.msg("last changeset %s on %s" %
                     (self.lastChangeset, self.baseURL))
 
     def changeHook(self, change):
         pass
 
 
--- a/test/test_hgpoller.py
+++ b/test/test_hgpoller.py
@@ -266,17 +266,18 @@ validPushlog = """
 
 malformedPushlog = """
 { "invalid json" }
 """
 
 
 class PushlogParsing(unittest.TestCase):
     def testValidPushlog(self):
-        pushes = hgpoller._parse_changes(validPushlog)
+        push_data = hgpoller.parse_pushlog_json(validPushlog)
+        pushes = push_data['pushes']
         self.failUnlessEqual(len(pushes), 2)
 
         self.failUnlessEqual(pushes[0]['changesets'][0]['node'],
                              '4c23e51a484f077ea27af3ea4a4ee13da5aeb5e6')
         self.failUnlessEqual(pushes[0]['date'], 1282358416)
         self.failUnlessEqual(len(pushes[0]['changesets'][0]['files']), 4)
         self.failUnlessEqual(pushes[0]['changesets'][0]['branch'],
                              'GECKO20b5pre_20100820_RELBRANCH')
@@ -297,20 +298,20 @@ class PushlogParsing(unittest.TestCase):
                              '33be08836cb164f9e546231fc59e9e4cf98ed991')
         self.failUnlessEqual(len(pushes[1]['changesets'][1]['files']), 1)
         self.failUnlessEqual(pushes[1]['changesets'][1]['branch'], 'default')
         self.failUnlessEqual(pushes[1]['changesets'][1]['author'],
                              'Bobby Holley <bobbyholley@gmail.com>')
 
     def testMalformedPushlog(self):
         self.failUnlessRaises(
-            JSONDecodeError, hgpoller._parse_changes, malformedPushlog)
+            JSONDecodeError, hgpoller.parse_pushlog_json, malformedPushlog)
 
     def testEmptyPushlog(self):
-        self.failUnlessRaises(JSONDecodeError, hgpoller._parse_changes, "")
+        self.failUnlessRaises(JSONDecodeError, hgpoller.parse_pushlog_json, "")
 
 
 class RepoBranchHandling(unittest.TestCase):
     def setUp(self):
         self.changes = []
 
     def doTest(self, repo_branch):
         changes = self.changes