robustcheckout: retry after "stream ended unexpectedly" (bug 1291926); r?glob draft
authorGregory Szorc <gps@mozilla.com>
Tue, 10 Jan 2017 18:18:37 -0800
changeset 10185 7befbd63bff207f1e742f17bed5b1415207e2e5d
parent 10184 df7c2d353db35c1ac7f00194e8f5318f877c71c7
push id1465
push userbmo:gps@mozilla.com
push dateFri, 20 Jan 2017 06:33:50 +0000
reviewersglob
bugs1291926
robustcheckout: retry after "stream ended unexpectedly" (bug 1291926); r?glob The most common network failure we've seen in automation is "stream ended unexpectedly." In fact, this appears to be the only network error we see with any frequency. This commit introduces retry logic for network operations. The default behavior is to attempt network pulls up to 3 times. The code doesn't catch all network failures. But it catches the big one: "stream ended unexpectedly," which is raised when reading from changegroups, which constitute the bulk of bytes over the wire for `hg pull` operations. FWIW, I'm not sure of an exhaustive list of network related exceptions Mercurial can emit. We'll likely have to follow up and add more exception detection. Ideally, we'd implement retry logic upstream. Perfect is the enemy of good. MozReview-Commit-ID: 7fRBDNwus4w
hgext/robustcheckout/__init__.py
hgext/robustcheckout/tests/helpers.sh
hgext/robustcheckout/tests/test-server-failure.t
--- a/hgext/robustcheckout/__init__.py
+++ b/hgext/robustcheckout/__init__.py
@@ -10,17 +10,19 @@ times and storage efficiency.
 """
 
 from __future__ import absolute_import
 
 import contextlib
 import errno
 import functools
 import os
+import random
 import re
+import time
 
 from mercurial.i18n import _
 from mercurial.node import hex
 from mercurial import (
     commands,
     error,
     exchange,
     extensions,
@@ -111,21 +113,23 @@ def purgewrapper(orig, ui, *args, **kwar
 
 
 @command('robustcheckout', [
     ('', 'upstream', '', 'URL of upstream repo to clone from'),
     ('r', 'revision', '', 'Revision to check out'),
     ('b', 'branch', '', 'Branch to check out'),
     ('', 'purge', False, 'Whether to purge the working directory'),
     ('', 'sharebase', '', 'Directory where shared repos should be placed'),
+    ('', 'networkattempts', 3, 'Maximum number of attempts for network '
+                               'operations'),
     ],
     '[OPTION]... URL DEST',
     norepo=True)
 def robustcheckout(ui, url, dest, upstream=None, revision=None, branch=None,
-                   purge=False, sharebase=None):
+                   purge=False, sharebase=None, networkattempts=None):
     """Ensure a working copy has the specified revision checked out."""
     if not revision and not branch:
         raise error.Abort('must specify one of --revision or --branch')
 
     if revision and branch:
         raise error.Abort('cannot specify both --revision and --branch')
 
     # Require revision to look like a SHA-1.
@@ -151,22 +155,26 @@ def robustcheckout(ui, url, dest, upstre
     # otherwise we're at the whim of whatever configs are used in automation.
     ui.setconfig('progress', 'delay', 1.0)
     ui.setconfig('progress', 'refresh', 1.0)
     ui.setconfig('progress', 'assume-tty', True)
 
     sharebase = os.path.realpath(sharebase)
 
     return _docheckout(ui, url, dest, upstream, revision, branch, purge,
-                       sharebase)
+                       sharebase, networkattempts)
 
-def _docheckout(ui, url, dest, upstream, revision, branch, purge, sharebase):
+def _docheckout(ui, url, dest, upstream, revision, branch, purge, sharebase,
+                networkattemptlimit, networkattempts=None):
+    if not networkattempts:
+        networkattempts = [1]
+
     def callself():
         return _docheckout(ui, url, dest, upstream, revision, branch, purge,
-                           sharebase)
+                           sharebase, networkattemptlimit, networkattempts)
 
     ui.write('ensuring %s@%s is available at %s\n' % (url, revision or branch,
                                                       dest))
 
     destvfs = scmutil.vfs(dest, audit=False, realpath=True)
 
     if destvfs.exists() and not destvfs.exists('.hg'):
         raise error.Abort('destination exists but no .hg directory')
@@ -222,16 +230,42 @@ def _docheckout(ui, url, dest, upstream,
         """Handle an error.Abort raised during a pull.
 
         Returns True if caller should call ``callself()`` to retry.
         """
         if e.args[0] == _('repository is unrelated'):
             ui.warn('(repository is unrelated; deleting)\n')
             destvfs.rmtree(forcibly=True)
             return True
+        elif e.args[0].startswith(_('stream ended unexpectedly')):
+            ui.warn('%s\n' % e.args[0])
+            if networkattempts[0] < networkattemptlimit:
+                ui.warn('(retrying after network failure on attempt %d of %d)\n' %
+                        (networkattempts[0], networkattemptlimit))
+
+                # Do a backoff on retries to mitigate the thundering herd
+                # problem. This is an exponential backoff with a multipler
+                # plus random jitter thrown in for good measure.
+                # With the default settings, backoffs will be:
+                # 1) 2.5 - 6.5
+                # 2) 5.5 - 9.5
+                # 3) 11.5 - 15.5
+                backoff = (2 ** networkattempts[0] - 1) * 1.5
+                jittermin = ui.configint('robustcheckout', 'retryjittermin', 1000)
+                jittermax = ui.configint('robustcheckout', 'retryjittermax', 5000)
+                backoff += float(random.randint(jittermin, jittermax)) / 1000.0
+                ui.warn('(waiting %.2fs before retry)\n' % backoff)
+                time.sleep(backoff)
+
+                networkattempts[0] += 1
+
+                return True
+            else:
+                raise error.Abort('reached maximum number of network attempts; '
+                                  'giving up\n')
 
         return False
 
     created = False
 
     if not destvfs.exists():
         # Ensure parent directories of destination exist.
         # Mercurial 3.8 removed ensuredirs and made makedirs race safe.
@@ -245,16 +279,20 @@ def _docheckout(ui, url, dest, upstream,
 
         if upstream:
             ui.write('(cloning from upstream repo %s)\n' % upstream)
         cloneurl = upstream or url
 
         try:
             res = hg.clone(ui, {}, cloneurl, dest=dest, update=False,
                            shareopts={'pool': sharebase, 'mode': 'identity'})
+        except error.Abort as e:
+            if handlepullabort(e):
+                return callself()
+            raise
         except error.RepoError as e:
             return handlerepoerror(e)
         except error.RevlogError as e:
             ui.warn('(repo corruption: %s; deleting shared store)\n' % e.message)
             deletesharedstore()
             return callself()
 
         # TODO retry here.
--- a/hgext/robustcheckout/tests/helpers.sh
+++ b/hgext/robustcheckout/tests/helpers.sh
@@ -1,14 +1,18 @@
 cat >> $HGRCPATH << EOF
 [share]
 pool = $TESTTMP/share
 
 [extensions]
 robustcheckout = $TESTDIR/hgext/robustcheckout
+
+[robustcheckout]
+retryjittermin = 0
+retryjittermax = 100
 EOF
 
 mkdir server
 hg init server/repo0
 hg init server/repo1
 hg init server/bad-server
 
 cd server/repo0
--- a/hgext/robustcheckout/tests/test-server-failure.t
+++ b/hgext/robustcheckout/tests/test-server-failure.t
@@ -10,25 +10,77 @@ Extension works with default config
   adding manifests
   adding file changes
   added 2 changesets with 2 changes to 1 files
   searching for changes
   no changes found
   1 files updated, 0 files merged, 0 files removed, 0 files unresolved
   updated to 94086d65796fd7fc8f957a2c5548db17a13f1f1f
 
-Server abort part way through response is handled
-TODO not yet implemented properly
+Server abort part way through response results in retries
 
   $ cp -a server/bad-server server/bad-server-bytelimit
 
   $ cat >> server/bad-server-bytelimit/.hg/hgrc << EOF
   > [badserver]
   > bytelimit = 11
   > EOF
 
-  $ hg robustcheckout http://localhost:$HGPORT/bad-server-bytelimit byte-limit --revision 94086d65796f
+  $ hg robustcheckout http://localhost:$HGPORT/bad-server-bytelimit byte-limit --revision 94086d65796f --sharebase $TESTTMP/bad-server-share
+  ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  stream ended unexpectedly (got 0 bytes, expected 4)
+  (retrying after network failure on attempt 1 of 3)
+  (waiting *s before retry) (glob)
+  ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  stream ended unexpectedly (got 0 bytes, expected 4)
+  (retrying after network failure on attempt 2 of 3)
+  (waiting *s before retry) (glob)
+  ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  stream ended unexpectedly (got 0 bytes, expected 4)
+  abort: reached maximum number of network attempts; giving up
+  
+  [255]
+
+Adjusting the network limit works
+
+  $ hg robustcheckout http://localhost:$HGPORT/bad-server-bytelimit byte-limit --networkattempts 2 --revision 94086d65796f --sharebase $TESTTMP/bad-server-share
   ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
-  (sharing from existing pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  stream ended unexpectedly (got 0 bytes, expected 4)
+  (retrying after network failure on attempt 1 of 2)
+  (waiting *s before retry) (glob)
+  ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  stream ended unexpectedly (got 0 bytes, expected 4)
+  abort: reached maximum number of network attempts; giving up
+  
+  [255]
+
+Recovering server will result in good clone
+
+  $ echo 5 > server/bad-server-bytelimit/.hg/badserveruntilgood
+
+  $ hg robustcheckout http://localhost:$HGPORT/bad-server-bytelimit byte-limit --revision 94086d65796f --sharebase $TESTTMP/bad-server-share
+  ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  stream ended unexpectedly (got 0 bytes, expected 4)
+  (retrying after network failure on attempt 1 of 3)
+  (waiting *s before retry) (glob)
+  ensuring http://localhost:$HGPORT/bad-server-bytelimit@94086d65796f is available at byte-limit
+  (sharing from new pooled repository 96ee1d7354c4ad7372047672c36a1f561e3a6a4c)
+  requesting all changes
+  adding changesets
+  adding manifests
+  adding file changes
+  added 2 changesets with 2 changes to 1 files
   searching for changes
   no changes found
-  abort: stream ended unexpectedly (got 3 bytes, expected 4)
-  [255]
+  1 files updated, 0 files merged, 0 files removed, 0 files unresolved
+  updated to 94086d65796fd7fc8f957a2c5548db17a13f1f1f