Bug 1421734: Download and unpack toolchain artifacts in parallel draft
author: Chris AtLee <catlee@mozilla.com>
Mon, 12 Feb 2018 15:07:36 -0500
changeset 763760 727001e9158216440136bdac3c4fa78f56745b00
parent 763748 a4ef1082c51d5b4508882c22487f6c8de5b35e2a
child 763761 0105623dc79b1cb8c002bcb6ed65cb0911f591e8
push id: 101556
push user: catlee@mozilla.com
push date: Tue, 06 Mar 2018 17:57:33 +0000
bugs: 1421734
milestone: 60.0a1
Bug 1421734: Download and unpack toolchain artifacts in parallel MozReview-Commit-ID: BMe6zqIqNHP
python/mozbuild/mozbuild/artifacts.py
python/mozbuild/mozbuild/mach_commands.py
--- a/python/mozbuild/mozbuild/artifacts.py
+++ b/python/mozbuild/mozbuild/artifacts.py
@@ -765,37 +765,38 @@ class ArtifactPersistLimit(PersistLimit)
             fs.remove(f.path)
         self._files_size = 0
         self.files = []
 
 
 class ArtifactCache(object):
     '''Fetch Task Cluster artifact URLs and purge least recently used artifacts from disk.'''
 
-    def __init__(self, cache_dir, log=None, skip_cache=False):
+    def __init__(self, cache_dir, log=None, skip_cache=False, cancel_when_done=True):
         mkdir(cache_dir, not_indexed=True)
         self._cache_dir = cache_dir
         self._log = log
         self._skip_cache = skip_cache
         self._persist_limit = ArtifactPersistLimit(log)
         self._download_manager = DownloadManager(
             self._cache_dir, persist_limit=self._persist_limit)
-        self._last_dl_update = -1
+        self._cancel_when_done = cancel_when_done
 
     def log(self, *args, **kwargs):
         if self._log:
             self._log(*args, **kwargs)
 
     def fetch(self, url, force=False):
         fname = os.path.basename(url)
         try:
             # Use the file name from the url if it looks like a hash digest.
             if len(fname) not in (32, 40, 56, 64, 96, 128):
                 raise TypeError()
             binascii.unhexlify(fname)
+            basename = fname
         except TypeError:
             # We download to a temporary name like HASH[:16]-basename to
             # differentiate among URLs with the same basenames.  We used to then
             # extract the build ID from the downloaded artifact and use it to make a
             # human readable unique name, but extracting build IDs is time consuming
             # (especially on Mac OS X, where we must mount a large DMG file).
             hash = hashlib.sha256(url).hexdigest()[:16]
             # Strip query string and fragments.
@@ -810,54 +811,59 @@ class ArtifactCache(object):
             os.remove(path)
 
         self.log(logging.INFO, 'artifact',
             {'path': path},
             'Downloading to temporary location {path}')
         try:
             dl = self._download_manager.download(url, fname)
 
+            _last_dl_update = [-1]
+
             def download_progress(dl, bytes_so_far, total_size):
                 if not total_size:
                     return
                 percent = (float(bytes_so_far) / total_size) * 100
-                now = int(percent / 5)
-                if now == self._last_dl_update:
+                now = int(percent / 10)
+                if now == _last_dl_update[0]:
                     return
-                self._last_dl_update = now
+                _last_dl_update[0] = now
                 self.log(logging.INFO, 'artifact',
-                         {'bytes_so_far': bytes_so_far, 'total_size': total_size, 'percent': percent},
-                         'Downloading... {percent:02.1f} %')
+                         {'bytes_so_far': bytes_so_far, 'total_size': total_size, 'percent': percent, 'basename': basename},
+                         'Downloading {basename}... {percent:02.1f} %')
 
             if dl:
                 dl.set_progress(download_progress)
                 dl.wait()
             else:
                 # Avoid the file being removed if it was in the cache already.
                 path = os.path.join(self._cache_dir, fname)
                 self._persist_limit.register_file(path)
 
             self.log(logging.INFO, 'artifact',
                 {'path': os.path.abspath(mozpath.join(self._cache_dir, fname))},
                 'Downloaded artifact to {path}')
             return os.path.abspath(mozpath.join(self._cache_dir, fname))
         finally:
             # Cancel any background downloads in progress.
-            self._download_manager.cancel()
+            if self._cancel_when_done:
+                self._download_manager.cancel()
 
     def clear_cache(self):
         if self._skip_cache:
             self.log(logging.DEBUG, 'artifact',
                 {},
                 'Skipping cache: ignoring clear_cache!')
             return
 
         self._persist_limit.remove_all()
 
 
+
+
 class Artifacts(object):
     '''Maintain state to efficiently fetch build artifacts from a Firefox tree.'''
 
     def __init__(self, tree, substs, defines, job=None, log=None,
                  cache_dir='.', hg=None, git=None, skip_cache=False,
                  topsrcdir=None):
         if (hg and git) or (not hg and not git):
             raise ValueError("Must provide path to exactly one of hg and git")
--- a/python/mozbuild/mozbuild/mach_commands.py
+++ b/python/mozbuild/mozbuild/mach_commands.py
@@ -37,16 +37,18 @@ from mozbuild.base import (
     MozbuildObject,
 )
 from mozbuild.util import ensureParentDir
 
 from mozbuild.backend import (
     backends,
 )
 
+from concurrent.futures import ThreadPoolExecutor
+
 
 BUILD_WHAT_HELP = '''
 What to build. Can be a top-level make target or a relative directory. If
 multiple options are provided, they will be built serially. Takes dependency
 information from `topsrcdir/build/dumbmake-dependencies` to build additional
 targets as needed. BUILDING ONLY PARTS OF THE TREE CAN RESULT IN BAD TREE
 STATE. USE AT YOUR OWN RISK.
 '''.strip()
@@ -1254,17 +1256,17 @@ class PackageFrontend(MachCommandBase):
                 self.log_manager.structured_filter)
         if not cache_dir:
             cache_dir = os.path.join(self._mach_context.state_dir, 'toolchains')
 
         tooltool_url = (tooltool_url or
                         'https://tooltool.mozilla-releng.net').rstrip('/')
 
         cache = ArtifactCache(cache_dir=cache_dir, log=self.log,
-                              skip_cache=skip_cache)
+                              skip_cache=skip_cache, cancel_when_done=False)
 
         if authentication_file:
             with open(authentication_file, 'rb') as f:
                 token = f.read().strip()
 
             class TooltoolAuthenticator(HTTPAdapter):
                 def send(self, request, *args, **kwargs):
                     request.headers['Authorization'] = \
@@ -1396,17 +1398,19 @@ class PackageFrontend(MachCommandBase):
             if '@' not in f:
                 self.log(logging.ERROR, 'artifact', {},
                          'Expected a list of files of the form path@task-id')
                 return 1
             name, task_id = f.rsplit('@', 1)
             record = ArtifactRecord(task_id, name)
             records[record.filename] = record
 
-        for record in records.itervalues():
+        artifacts = {} if artifact_manifest else None
+
+        def _fetch_and_unpack_record(record):
             self.log(logging.INFO, 'artifact', {'name': record.basename},
                      'Downloading {name}')
             valid = False
             # sleeptime is 60 per retry.py, used by tooltool_wrapper.sh
             for attempt, _ in enumerate(redo.retrier(attempts=retry+1,
                                                      sleeptime=60)):
                 try:
                     record.fetch_with(cache)
@@ -1441,38 +1445,35 @@ class PackageFrontend(MachCommandBase):
                     pass
                 if not valid:
                     os.unlink(record.filename)
                     if attempt < retry:
                         self.log(logging.INFO, 'artifact', {},
                                  'Will retry in a moment...')
                     continue
 
-                downloaded.append(record)
                 break
 
             if not valid:
                 self.log(logging.ERROR, 'artifact', {'name': record.basename},
                          'Failed to download {name}')
-                return 1
+                raise Exception('Failed to download {name}'.format(name=record.basename))
 
-        artifacts = {} if artifact_manifest else None
-
-        for record in downloaded:
             local = os.path.join(os.getcwd(), record.basename)
             if os.path.exists(local):
                 os.unlink(local)
             # unpack_file needs the file with its final name to work
             # (https://github.com/mozilla/build-tooltool/issues/38), so we
             # need to copy it, even though we remove it later. Use hard links
             # when possible.
             try:
                 os.link(record.filename, local)
             except Exception:
                 shutil.copy(record.filename, local)
+
             # Keep a sha256 of each downloaded file, for the chain-of-trust
             # validation.
             if artifact_manifest is not None:
                 with open(local) as fh:
                     h = hashlib.sha256()
                     while True:
                         data = fh.read(1024 * 1024)
                         if not data:
@@ -1480,28 +1481,37 @@ class PackageFrontend(MachCommandBase):
                         h.update(data)
                 artifacts[record.url] = {
                     'sha256': h.hexdigest(),
                 }
             if record.unpack and not no_unpack:
                 unpack_file(local, record.setup)
                 os.unlink(local)
 
+            return record
+
+        with ThreadPoolExecutor(8) as pool:
+            for record in pool.map(_fetch_and_unpack_record, records.values()):
+                downloaded.append(record)
+
+        cache._download_manager.cancel()
+
         if not downloaded:
             self.log(logging.ERROR, 'artifact', {}, 'Nothing to download')
             if files:
                 return 1
 
         if artifacts:
             ensureParentDir(artifact_manifest)
             with open(artifact_manifest, 'w') as fh:
                 json.dump(artifacts, fh, indent=4, sort_keys=True)
 
         return 0
 
+
 class StaticAnalysisSubCommand(SubCommand):
     def __call__(self, func):
         after = SubCommand.__call__(self, func)
         args = [
             CommandArgument('--verbose', '-v', action='store_true',
                             help='Print verbose output.'),
         ]
         for arg in args: