Bug 1432390 - Use zstandard and requests modules instead of spawning curl | zstd in docker.load_image. r?dustin draft
authorMike Hommey <mh+mozilla@glandium.org>
Wed, 24 Jan 2018 11:18:13 +0900
changeset 747404 dc7843942621b0d664b3589cd8fd5e1d3baef460
parent 747403 9ca475c2cec2cff6413928d9d1629f145c18281b
child 747405 b9962c6edd18aba26f86acc5e3c3cfc9cdc85bc5
push id96904
push userbmo:mh+mozilla@glandium.org
push dateFri, 26 Jan 2018 01:20:26 +0000
reviewersdustin
bugs1432390
milestone60.0a1
Bug 1432390 - Use zstandard and requests modules instead of spawning curl | zstd in docker.load_image. r?dustin The zstd command we spawn, if available at all, might be the wrong version: zstd changed its stream format in an incompatible way at some point, and the version shipped in e.g. Ubuntu 16.04 uses the old format, while the version taskcluster relies on uses the new format. Relying on gps's zstandard library allows to ensure we use the right version. Another advantage is that we can trivially pip install it in a virtualenv if it isn't available on the system running the command. If we're ridding ourselves of the subprocess spawning for zstd, we might as well cover curl as well. Especially considering the error handling when subprocesses are involved is not trivial, such that the current error handling code is actually broken and leads to dead-lock conditions, when, for example, curl is still waiting for the python side to read data, but the python side is not reading data anymore because an exception was thrown in the tar reading loop.
taskcluster/mach_commands.py
taskcluster/taskgraph/docker.py
--- a/taskcluster/mach_commands.py
+++ b/taskcluster/mach_commands.py
@@ -370,31 +370,43 @@ class MachCommands(MachCommandBase):
                 for depname, dep in named_links_dict[key].iteritems():
                     if regexprogram.match(dep):
                         filterededges.add((key, dep, depname))
         filtered_taskgraph = TaskGraph(filteredtasks, Graph(set(filteredtasks), filterededges))
         return filtered_taskgraph
 
 
 @CommandProvider
-class TaskClusterImagesProvider(object):
+class TaskClusterImagesProvider(MachCommandBase):
+    def _ensure_sztd(self):
+        try:
+            import zstd
+            # There are two zstd libraries that exist in the wild, ensure we
+            # have the right one.
+            zstd.ZstdCompressor
+            zstd.ZstdDecompressor
+        except (ImportError, AttributeError):
+            self._activate_virtualenv()
+            self.virtualenv_manager.install_pip_package('zstandard==0.8.1')
+
     @Command('taskcluster-load-image', category="ci",
              description="Load a pre-built Docker image")
     @CommandArgument('--task-id',
                      help="Load the image at public/image.tar.zst in this task,"
                           "rather than searching the index")
     @CommandArgument('-t', '--tag',
                      help="tag that the image should be loaded as. If not "
                           "image will be loaded with tag from the tarball",
                      metavar="name:tag")
     @CommandArgument('image_name', nargs='?',
                      help="Load the image of this name based on the current"
                           "contents of the tree (as built for mozilla-central"
                           "or mozilla-inbound)")
     def load_image(self, image_name, task_id, tag):
+        self._ensure_sztd()
         from taskgraph.docker import load_image_by_name, load_image_by_task_id
         if not image_name and not task_id:
             print("Specify either IMAGE-NAME or TASK-ID")
             sys.exit(1)
         try:
             if task_id:
                 ok = load_image_by_task_id(task_id, tag)
             else:
--- a/taskcluster/taskgraph/docker.py
+++ b/taskcluster/taskgraph/docker.py
@@ -14,16 +14,17 @@ import tempfile
 import which
 from subprocess import Popen, PIPE
 from io import BytesIO
 
 from taskgraph.util import docker
 from taskgraph.util.taskcluster import (
     find_task_id,
     get_artifact_url,
+    get_session,
 )
 from taskgraph.util.cached_tasks import cached_index_path
 from . import GECKO
 
 
 def load_image_by_name(image_name, tag=None):
     context_path = os.path.join(GECKO, 'taskcluster', 'docker', image_name)
     context_hash = docker.generate_context_hash(GECKO, context_path, image_name)
@@ -108,39 +109,70 @@ def build_image(name, args=None):
         print('*' * 50)
         print('WARNING: no VERSION file found in image directory.')
         print('Image is not suitable for deploying/pushing.')
         print('Create an image suitable for deploying/pushing by creating')
         print('a VERSION file in the image directory.')
         print('*' * 50)
 
 
+# The zstandard library doesn't expose a file-like interface for its
+# decompressor, but an iterator. Support for a file-like interface is due in
+# next release. In the meanwhile, we use this proxy class to turn the iterator
+# into a file-like.
+class IteratorReader(object):
+    def __init__(self, iterator):
+        self._iterator = iterator
+        self._buf = b''
+
+    def read(self, size):
+        result = b''
+        while len(result) < size:
+            wanted = min(size - len(result), len(self._buf))
+            if not self._buf:
+                try:
+                    self._buf = memoryview(next(self._iterator))
+                except StopIteration:
+                    break
+            result += self._buf[:wanted].tobytes()
+            self._buf = self._buf[wanted:]
+        return result
+
+
 def load_image(url, imageName=None, imageTag=None):
     """
     Load docker image from URL as imageName:tag, if no imageName or tag is given
     it will use whatever is inside the zstd compressed tarball.
 
     Returns an object with properties 'image', 'tag' and 'layer'.
     """
+    import zstd
+
     # If imageName is given and we don't have an imageTag
     # we parse out the imageTag from imageName, or default it to 'latest'
     # if no imageName and no imageTag is given, 'repositories' won't be rewritten
     if imageName and not imageTag:
         if ':' in imageName:
             imageName, imageTag = imageName.split(':', 1)
         else:
             imageTag = 'latest'
 
-    curl, zstd, docker = None, None, None
+    docker = None
     image, tag, layer = None, None, None
     try:
-        # Setup piping: curl | zstd | tarin
-        curl = Popen(['curl', '-#', '--fail', '-L', '--retry', '8', url], stdout=PIPE)
-        zstd = Popen(['zstd', '-d'], stdin=curl.stdout, stdout=PIPE)
-        tarin = tarfile.open(mode='r|', fileobj=zstd.stdout)
+        print("Downloading from {}".format(url))
+        # get_session() gets us a requests.Session set to retry several times.
+        req = get_session().get(url, stream=True)
+        req.raise_for_status()
+        decompressed_reader = IteratorReader(zstd.ZstdDecompressor().read_from(req.raw))
+        tarin = tarfile.open(
+            mode='r|',
+            fileobj=decompressed_reader,
+            bufsize=zstd.DECOMPRESSION_RECOMMENDED_OUTPUT_SIZE)
+
         # Seutp piping: tarout | docker
         docker = Popen(['docker', 'load'], stdin=PIPE)
         tarout = tarfile.open(mode='w|', fileobj=docker.stdin, format=tarfile.GNU_FORMAT)
 
         # Read from tarin and write to tarout
         for member in tarin:
             # Write non-file members directly (don't use extractfile on links)
             if not member.isfile():
@@ -170,30 +202,16 @@ def load_image(url, imageName=None, imag
                 reader = BytesIO(data)
                 member.size = len(data)
 
             # Add member and reader
             tarout.addfile(member, reader)
             reader.close()
         tarout.close()
     finally:
-        def trykill(proc):
-            try:
-                proc.kill()
-            except:
-                pass
-
-        # Check that all subprocesses finished correctly
-        if curl and curl.wait() != 0:
-            trykill(zstd)
-            trykill(docker)
-            raise Exception('failed to download from url: {}'.format(url))
-        if zstd and zstd.wait() != 0:
-            trykill(docker)
-            raise Exception('zstd decompression failed')
         if docker:
             docker.stdin.close()
         if docker and docker.wait() != 0:
             raise Exception('loading into docker failed')
 
     # Check that we found a repositories file
     if not image or not tag or not layer:
         raise Exception('No repositories file found!')