Bug 1343718 - Limit artifact downloads by size rather than by number. r=chmanchester draft
author Mike Hommey <mh+mozilla@glandium.org>
Wed, 12 Apr 2017 16:17:52 +0900
changeset 562405 fbfe40552b2be4940ff66cea8619e84e20a15252
parent 562404 4bfd15f53cf0885a8b0984c0c8532a612f0bb4ae
child 562411 766d07981229c686e8c16aa07c7d5e0268763f18
push id 54023
push user bmo:mh+mozilla@glandium.org
push date Thu, 13 Apr 2017 23:16:24 +0000
reviewers chmanchester
bugs 1343718
milestone 55.0a1
Bug 1343718 - Limit artifact downloads by size rather than by number. r=chmanchester

This adds a unit test for the expected behavior, and adds the modules from mach_bootstrap.py that were missing from the virtualenv so that the test can run.
build/virtualenv_packages.txt
python/mozbuild/mozbuild/artifacts.py
python/mozbuild/mozbuild/test/python.ini
python/mozbuild/mozbuild/test/test_artifacts.py
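
As context for the diff below, here is a minimal, self-contained sketch (not part of the patch) of the purge policy the new ArtifactPersistLimit implements: artifacts are removed oldest-first by access time until the total size drops below MAX_CACHED_ARTIFACTS_SIZE, but never fewer than MIN_CACHED_ARTIFACTS files are kept, and files downloaded in the current session are never removed. CachedFile and files_to_purge are hypothetical names used only for illustration.

    import collections

    # Illustrative stand-in for the (path, stat) entries PersistLimit tracks.
    CachedFile = collections.namedtuple('CachedFile', 'path size atime')

    MAX_CACHED_ARTIFACTS_SIZE = 1024 * 1024 * 1024  # 1 GiB total
    MIN_CACHED_ARTIFACTS = 6


    def files_to_purge(files, downloaded_now):
        """Return the paths to remove from the cache, oldest first."""
        files = sorted(files, key=lambda f: f.atime)
        total = sum(f.size for f in files)
        purged = []
        while len(files) > MIN_CACHED_ARTIFACTS and total >= MAX_CACHED_ARTIFACTS_SIZE:
            f = files.pop(0)
            if f.path in downloaded_now:
                # Files downloaded in this session are never purged.
                continue
            purged.append(f.path)
            total -= f.size
        return purged
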
--- a/build/virtualenv_packages.txt
+++ b/build/virtualenv_packages.txt
@@ -33,8 +33,11 @@ pyasn1_modules.pth:python/pyasn1-modules
 redo.pth:python/redo
 requests.pth:python/requests
 rsa.pth:python/rsa
 futures.pth:python/futures
 ecc.pth:python/PyECC
 xpcshell.pth:testing/xpcshell
 pyyaml.pth:python/pyyaml/lib
 pytoml.pth:python/pytoml
+pylru.pth:python/pylru
+taskcluster.pth:taskcluster
+dlmanager.pth:python/dlmanager
--- a/python/mozbuild/mozbuild/artifacts.py
+++ b/python/mozbuild/mozbuild/artifacts.py
@@ -93,19 +93,22 @@ NUM_PUSHHEADS_TO_QUERY_PER_PARENT = 50  
 # There isn't really such a thing as a reasonable default here, because we don't
 # know how many pushheads we'll need to look at to find a build with our artifacts,
 # and we don't know how many changesets will be in each push. For now we assume
 # we'll find a build in the last 50 pushes, assuming each push contains 10 changesets.
 NUM_REVISIONS_TO_QUERY = 500
 
 MAX_CACHED_TASKS = 400  # Number of pushheads to cache Task Cluster task data for.
 
-# Number of downloaded artifacts to cache.  Each artifact can be very large,
-# so don't make this to large!  TODO: make this a size (like 500 megs) rather than an artifact count.
-MAX_CACHED_ARTIFACTS = 6
+# Minimum number of downloaded artifacts to keep. Each artifact can be very
+# large, so don't make this too large!
+MIN_CACHED_ARTIFACTS = 6
+
+# Maximum total size of the downloaded artifacts to keep in cache, in bytes (1GiB).
+MAX_CACHED_ARTIFACTS_SIZE = 1024 * 1024 * 1024
 
 # Downloaded artifacts are cached, and a subset of their contents extracted for
 # easy installation.  This is most noticeable on Mac OS X: since mounting and
 # copying from DMG files is very slow, we extract the desired binaries to a
 # separate archive for fast re-installation.
 PROCESSED_SUFFIX = '.processed.jar'
 
 CANDIDATE_TREES = (
@@ -659,47 +662,112 @@ class TaskCache(CacheManager):
             # public/build/buildprops.json for this purpose.
             url = get_artifact_url(taskId, artifact_name)
             urls.append(url)
         if not urls:
             raise ValueError('Task for {namespace} existed, but no artifacts found!'.format(namespace=namespace))
         return urls
 
 
-class ArtifactCache(CacheManager):
+class ArtifactPersistLimit(PersistLimit):
+    '''Handle persistence for the artifact cache.
+
+    When a DownloadManager is instantiated, it starts by filling the
+    PersistLimit instance it's given via register_dir_content.
+    In practice, this registers all the files already in the cache directory.
+    After a download finishes, the newly downloaded file is registered, and the
+    oldest files registered to the PersistLimit instance are removed depending
+    on the size and file limits it's configured for.
+    This is all good, but there are a few tweaks we want here:
+    - We have pickle files in the cache directory that we don't want purged.
+    - Files that were just downloaded in the same session shouldn't be purged.
+      (if for some reason we end up downloading more than the default max size,
+       we don't want the files to be purged)
+    To that end, this PersistLimit subclass skips register_file for pickle
+    files and tracks which files were downloaded in the current session so
+    that they are never removed.
+
+    The register_file method may be used to register cache matches too, so that
+    later sessions know they were freshly used.
+    '''
+
+    def __init__(self, log=None):
+        super(ArtifactPersistLimit, self).__init__(
+            size_limit=MAX_CACHED_ARTIFACTS_SIZE,
+            file_limit=MIN_CACHED_ARTIFACTS)
+        self._log = log
+        self._registering_dir = False
+        self._downloaded_now = set()
+
+    def log(self, *args, **kwargs):
+        if self._log:
+            self._log(*args, **kwargs)
+
+    def register_file(self, path):
+        if path.endswith('.pickle'):
+            return
+        if not self._registering_dir:
+            # Touch the file so that subsequent calls to a mach artifact
+            # command know it was recently used. remove_old_files is based
+            # on access time, but in various cases the access time is not
+            # updated when a file is merely read, so we force an update.
+            try:
+                os.utime(path, None)
+            except OSError:
+                pass
+            self._downloaded_now.add(path)
+        super(ArtifactPersistLimit, self).register_file(path)
+
+    def register_dir_content(self, directory, pattern="*"):
+        self._registering_dir = True
+        super(ArtifactPersistLimit, self).register_dir_content(
+            directory, pattern)
+        self._registering_dir = False
+
+    def remove_old_files(self):
+        from dlmanager import fs
+        files = sorted(self.files, key=lambda f: f.stat.st_atime)
+        kept = []
+        while len(files) > self.file_limit and \
+                self._files_size >= self.size_limit:
+            f = files.pop(0)
+            if f.path in self._downloaded_now:
+                kept.append(f)
+                continue
+            fs.remove(f.path)
+            self.log(logging.INFO, 'artifact',
+                {'filename': f.path},
+                'Purged artifact {filename}')
+            self._files_size -= f.stat.st_size
+        self.files = files + kept
+
+    def remove_all(self):
+        from dlmanager import fs
+        for f in self.files:
+            fs.remove(f.path)
+        self._files_size = 0
+        self.files = []
+
+
+class ArtifactCache(object):
     '''Fetch Task Cluster artifact URLs and purge least recently used artifacts from disk.'''
 
     def __init__(self, cache_dir, log=None, skip_cache=False):
-        # TODO: instead of storing N artifact packages, store M megabytes.
-        CacheManager.__init__(self, cache_dir, 'fetch', MAX_CACHED_ARTIFACTS, cache_callback=self.delete_file, log=log, skip_cache=skip_cache)
         self._cache_dir = cache_dir
-        size_limit = 1024 * 1024 * 1024 # 1Gb in bytes.
-        file_limit = 4 # But always keep at least 4 old artifacts around.
-        persist_limit = PersistLimit(size_limit, file_limit)
-        self._download_manager = DownloadManager(self._cache_dir, persist_limit=persist_limit)
+        self._log = log
+        self._skip_cache = skip_cache
+        self._persist_limit = ArtifactPersistLimit(log)
+        self._download_manager = DownloadManager(
+            self._cache_dir, persist_limit=self._persist_limit)
         self._last_dl_update = -1
 
-    def delete_file(self, key, value):
-        try:
-            os.remove(value)
-            self.log(logging.INFO, 'artifact',
-                {'filename': value},
-                'Purged artifact {filename}')
-        except (OSError, IOError):
-            pass
+    def log(self, *args, **kwargs):
+        if self._log:
+            self._log(*args, **kwargs)
 
-        try:
-            os.remove(value + PROCESSED_SUFFIX)
-            self.log(logging.INFO, 'artifact',
-                {'filename': value + PROCESSED_SUFFIX},
-                'Purged processed artifact {filename}')
-        except (OSError, IOError):
-            pass
-
-    @cachedmethod(operator.attrgetter('_cache'))
     def fetch(self, url, force=False):
         # We download to a temporary name like HASH[:16]-basename to
         # differentiate among URLs with the same basenames.  We used to then
         # extract the build ID from the downloaded artifact and use it to make a
         # human readable unique name, but extracting build IDs is time consuming
         # (especially on Mac OS X, where we must mount a large DMG file).
         hash = hashlib.sha256(url).hexdigest()[:16]
         fname = hash + '-' + os.path.basename(url)
@@ -727,24 +795,38 @@ class ArtifactCache(CacheManager):
                 self._last_dl_update = now
                 self.log(logging.INFO, 'artifact',
                          {'bytes_so_far': bytes_so_far, 'total_size': total_size, 'percent': percent},
                          'Downloading... {percent:02.1f} %')
 
             if dl:
                 dl.set_progress(download_progress)
                 dl.wait()
+            else:
+                # Avoid the file being removed if it was in the cache already.
+                path = os.path.join(self._cache_dir, fname)
+                self._persist_limit.register_file(path)
+
             self.log(logging.INFO, 'artifact',
                 {'path': os.path.abspath(mozpath.join(self._cache_dir, fname))},
                 'Downloaded artifact to {path}')
             return os.path.abspath(mozpath.join(self._cache_dir, fname))
         finally:
             # Cancel any background downloads in progress.
             self._download_manager.cancel()
 
+    def clear_cache(self):
+        if self._skip_cache:
+            self.log(logging.DEBUG, 'artifact',
+                {},
+                'Skipping cache: ignoring clear_cache!')
+            return
+
+        self._persist_limit.remove_all()
+
 
 class Artifacts(object):
     '''Maintain state to efficiently fetch build artifacts from a Firefox tree.'''
 
     def __init__(self, tree, substs, defines, job=None, log=None,
                  cache_dir='.', hg=None, git=None, skip_cache=False,
                  topsrcdir=None):
         if (hg and git) or (not hg and not git):
@@ -936,16 +1018,18 @@ class Artifacts(object):
             self.log(logging.INFO, 'artifact',
                 {'filename': filename},
                 'Processing contents of {filename}')
             self.log(logging.INFO, 'artifact',
                 {'processed_filename': processed_filename},
                 'Writing processed {processed_filename}')
             self._artifact_job.process_artifact(filename, processed_filename)
 
+        self._artifact_cache._persist_limit.register_file(processed_filename)
+
         self.log(logging.INFO, 'artifact',
             {'processed_filename': processed_filename},
             'Installing from processed {processed_filename}')
 
         # Copy all .so files, avoiding modification where possible.
         ensureParentDir(mozpath.join(distdir, '.dummy'))
 
         with zipfile.ZipFile(processed_filename) as zf:
@@ -966,18 +1050,17 @@ class Artifacts(object):
                     perms |= stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH # u+w, a+r.
                     os.chmod(n, perms)
         return 0
 
     def install_from_url(self, url, distdir):
         self.log(logging.INFO, 'artifact',
             {'url': url},
             'Installing from {url}')
-        with self._artifact_cache as artifact_cache:  # The with block handles persistence.
-            filename = artifact_cache.fetch(url)
+        filename = self._artifact_cache.fetch(url)
         return self.install_from_file(filename, distdir)
 
     def _install_from_hg_pushheads(self, hg_pushheads, distdir):
         """Iterate pairs (hg_hash, {tree-set}) associating hg revision hashes
         and tree-sets they are known to be in, trying to download and
         install from each.
         """
 
--- a/python/mozbuild/mozbuild/test/python.ini
+++ b/python/mozbuild/mozbuild/test/python.ini
@@ -21,16 +21,17 @@
 [configure/test_util.py]
 [controller/test_ccachestats.py]
 [controller/test_clobber.py]
 [frontend/test_context.py]
 [frontend/test_emitter.py]
 [frontend/test_namespaces.py]
 [frontend/test_reader.py]
 [frontend/test_sandbox.py]
+[test_artifacts.py]
 [test_base.py]
 [test_containers.py]
 [test_dotproperties.py]
 [test_expression.py]
 [test_jarmaker.py]
 [test_line_endings.py]
 [test_makeutil.py]
 [test_mozconfig.py]
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozbuild/test/test_artifacts.py
@@ -0,0 +1,145 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import unicode_literals
+
+import os
+import mozunit
+import time
+import unittest
+from tempfile import mkdtemp
+from shutil import rmtree
+
+from mozbuild.artifacts import ArtifactCache
+from mozbuild import artifacts
+
+
+CONTENTS = {
+    'http://server/foo': b'foo',
+    'http://server/bar': b'bar' * 400,
+    'http://server/qux': b'qux' * 400,
+    'http://server/fuga': b'fuga' * 300,
+    'http://server/hoge': b'hoge' * 300,
+    'http://server/larger': b'larger' * 3000,
+}
+
+class FakeResponse(object):
+    def __init__(self, content):
+        self._content = content
+
+    @property
+    def headers(self):
+        return {
+            'Content-length': str(len(self._content))
+        }
+
+    def iter_content(self, chunk_size):
+        content = memoryview(self._content)
+        while content:
+            yield content[:chunk_size]
+            content = content[chunk_size:]
+
+    def raise_for_status(self):
+        pass
+
+    def close(self):
+        pass
+
+
+class FakeSession(object):
+    def get(self, url, stream=True):
+        assert stream is True
+        return FakeResponse(CONTENTS[url])
+
+
+class TestArtifactCache(unittest.TestCase):
+    def setUp(self):
+        self.min_cached_artifacts = artifacts.MIN_CACHED_ARTIFACTS
+        self.max_cached_artifacts_size = artifacts.MAX_CACHED_ARTIFACTS_SIZE
+        artifacts.MIN_CACHED_ARTIFACTS = 2
+        artifacts.MAX_CACHED_ARTIFACTS_SIZE = 4096
+
+        self._real_utime = os.utime
+        os.utime = self.utime
+        self.timestamp = time.time() - 86400
+
+        self.tmpdir = mkdtemp()
+
+    def tearDown(self):
+        rmtree(self.tmpdir)
+        artifacts.MIN_CACHED_ARTIFACTS = self.min_cached_artifacts
+        artifacts.MAX_CACHED_ARTIFACTS_SIZE = self.max_cached_artifacts_size
+        os.utime = self._real_utime
+
+    def utime(self, path, times):
+        if times is None:
+            # Ensure all downloaded files have different timestamps.
+            times = (self.timestamp, self.timestamp)
+            self.timestamp += 2
+        self._real_utime(path, times)
+
+    def test_artifact_cache_persistence(self):
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/foo')
+        expected = [os.path.basename(path)]
+        self.assertEqual(os.listdir(self.tmpdir), expected)
+
+        path = cache.fetch('http://server/bar')
+        expected.append(os.path.basename(path))
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # We're downloading more than the cache size allows, but since it's
+        # all in the same session, no purge happens.
+        path = cache.fetch('http://server/qux')
+        expected.append(os.path.basename(path))
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        path = cache.fetch('http://server/fuga')
+        expected.append(os.path.basename(path))
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        # Downloading a new file in a new session purges the oldest files in
+        # the cache.
+        path = cache.fetch('http://server/hoge')
+        expected.append(os.path.basename(path))
+        expected = expected[2:]
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # Downloading a file already in the cache leaves the cache untouched
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/qux')
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # bar was purged earlier; re-downloading it should purge the oldest
+        # downloaded file, which at this point would be qux, but since we
+        # also re-downloaded qux in the meantime, the next oldest file (fuga)
+        # should be the one purged.
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/bar')
+        expected.append(os.path.basename(path))
+        expected = [p for p in expected if 'fuga' not in p]
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # Downloading one file larger than the cache size should still leave
+        # MIN_CACHED_ARTIFACTS files.
+        cache = ArtifactCache(self.tmpdir)
+        cache._download_manager.session = FakeSession()
+
+        path = cache.fetch('http://server/larger')
+        expected.append(os.path.basename(path))
+        expected = expected[-2:]
+        self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+
+if __name__ == '__main__':
+    mozunit.main()
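
For reference, a minimal usage sketch (not part of the patch) of the new ArtifactCache API: install_from_url now calls fetch() directly instead of going through the former CacheManager context manager, and purging is delegated to the DownloadManager via ArtifactPersistLimit. The cache directory and URL below are placeholders.

    from mozbuild.artifacts import ArtifactCache

    # Placeholder path and URL; fetch() downloads the artifact (or reuses a
    # cached copy) and registers it with the persist limit so it is not
    # purged during this session.
    cache = ArtifactCache('/tmp/artifact-cache', log=None, skip_cache=False)
    local_path = cache.fetch('http://server/target.tar.bz2')
    # clear_cache() removes every file tracked by the persist limit.
    cache.clear_cache()
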