--- a/python/mozbuild/mozbuild/artifacts.py
+++ b/python/mozbuild/mozbuild/artifacts.py
@@ -93,19 +93,22 @@ NUM_PUSHHEADS_TO_QUERY_PER_PARENT = 50
# There isn't really such a thing as a reasonable default here, because we don't
# know how many pushheads we'll need to look at to find a build with our artifacts,
# and we don't know how many changesets will be in each push. For now we assume
# we'll find a build in the last 50 pushes, assuming each push contains 10 changesets.
NUM_REVISIONS_TO_QUERY = 500
MAX_CACHED_TASKS = 400 # Number of pushheads to cache Task Cluster task data for.
-# Number of downloaded artifacts to cache. Each artifact can be very large,
-# so don't make this to large! TODO: make this a size (like 500 megs) rather than an artifact count.
-MAX_CACHED_ARTIFACTS = 6
+# Minimum number of downloaded artifacts to keep. Each artifact can be very large,
+# so don't make this too large!
+MIN_CACHED_ARTIFACTS = 6
+
+# Maximum total size of downloaded artifacts to keep in the cache, in bytes (1 GiB).
+MAX_CACHED_ARTIFACTS_SIZE = 1024 * 1024 * 1024
# Downloaded artifacts are cached, and a subset of their contents extracted for
# easy installation. This is most noticeable on Mac OS X: since mounting and
# copying from DMG files is very slow, we extract the desired binaries to a
# separate archive for fast re-installation.
PROCESSED_SUFFIX = '.processed.jar'
CANDIDATE_TREES = (
@@ -659,47 +662,112 @@ class TaskCache(CacheManager):
# public/build/buildprops.json for this purpose.
url = get_artifact_url(taskId, artifact_name)
urls.append(url)
if not urls:
raise ValueError('Task for {namespace} existed, but no artifacts found!'.format(namespace=namespace))
return urls
-class ArtifactCache(CacheManager):
+class ArtifactPersistLimit(PersistLimit):
+ '''Handle persistence for artifacts cache
+
+ When instantiating a DownloadManager, it starts by filling the
+ PersistLimit instance it's given with register_dir_content.
+ In practice, this registers all the files already in the cache directory.
+ After a download finishes, the newly downloaded file is registered, and the
+ oldest files registered to the PersistLimit instance are removed depending
+ on the size and file limits it's configured for.
+ This is all good, but there are a few tweaks we want here:
+ - We have pickle files in the cache directory that we don't want purged.
+ - Files that were just downloaded in the same session shouldn't be purged.
+      (If for some reason we end up downloading more than the default maximum
+      size, we don't want those files to be purged.)
+    To achieve this, this subclass of PersistLimit inhibits the register_file
+    method for pickle files, and tracks which files were downloaded in the
+    current session so that they are never removed.
+
+    The register_file method may also be used to register cache hits, so that
+    later sessions know they were recently used.
+ '''
+
+ def __init__(self, log=None):
+ super(ArtifactPersistLimit, self).__init__(
+ size_limit=MAX_CACHED_ARTIFACTS_SIZE,
+ file_limit=MIN_CACHED_ARTIFACTS)
+ self._log = log
+ self._registering_dir = False
+ self._downloaded_now = set()
+
+ def log(self, *args, **kwargs):
+ if self._log:
+ self._log(*args, **kwargs)
+
+ def register_file(self, path):
+ if path.endswith('.pickle'):
+ return
+ if not self._registering_dir:
+            # Touch the file so that subsequent calls to a mach artifact
+            # command know it was recently used. remove_old_files relies on
+            # access time, but in various cases the access time is not updated
+            # when a file is merely read, so we force an update here.
+ try:
+ os.utime(path, None)
+ except OSError:
+ pass
+ self._downloaded_now.add(path)
+ super(ArtifactPersistLimit, self).register_file(path)
+
+ def register_dir_content(self, directory, pattern="*"):
+ self._registering_dir = True
+ super(ArtifactPersistLimit, self).register_dir_content(
+ directory, pattern)
+ self._registering_dir = False
+
+ def remove_old_files(self):
+ from dlmanager import fs
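+        # Purge the oldest files (by access time) first, and only while both
+        # the file count and the total size exceed their limits; files
+        # downloaded during this session are set aside and re-added to the
+        # tracked list afterwards so they are never purged.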
+ files = sorted(self.files, key=lambda f: f.stat.st_atime)
+ kept = []
+ while len(files) > self.file_limit and \
+ self._files_size >= self.size_limit:
+ f = files.pop(0)
+ if f.path in self._downloaded_now:
+ kept.append(f)
+ continue
+ fs.remove(f.path)
+ self.log(logging.INFO, 'artifact',
+ {'filename': f.path},
+ 'Purged artifact {filename}')
+ self._files_size -= f.stat.st_size
+ self.files = files + kept
+
+ def remove_all(self):
+ from dlmanager import fs
+ for f in self.files:
+ fs.remove(f.path)
+ self._files_size = 0
+ self.files = []
+
+
+class ArtifactCache(object):
'''Fetch Task Cluster artifact URLs and purge least recently used artifacts from disk.'''
def __init__(self, cache_dir, log=None, skip_cache=False):
- # TODO: instead of storing N artifact packages, store M megabytes.
- CacheManager.__init__(self, cache_dir, 'fetch', MAX_CACHED_ARTIFACTS, cache_callback=self.delete_file, log=log, skip_cache=skip_cache)
self._cache_dir = cache_dir
- size_limit = 1024 * 1024 * 1024 # 1Gb in bytes.
- file_limit = 4 # But always keep at least 4 old artifacts around.
- persist_limit = PersistLimit(size_limit, file_limit)
- self._download_manager = DownloadManager(self._cache_dir, persist_limit=persist_limit)
+ self._log = log
+ self._skip_cache = skip_cache
+ self._persist_limit = ArtifactPersistLimit(log)
+ self._download_manager = DownloadManager(
+ self._cache_dir, persist_limit=self._persist_limit)
self._last_dl_update = -1
- def delete_file(self, key, value):
- try:
- os.remove(value)
- self.log(logging.INFO, 'artifact',
- {'filename': value},
- 'Purged artifact {filename}')
- except (OSError, IOError):
- pass
+ def log(self, *args, **kwargs):
+ if self._log:
+ self._log(*args, **kwargs)
- try:
- os.remove(value + PROCESSED_SUFFIX)
- self.log(logging.INFO, 'artifact',
- {'filename': value + PROCESSED_SUFFIX},
- 'Purged processed artifact {filename}')
- except (OSError, IOError):
- pass
-
- @cachedmethod(operator.attrgetter('_cache'))
def fetch(self, url, force=False):
# We download to a temporary name like HASH[:16]-basename to
# differentiate among URLs with the same basenames. We used to then
# extract the build ID from the downloaded artifact and use it to make a
# human readable unique name, but extracting build IDs is time consuming
# (especially on Mac OS X, where we must mount a large DMG file).
hash = hashlib.sha256(url).hexdigest()[:16]
fname = hash + '-' + os.path.basename(url)
@@ -727,24 +795,38 @@ class ArtifactCache(CacheManager):
self._last_dl_update = now
self.log(logging.INFO, 'artifact',
{'bytes_so_far': bytes_so_far, 'total_size': total_size, 'percent': percent},
'Downloading... {percent:02.1f} %')
if dl:
dl.set_progress(download_progress)
dl.wait()
+ else:
+ # Avoid the file being removed if it was in the cache already.
+ path = os.path.join(self._cache_dir, fname)
+ self._persist_limit.register_file(path)
+
self.log(logging.INFO, 'artifact',
{'path': os.path.abspath(mozpath.join(self._cache_dir, fname))},
'Downloaded artifact to {path}')
return os.path.abspath(mozpath.join(self._cache_dir, fname))
finally:
# Cancel any background downloads in progress.
self._download_manager.cancel()
+ def clear_cache(self):
+ if self._skip_cache:
+ self.log(logging.DEBUG, 'artifact',
+ {},
+ 'Skipping cache: ignoring clear_cache!')
+ return
+
+ self._persist_limit.remove_all()
+
class Artifacts(object):
'''Maintain state to efficiently fetch build artifacts from a Firefox tree.'''
def __init__(self, tree, substs, defines, job=None, log=None,
cache_dir='.', hg=None, git=None, skip_cache=False,
topsrcdir=None):
if (hg and git) or (not hg and not git):
@@ -936,16 +1018,18 @@ class Artifacts(object):
self.log(logging.INFO, 'artifact',
{'filename': filename},
'Processing contents of {filename}')
self.log(logging.INFO, 'artifact',
{'processed_filename': processed_filename},
'Writing processed {processed_filename}')
self._artifact_job.process_artifact(filename, processed_filename)
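+        # Register the processed file with the persist limit so it is tracked
+        # (and eventually purged) alongside the raw downloaded artifact.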
+ self._artifact_cache._persist_limit.register_file(processed_filename)
+
self.log(logging.INFO, 'artifact',
{'processed_filename': processed_filename},
'Installing from processed {processed_filename}')
# Copy all .so files, avoiding modification where possible.
ensureParentDir(mozpath.join(distdir, '.dummy'))
with zipfile.ZipFile(processed_filename) as zf:
@@ -966,18 +1050,17 @@ class Artifacts(object):
perms |= stat.S_IWUSR | stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH # u+w, a+r.
os.chmod(n, perms)
return 0
def install_from_url(self, url, distdir):
self.log(logging.INFO, 'artifact',
{'url': url},
'Installing from {url}')
- with self._artifact_cache as artifact_cache: # The with block handles persistence.
- filename = artifact_cache.fetch(url)
+ filename = self._artifact_cache.fetch(url)
return self.install_from_file(filename, distdir)
def _install_from_hg_pushheads(self, hg_pushheads, distdir):
"""Iterate pairs (hg_hash, {tree-set}) associating hg revision hashes
and tree-sets they are known to be in, trying to download and
install from each.
"""
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozbuild/test/test_artifacts.py
@@ -0,0 +1,145 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import unicode_literals
+
+import os
+import mozunit
+import time
+import unittest
+from tempfile import mkdtemp
+from shutil import rmtree
+
+from mozbuild.artifacts import ArtifactCache
+from mozbuild import artifacts
+
+
+CONTENTS = {
+ 'http://server/foo': b'foo',
+ 'http://server/bar': b'bar' * 400,
+ 'http://server/qux': b'qux' * 400,
+ 'http://server/fuga': b'fuga' * 300,
+ 'http://server/hoge': b'hoge' * 300,
+ 'http://server/larger': b'larger' * 3000,
+}
+
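+# Minimal stand-ins for the session/response objects the DownloadManager
+# drives: only the pieces the downloader actually uses (headers, iter_content,
+# raise_for_status, close) are implemented, and they serve the canned CONTENTS
+# above instead of hitting the network.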
+class FakeResponse(object):
+ def __init__(self, content):
+ self._content = content
+
+ @property
+ def headers(self):
+ return {
+ 'Content-length': str(len(self._content))
+ }
+
+ def iter_content(self, chunk_size):
+ content = memoryview(self._content)
+ while content:
+ yield content[:chunk_size]
+ content = content[chunk_size:]
+
+ def raise_for_status(self):
+ pass
+
+ def close(self):
+ pass
+
+
+class FakeSession(object):
+ def get(self, url, stream=True):
+ assert stream is True
+ return FakeResponse(CONTENTS[url])
+
+
+class TestArtifactCache(unittest.TestCase):
+ def setUp(self):
+ self.min_cached_artifacts = artifacts.MIN_CACHED_ARTIFACTS
+ self.max_cached_artifacts_size = artifacts.MAX_CACHED_ARTIFACTS_SIZE
+ artifacts.MIN_CACHED_ARTIFACTS = 2
+ artifacts.MAX_CACHED_ARTIFACTS_SIZE = 4096
+
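+        # Replace os.utime with a deterministic version: every file registered
+        # with times=None gets a distinct, strictly increasing access time,
+        # which is what ArtifactPersistLimit.remove_old_files sorts on.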
+ self._real_utime = os.utime
+ os.utime = self.utime
+ self.timestamp = time.time() - 86400
+
+ self.tmpdir = mkdtemp()
+
+ def tearDown(self):
+ rmtree(self.tmpdir)
+ artifacts.MIN_CACHED_ARTIFACTS = self.min_cached_artifacts
+ artifacts.MAX_CACHED_ARTIFACTS_SIZE = self.max_cached_artifacts_size
+ os.utime = self._real_utime
+
+ def utime(self, path, times):
+ if times is None:
+ # Ensure all downloaded files have a different timestamp
+ times = (self.timestamp, self.timestamp)
+ self.timestamp += 2
+ self._real_utime(path, times)
+
+ def test_artifact_cache_persistence(self):
+ cache = ArtifactCache(self.tmpdir)
+ cache._download_manager.session = FakeSession()
+
+ path = cache.fetch('http://server/foo')
+ expected = [os.path.basename(path)]
+ self.assertEqual(os.listdir(self.tmpdir), expected)
+
+ path = cache.fetch('http://server/bar')
+ expected.append(os.path.basename(path))
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # We're downloading more than the cache size allows, but since it all
+        # happens in the same session, nothing is purged.
+ path = cache.fetch('http://server/qux')
+ expected.append(os.path.basename(path))
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+ path = cache.fetch('http://server/fuga')
+ expected.append(os.path.basename(path))
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+ cache = ArtifactCache(self.tmpdir)
+ cache._download_manager.session = FakeSession()
+
+ # Downloading a new file in a new session purges the oldest files in
+ # the cache.
+ path = cache.fetch('http://server/hoge')
+ expected.append(os.path.basename(path))
+ expected = expected[2:]
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+ # Downloading a file already in the cache leaves the cache untouched
+ cache = ArtifactCache(self.tmpdir)
+ cache._download_manager.session = FakeSession()
+
+ path = cache.fetch('http://server/qux')
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+        # bar was purged earlier; re-downloading it should purge the oldest
+        # downloaded file, which at this point would be qux, but qux was also
+        # re-downloaded in the meantime, so the next oldest one (fuga) should
+        # be the one purged.
+ cache = ArtifactCache(self.tmpdir)
+ cache._download_manager.session = FakeSession()
+
+ path = cache.fetch('http://server/bar')
+ expected.append(os.path.basename(path))
+ expected = [p for p in expected if 'fuga' not in p]
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+ # Downloading one file larger than the cache size should still leave
+ # MIN_CACHED_ARTIFACTS files.
+ cache = ArtifactCache(self.tmpdir)
+ cache._download_manager.session = FakeSession()
+
+ path = cache.fetch('http://server/larger')
+ expected.append(os.path.basename(path))
+ expected = expected[-2:]
+ self.assertEqual(sorted(os.listdir(self.tmpdir)), sorted(expected))
+
+
+if __name__ == '__main__':
+ mozunit.main()