Bug 1409876 - Record cache history to aid failure analysis; r?dustin
We introduce a per-cache .cachelog file containing important events
in the cache's history such as creation, requirements adjusting,
and utilization. If cache validation fails, we print the cache log.
If a previous task was responsible for getting the cache in a bad
state, its TASK_ID should be printed, allowing us to more easily
identify mis-configured tasks.
MozReview-Commit-ID: BJun5Hi5w0s
--- a/taskcluster/docker/recipes/run-task
+++ b/taskcluster/docker/recipes/run-task
@@ -290,16 +290,19 @@ def main(args):
# validate that use of the cache is acceptable.
#
# We /could/ blow away the cache data on requirements mismatch.
# While this would be convenient, this could result in "competing" tasks
# effectively undoing the other's work. This would slow down task
# execution in aggregate. Without monitoring for this, people may not notice
# the problem and tasks would be slower than they could be. We follow the
# principle of "fail fast" to ensure optimal task execution.
+ #
+ # We also write an audit log of who used the caches. This log is printed
+ # during failures to help aid debugging.
if 'TASKCLUSTER_CACHES' in os.environ:
caches = os.environ['TASKCLUSTER_CACHES'].split(';')
del os.environ['TASKCLUSTER_CACHES']
else:
caches = []
if 'TASKCLUSTER_UNTRUSTED_CACHES' in os.environ:
@@ -316,37 +319,48 @@ def main(args):
# to force the use of a new cache.
b'version=1',
# Include the UID and GID the task will run as to ensure that tasks
# with different UID and GID don't share the same cache.
b'uid=%d' % uid,
b'gid=%d' % gid,
}
+ def write_audit_entry(path, msg):
+ now = datetime.datetime.utcnow().isoformat()
+ with open(path, 'ab') as fh:
+ fh.write(b'[%sZ %s] %s\n' % (
+ now, os.environ.get('TASK_ID', 'UNKNOWN'), msg))
+
for cache in caches:
if not os.path.isdir(cache):
print('error: cache %s is not a directory; this should never '
'happen' % cache)
return 1
requires_path = os.path.join(cache, '.cacherequires')
+ audit_path = os.path.join(cache, '.cachelog')
# The cache is empty. Configure it.
if not os.listdir(cache):
print_line(b'cache', b'cache %s is empty; writing requirements: '
b'%s\n' % (cache, b' '.join(sorted(our_requirements))))
# We write a requirements file so future invocations know what the
# requirements are.
with open(requires_path, 'wb') as fh:
fh.write(b'\n'.join(sorted(our_requirements)))
# And make it read-only as a precaution against deletion.
os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
+ write_audit_entry(audit_path,
+ 'created; requirements: %s' %
+ ', '.join(sorted(our_requirements)))
+
set_dir_permissions(cache, uid, gid)
# The cache has content and we have a requirements file. Validate
# requirements alignment.
elif os.path.exists(requires_path):
with open(requires_path, 'rb') as fh:
wanted_requirements = set(fh.read().splitlines())
@@ -369,39 +383,57 @@ def main(args):
b'changing ownership to facilitate cache use\n' %
cache)
chown_recursive(cache, user.pw_name, group.gr_name, uid, gid)
# And write out the updated reality.
with open(requires_path, 'wb') as fh:
fh.write(b'\n'.join(sorted(our_requirements)))
+ write_audit_entry(audit_path,
+ 'chown; requirements: %s' %
+ ', '.join(sorted(our_requirements)))
+
elif missing:
print('error: requirements for populated cache %s differ from '
'this task' % cache)
print('cache requirements: %s' % ' '.join(sorted(
wanted_requirements)))
print('our requirements: %s' % ' '.join(sorted(
our_requirements)))
if any(s.startswith(('uid=', 'gid=')) for s in missing):
print(CACHE_UID_GID_MISMATCH)
+
+ write_audit_entry(audit_path,
+ 'requirements mismatch; wanted: %s' %
+ ', '.join(sorted(our_requirements)))
+
+ print('')
+ print('audit log:')
+ with open(audit_path, 'rb') as fh:
+ print(fh.read())
+
return 1
+ else:
+ write_audit_entry(audit_path, 'used')
# We don't need to adjust permissions here because the cache is
# associated with a uid/gid and the first task should have set
# a proper owner/group.
# The cache has content and no requirements file. This shouldn't
# happen because run-task should be the first thing that touches a
# cache.
else:
print('error: cache %s is not empty and is missing a '
'.cacherequires file; the cache names for this task are '
'likely mis-configured or TASKCLUSTER_CACHES is not set '
'properly' % cache)
+
+ write_audit_entry(audit_path, 'missing .cacherequires')
return 1
if 'TASKCLUSTER_VOLUMES' in os.environ:
volumes = os.environ['TASKCLUSTER_VOLUMES'].split(';')
del os.environ['TASKCLUSTER_VOLUMES']
else:
volumes = []