Bug 1409876 - Record cache history to aid failure analysis; r?dustin draft
authorGregory Szorc <gps@mozilla.com>
Wed, 25 Oct 2017 16:58:37 -0700
changeset 686953 af411ca4a3ec46f517eec253395cbf174e9719d2
parent 686540 64bab5cbb9b63808d04babfbcfba3175fd99f69d
child 686954 4a8a9d1528101d3a6734839c3c7722bb58be7202
push id86355
push usergszorc@mozilla.com
push dateThu, 26 Oct 2017 16:53:42 +0000
reviewersdustin
bugs1409876
milestone58.0a1
Bug 1409876 - Record cache history to aid failure analysis; r?dustin We introduce a per-cache .cachelog file containing important events in the cache's history such as creation, requirements adjusting, and utilization. If cache validation fails, we print the cache log. If a previous task was responsible for getting the cache in a bad state, its TASK_ID should be printed, allowing us to more easily identify mis-configured tasks. MozReview-Commit-ID: BJun5Hi5w0s
taskcluster/docker/recipes/run-task
--- a/taskcluster/docker/recipes/run-task
+++ b/taskcluster/docker/recipes/run-task
@@ -290,16 +290,19 @@ def main(args):
     # validate that use of the cache is acceptable.
     #
     # We /could/ blow away the cache data on requirements mismatch.
     # While this would be convenient, this could result in "competing" tasks
     # effectively undoing the other's work. This would slow down task
     # execution in aggregate. Without monitoring for this, people may not notice
     # the problem and tasks would be slower than they could be. We follow the
     # principle of "fail fast" to ensure optimal task execution.
+    #
+    # We also write an audit log of who used the caches. This log is printed
+    # during failures to help aid debugging.
 
     if 'TASKCLUSTER_CACHES' in os.environ:
         caches = os.environ['TASKCLUSTER_CACHES'].split(';')
         del os.environ['TASKCLUSTER_CACHES']
     else:
         caches = []
 
     if 'TASKCLUSTER_UNTRUSTED_CACHES' in os.environ:
@@ -316,37 +319,48 @@ def main(args):
         # to force the use of a new cache.
         b'version=1',
         # Include the UID and GID the task will run as to ensure that tasks
         # with different UID and GID don't share the same cache.
         b'uid=%d' % uid,
         b'gid=%d' % gid,
     }
 
+    def write_audit_entry(path, msg):
+        now = datetime.datetime.utcnow().isoformat()
+        with open(path, 'ab') as fh:
+            fh.write(b'[%sZ %s] %s\n' % (
+                     now, os.environ.get('TASK_ID', 'UNKNOWN'), msg))
+
     for cache in caches:
         if not os.path.isdir(cache):
             print('error: cache %s is not a directory; this should never '
                   'happen' % cache)
             return 1
 
         requires_path = os.path.join(cache, '.cacherequires')
+        audit_path = os.path.join(cache, '.cachelog')
 
         # The cache is empty. Configure it.
         if not os.listdir(cache):
             print_line(b'cache', b'cache %s is empty; writing requirements: '
                        b'%s\n' % (cache, b' '.join(sorted(our_requirements))))
 
             # We write a requirements file so future invocations know what the
             # requirements are.
             with open(requires_path, 'wb') as fh:
                 fh.write(b'\n'.join(sorted(our_requirements)))
 
             # And make it read-only as a precaution against deletion.
             os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
 
+            write_audit_entry(audit_path,
+                              'created; requirements: %s' %
+                              ', '.join(sorted(our_requirements)))
+
             set_dir_permissions(cache, uid, gid)
 
         # The cache has content and we have a requirements file. Validate
         # requirements alignment.
         elif os.path.exists(requires_path):
             with open(requires_path, 'rb') as fh:
                 wanted_requirements = set(fh.read().splitlines())
 
@@ -369,39 +383,57 @@ def main(args):
                            b'changing ownership to facilitate cache use\n' %
                            cache)
                 chown_recursive(cache, user.pw_name, group.gr_name, uid, gid)
 
                 # And write out the updated reality.
                 with open(requires_path, 'wb') as fh:
                     fh.write(b'\n'.join(sorted(our_requirements)))
 
+                write_audit_entry(audit_path,
+                                  'chown; requirements: %s' %
+                                  ', '.join(sorted(our_requirements)))
+
             elif missing:
                 print('error: requirements for populated cache %s differ from '
                       'this task' % cache)
                 print('cache requirements: %s' % ' '.join(sorted(
                     wanted_requirements)))
                 print('our requirements:   %s' % ' '.join(sorted(
                     our_requirements)))
                 if any(s.startswith(('uid=', 'gid=')) for s in missing):
                     print(CACHE_UID_GID_MISMATCH)
+
+                write_audit_entry(audit_path,
+                                  'requirements mismatch; wanted: %s' %
+                                  ', '.join(sorted(our_requirements)))
+
+                print('')
+                print('audit log:')
+                with open(audit_path, 'rb') as fh:
+                    print(fh.read())
+
                 return 1
+            else:
+                write_audit_entry(audit_path, 'used')
 
             # We don't need to adjust permissions here because the cache is
             # associated with a uid/gid and the first task should have set
             # a proper owner/group.
 
         # The cache has content and no requirements file. This shouldn't
         # happen because run-task should be the first thing that touches a
         # cache.
         else:
             print('error: cache %s is not empty and is missing a '
                   '.cacherequires file; the cache names for this task are '
                   'likely mis-configured or TASKCLUSTER_CACHES is not set '
                   'properly' % cache)
+
+            write_audit_entry(audit_path, 'missing .cacherequires')
             return 1
 
     if 'TASKCLUSTER_VOLUMES' in os.environ:
         volumes = os.environ['TASKCLUSTER_VOLUMES'].split(';')
         del os.environ['TASKCLUSTER_VOLUMES']
     else:
         volumes = []