Bug 1391789 - Add UID and GID to cache parameters; r?dustin draft
authorGregory Szorc <gps@mozilla.com>
Fri, 18 Aug 2017 15:30:15 -0700
changeset 649334 5d2ea08eda6d65dd05cd6cb2b7276e3ea648e722
parent 649333 96125a25f54afbeb126c792d6ff66ab29e38dc11
child 727063 cbb9942f0f4371697fc83ec1f288c11580e9ddd7
push id75015
push userbmo:gps@mozilla.com
push dateSat, 19 Aug 2017 00:52:56 +0000
reviewersdustin
bugs1391789
milestone57.0a1
Bug 1391789 - Add UID and GID to cache parameters; r?dustin The UID and GID that a task executes under are dynamic. As a result, caches need to be aware of the UID and GID that own files; otherwise, subsequent tasks could run into permission denied errors. This is why `run-task --chown-recursive` exists. By recursively changing ownership of persisted files, we ensure the current task is able to read and write all existing files. When you take a step back, you realize that chowning of cached files is an expensive workaround. Yes, this results in cache hits. But the cost is you potentially have to perform hundreds of thousands of I/O system calls to mass chown. The ideal situation is that the UID/GID is consistent across tasks on any given cache, so that potentially expensive permissions setting can be avoided. So, that's what this commit does. We add the task's UID and GID to run-task's requirements. When we first see a cache, we record a UID and GID with it and chown the empty cache directory to that UID and GID. Subsequent tasks using this cache *must* use the same UID and GID or else run-task will fail. Since run-task now guarantees that all cache consumers use the same UID and GID, we can avoid a potentially expensive recursive chown. So we eliminate --chown-recursive for all caches. Ideally we'd eliminate it everywhere. But some paths are only sometimes caches. We'll likely need a primitive that declares "potentially cached" paths so run-task can chown them if and only if they aren't caches. This change will shine light on all tasks using inconsistent UID and GID values on the same cache. Bustage is expected. Unfortunately, we can't easily know what will break. So it will be one of those things where we will have to fix problems as they arise. Fortunately, because caches are now tied to the content of run-task, we only need to back out this change and things should revert to caches without UID and GID pinning requirements, and everything will work again. 
MozReview-Commit-ID: 2ka4rOnnXIp
taskcluster/docker/recipes/run-task
taskcluster/taskgraph/transforms/job/hazard.py
taskcluster/taskgraph/transforms/job/mozharness.py
taskcluster/taskgraph/transforms/job/spidermonkey.py
taskcluster/taskgraph/transforms/job/toolchain.py
--- a/taskcluster/docker/recipes/run-task
+++ b/taskcluster/docker/recipes/run-task
@@ -280,34 +280,44 @@ def main(args):
 
     our_requirements = {
         # Include a version string that we can bump whenever to trigger
         # fresh caches. The actual value is not relevant and doesn't need
         # to follow any explicit order. Since taskgraph bakes this file's
         # hash into cache names, any change to this file/version is sufficient
         # to force the use of a new cache.
         b'version=1',
+        # Include the UID and GID the task will run as to ensure that tasks
+        # with different UID and GID don't share the same cache.
+        b'uid=%d' % uid,
+        b'gid=%d' % gid,
     }
 
     for cache in caches:
         if not os.path.isdir(cache):
             print('cache %s is not a directory; this should never happen')
             return 1
 
         requires_path = os.path.join(cache, '.cacherequires')
 
-        # The cache is empty. No validation necessary. Just set up our
-        # requirements file.
+        # The cache is empty. Configure it.
         if not os.listdir(cache):
+            # We write a requirements file so future invocations know what the
+            # requirements are.
             with open(requires_path, 'wb') as fh:
                 fh.write(b'\n'.join(sorted(our_requirements)))
 
             # And make it read-only as a precaution against deletion.
             os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
 
+            # Since the uid/gid is baked into the requirements, set the cache as
+            # owned by this uid/gid, since it should be the only uid/gid to
+            # ever use it.
+            set_dir_permissions(cache, uid, gid)
+
         # The cache has content and we have a requirements file. Validate
         # requirements alignment.
         elif os.path.exists(requires_path):
             with open(requires_path, 'rb') as fh:
                 wanted_requirements = set(fh.read().splitlines())
 
                 missing = wanted_requirements - our_requirements
                 if missing:
--- a/taskcluster/taskgraph/transforms/job/hazard.py
+++ b/taskcluster/taskgraph/transforms/job/hazard.py
@@ -61,14 +61,13 @@ def docker_worker_hazard(config, job, ta
         env['MOZCONFIG'] = run['mozconfig']
 
     # build-haz-linux.sh needs this otherwise it assumes the checkout is in
     # the workspace.
     env['GECKO_DIR'] = '/home/worker/checkouts/gecko'
 
     worker['command'] = [
         '/home/worker/bin/run-task',
-        '--chown-recursive', '/home/worker/tooltool-cache',
         '--chown-recursive', '/home/worker/workspace',
         '--vcs-checkout', '/home/worker/checkouts/gecko',
         '--',
         '/bin/bash', '-c', run['command']
     ]
--- a/taskcluster/taskgraph/transforms/job/mozharness.py
+++ b/taskcluster/taskgraph/transforms/job/mozharness.py
@@ -166,25 +166,23 @@ def mozharness_on_docker_worker_setup(co
     worker['retry-exit-status'] = 4
 
     docker_worker_setup_secrets(config, job, taskdesc)
 
     command = [
         '/home/worker/bin/run-task',
         # Various caches/volumes are default owned by root:root.
         '--chown-recursive', '/home/worker/workspace',
-        '--chown-recursive', '/home/worker/tooltool-cache',
         '--vcs-checkout', '/home/worker/workspace/build/src',
         '--tools-checkout', '/home/worker/workspace/build/tools',
         '--',
+        '/home/worker/workspace/build/src/{}'.format(
+            run.get('job-script', 'taskcluster/scripts/builder/build-linux.sh')
+        ),
     ]
-    command.append("/home/worker/workspace/build/src/{}".format(
-        run.get('job-script',
-                "taskcluster/scripts/builder/build-linux.sh"
-                )))
 
     worker['command'] = command
 
 
 @run_job_using("generic-worker", "mozharness", schema=mozharness_run_schema)
 def mozharness_on_generic_worker(config, job, taskdesc):
     assert job['worker']['os'] == 'windows', 'only supports windows right now'
 
--- a/taskcluster/taskgraph/transforms/job/spidermonkey.py
+++ b/taskcluster/taskgraph/transforms/job/spidermonkey.py
@@ -60,16 +60,16 @@ def docker_worker_spidermonkey(config, j
     script = "build-sm.sh"
     if run['using'] == 'spidermonkey-package':
         script = "build-sm-package.sh"
     elif run['using'] == 'spidermonkey-mozjs-crate':
         script = "build-sm-mozjs-crate.sh"
 
     worker['command'] = [
         '/home/worker/bin/run-task',
+        # Not always a cache. So not consistently owned.
         '--chown-recursive', '/home/worker/workspace',
-        '--chown-recursive', '/home/worker/tooltool-cache',
         '--vcs-checkout', '/home/worker/workspace/build/src',
         '--',
         '/bin/bash',
         '-c',
         'cd /home/worker && workspace/build/src/taskcluster/scripts/builder/%s' % script
     ]
--- a/taskcluster/taskgraph/transforms/job/toolchain.py
+++ b/taskcluster/taskgraph/transforms/job/toolchain.py
@@ -115,17 +115,16 @@ def docker_worker_toolchain(config, job,
     if run['tooltool-downloads']:
         internal = run['tooltool-downloads'] == 'internal'
         docker_worker_add_tooltool(config, job, taskdesc, internal=internal)
 
     worker['command'] = [
         '/home/worker/bin/run-task',
         # Various caches/volumes are default owned by root:root.
         '--chown-recursive', '/home/worker/workspace',
-        '--chown-recursive', '/home/worker/tooltool-cache',
         '--vcs-checkout=/home/worker/workspace/build/src',
         '--',
         'bash',
         '-c',
         'cd /home/worker && '
         './workspace/build/src/taskcluster/scripts/misc/{}'.format(
             run['script'])
     ]