Bug 1391789 - Add UID and GID to cache parameters; r?dustin
The UID and GID that a task executes under are dynamic. As a result,
caches need to be aware of the UID and GID that owns files otherwise
subsequent tasks could run into permission denied errors. This is
why `run-task --chown-recursive` exists. By recursively changing
ownership of persisted files, we ensure the current task is able
to read and write all existing files.
When you take a step back, you realize that chowning of cached
files is an expensive workaround. Yes, this results in cache hits.
But the cost is you potentially have to perform hundreds of thousands
of I/O system calls to mass chown. The ideal situation is that
UID/GID is consistent across tasks on any given cache and
potentially expensive permissions setting can be avoided. So, that's
what this commit does.
We add the task's UID and GID to run-task's requirements. When we
first see a cache, we record a UID and GID with it and chown the
empty cache directory to that UID and GID. Subsequent tasks using
this cache *must* use the same UID and GID or else run-task will
fail.
Since run-task now guarantees that all cache consumers use the same
UID and GID, we can avoid a potentially expensive recursive chown.
So we eliminate --chown-recursive for all caches. Ideally we'd
eliminate it everywhere. But some paths are only sometimes caches.
We'll likely need a primitive that declares "potentially cached"
paths so run-task can chown them if and only if they aren't caches.
This change will shine light on all tasks using inconsistent UID
and GID values on the same cache. Bustage is expected. Unfortunately,
we can't easily know what will break. So it will be one of those
things where we will have to fix problems as they arise. Fortunately,
because cache names incorporate the hash of run-task's content,
backing out this change will revert tasks to caches without the
UID and GID pinning requirements, and everything should work
again.
MozReview-Commit-ID: 2ka4rOnnXIp
--- a/taskcluster/docker/recipes/run-task
+++ b/taskcluster/docker/recipes/run-task
@@ -280,34 +280,44 @@ def main(args):
our_requirements = {
# Include a version string that we can bump whenever to trigger
# fresh caches. The actual value is not relevant and doesn't need
# to follow any explicit order. Since taskgraph bakes this file's
# hash into cache names, any change to this file/version is sufficient
# to force the use of a new cache.
b'version=1',
+ # Include the UID and GID the task will run as to ensure that tasks
+ # with different UID and GID don't share the same cache.
+ b'uid=%d' % uid,
+ b'gid=%d' % gid,
}
for cache in caches:
if not os.path.isdir(cache):
print('cache %s is not a directory; this should never happen')
return 1
requires_path = os.path.join(cache, '.cacherequires')
- # The cache is empty. No validation necessary. Just set up our
- # requirements file.
+ # The cache is empty. Configure it.
if not os.listdir(cache):
+ # We write a requirements file so future invocations know what the
+ # requirements are.
with open(requires_path, 'wb') as fh:
fh.write(b'\n'.join(sorted(our_requirements)))
# And make it read-only as a precaution against deletion.
os.chmod(requires_path, stat.S_IRUSR | stat.S_IRGRP | stat.S_IROTH)
+ # Since the uid/gid is baked into the requirements, set the cache as
+ # owned by this uid/gid, since it should be the only uid/gid to
+ # ever use it.
+ set_dir_permissions(cache, uid, gid)
+
# The cache has content and we have a requirements file. Validate
# requirements alignment.
elif os.path.exists(requires_path):
with open(requires_path, 'rb') as fh:
wanted_requirements = set(fh.read().splitlines())
missing = wanted_requirements - our_requirements
if missing:
--- a/taskcluster/taskgraph/transforms/job/hazard.py
+++ b/taskcluster/taskgraph/transforms/job/hazard.py
@@ -61,14 +61,13 @@ def docker_worker_hazard(config, job, ta
env['MOZCONFIG'] = run['mozconfig']
# build-haz-linux.sh needs this otherwise it assumes the checkout is in
# the workspace.
env['GECKO_DIR'] = '/home/worker/checkouts/gecko'
worker['command'] = [
'/home/worker/bin/run-task',
- '--chown-recursive', '/home/worker/tooltool-cache',
'--chown-recursive', '/home/worker/workspace',
'--vcs-checkout', '/home/worker/checkouts/gecko',
'--',
'/bin/bash', '-c', run['command']
]
--- a/taskcluster/taskgraph/transforms/job/mozharness.py
+++ b/taskcluster/taskgraph/transforms/job/mozharness.py
@@ -166,25 +166,23 @@ def mozharness_on_docker_worker_setup(co
worker['retry-exit-status'] = 4
docker_worker_setup_secrets(config, job, taskdesc)
command = [
'/home/worker/bin/run-task',
# Various caches/volumes are default owned by root:root.
'--chown-recursive', '/home/worker/workspace',
- '--chown-recursive', '/home/worker/tooltool-cache',
'--vcs-checkout', '/home/worker/workspace/build/src',
'--tools-checkout', '/home/worker/workspace/build/tools',
'--',
+ '/home/worker/workspace/build/src/{}'.format(
+ run.get('job-script', 'taskcluster/scripts/builder/build-linux.sh')
+ ),
]
- command.append("/home/worker/workspace/build/src/{}".format(
- run.get('job-script',
- "taskcluster/scripts/builder/build-linux.sh"
- )))
worker['command'] = command
@run_job_using("generic-worker", "mozharness", schema=mozharness_run_schema)
def mozharness_on_generic_worker(config, job, taskdesc):
assert job['worker']['os'] == 'windows', 'only supports windows right now'
--- a/taskcluster/taskgraph/transforms/job/spidermonkey.py
+++ b/taskcluster/taskgraph/transforms/job/spidermonkey.py
@@ -60,16 +60,16 @@ def docker_worker_spidermonkey(config, j
script = "build-sm.sh"
if run['using'] == 'spidermonkey-package':
script = "build-sm-package.sh"
elif run['using'] == 'spidermonkey-mozjs-crate':
script = "build-sm-mozjs-crate.sh"
worker['command'] = [
'/home/worker/bin/run-task',
+ # Not always a cache. So not consistently owned.
'--chown-recursive', '/home/worker/workspace',
- '--chown-recursive', '/home/worker/tooltool-cache',
'--vcs-checkout', '/home/worker/workspace/build/src',
'--',
'/bin/bash',
'-c',
'cd /home/worker && workspace/build/src/taskcluster/scripts/builder/%s' % script
]
--- a/taskcluster/taskgraph/transforms/job/toolchain.py
+++ b/taskcluster/taskgraph/transforms/job/toolchain.py
@@ -115,17 +115,16 @@ def docker_worker_toolchain(config, job,
if run['tooltool-downloads']:
internal = run['tooltool-downloads'] == 'internal'
docker_worker_add_tooltool(config, job, taskdesc, internal=internal)
worker['command'] = [
'/home/worker/bin/run-task',
# Various caches/volumes are default owned by root:root.
'--chown-recursive', '/home/worker/workspace',
- '--chown-recursive', '/home/worker/tooltool-cache',
'--vcs-checkout=/home/worker/workspace/build/src',
'--',
'bash',
'-c',
'cd /home/worker && '
'./workspace/build/src/taskcluster/scripts/misc/{}'.format(
run['script'])
]