docker: use user-defined network for mozreview cluster (bug 1290783); r?glob draft
author Gregory Szorc <gps@mozilla.com>
Mon, 29 Aug 2016 10:48:05 -0700
changeset 9368 2470c0a45fc9c50f537067f7219944ca1dcb7e89
parent 9367 19ff277942e9f08f769593a2f5efa59f180eb6dd
child 9370 6f4202294d30fac30d9eb677d3fd4af916118f8d
push id 1168
push user bmo:gps@mozilla.com
push date Mon, 29 Aug 2016 18:06:42 +0000
reviewers glob
bugs 1290783
docker: use user-defined network for mozreview cluster (bug 1290783); r?glob Docker 1.12 requires things like links to be specified at container create time instead of start time. If we were to retain using links, we would have to drastically overhaul the container startup sequence. And this would undo a lot of optimizations around our startup sequence, which currently creates containers as early as possible to cut down on start time. Fortunately, there's a better way. Newer versions of Docker support "user-defined networks." These are essentially named, isolated networks. When using user-defined networks, containers that join networks can specify "aliases." These magically get turned into resolvable hostnames inside all containers in that network. This is a really cool feature because it means hostnames and port numbers can be static within the network. Before, you would have to sniff environment variables at container start time to resolve IPs/hostnames and ports. With constant aliases in user-defined networks, you can hardcode both the hostname and the port. This commit switches the MozReview cluster to use user-defined networks. The network name is randomly generated. Containers in the network have aliases giving each a sane hostname. A lot of code around configuring containers at startup for dynamic hostnames and ports has been removed since these are now static properties. There is certainly follow-up work to remove more code around container startup that was needed for the dynamic environment. But I'll defer that to another day. I've tested this commit on Docker 1.11 and 1.12 and it appears to "just work" on both. I would have liked to split the changes to docker.py into a smaller commit. But Docker 1.11 didn't like moving host config to container create time without also changing the networking config. MozReview-Commit-ID: 18dH4CSPj5n
ansible/roles/docker-hg-reviewboard/defaults/main.yml
ansible/roles/docker-hg-reviewboard/files/entrypoint.py
ansible/roles/docker-hg-reviewboard/files/supervisor-docker.conf
ansible/roles/docker-hg-reviewboard/tasks/main.yml
ansible/roles/docker-rbweb/files/entrypoint.py
ansible/roles/openssh-lpk/tasks/main.yml
testing/vcttesting/docker.py
--- a/ansible/roles/docker-hg-reviewboard/defaults/main.yml
+++ b/ansible/roles/docker-hg-reviewboard/defaults/main.yml
@@ -1,5 +1,5 @@
 ---
 domain: localhost
-ldap_uri: ldap://localhost/
+ldap_uri: ldap://ldap:389/
 bind_dn: cn=admin,dc=mozilla
 bind_pw: password
deleted file mode 100755
--- a/ansible/roles/docker-hg-reviewboard/files/entrypoint.py
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/python -u
-# This Source Code Form is subject to the terms of the Mozilla Public
-# License, v. 2.0. If a copy of the MPL was not distributed with this
-# file, You can obtain one at http://mozilla.org/MPL/2.0/.
-
-import os
-import subprocess
-import sys
-
-if 'LDAP_PORT_389_TCP_ADDR' not in os.environ:
-    print('error: contained invoked without link to an ldap contaer')
-    sys.exit(1)
-
-ldap_url = 'ldap://%s:%s/' % (os.environ['LDAP_PORT_389_TCP_ADDR'],
-                              os.environ['LDAP_PORT_389_TCP_PORT'])
-
-os.environ['DOCKER_ENTRYPOINT'] = '1'
-
-subprocess.check_call([
-    '/usr/bin/python', '-u',
-    '/usr/bin/ansible-playbook', 'docker-hgrb.yml',
-        '-c', 'local',
-        '-t', 'docker-startup',
-        '-e', 'ldap_uri=%s' % ldap_url,
-    ],
-    cwd='/vct/ansible')
-
-del os.environ['DOCKER_ENTRYPOINT']
-
-os.execl(sys.argv[1], *sys.argv[1:])
--- a/ansible/roles/docker-hg-reviewboard/files/supervisor-docker.conf
+++ b/ansible/roles/docker-hg-reviewboard/files/supervisor-docker.conf
@@ -1,12 +1,18 @@
 [program:rsyslog]
 command = /usr/sbin/rsyslogd -n
 autorestart = true
 redirect_stderr = true
 
+# We need to run nslcd or system integration with LDAP doesn't work.
+[program:nslcd]
+command = /usr/sbin/nslcd -d
+autorestart = true
+redirect_stderr = true
+
 [program:sshd]
 command = /usr/sbin/sshd -D
 autorestart = true
 
 [program:httpd]
 command = /usr/sbin/httpd -DFOREGROUND
 autorestart = true
--- a/ansible/roles/docker-hg-reviewboard/tasks/main.yml
+++ b/ansible/roles/docker-hg-reviewboard/tasks/main.yml
@@ -5,17 +5,16 @@
         owner=root
         group=root
         mode=0644
 
 - name: install Docker support scripts
   copy: src={{ item }} dest=/{{ item }} mode=0755
   with_items:
     - create-repo
-    - entrypoint.py
     - kill-wsgi-procs
     - refresh
     - set-strip-users
     - set-urls
 
 - name: Install psutil
   pip: name=psutil
 
@@ -26,9 +25,8 @@
     - { section: reviewboard, option: password, value: mrpassword }
 
 - name: add LDAP settings file
   template: src=ldap.json.j2
             dest=/etc/mercurial/ldap.json
             owner=root
             group=root
             mode=0644
-  tags: docker-startup
--- a/ansible/roles/docker-rbweb/files/entrypoint.py
+++ b/ansible/roles/docker-rbweb/files/entrypoint.py
@@ -3,40 +3,23 @@
 # License, v. 2.0. If a copy of the MPL was not distributed with this
 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
 import json
 import os
 import sys
 
 
-if 'BMOWEB_PORT_80_TCP_ADDR' not in os.environ:
-    print('error: container invoked without link to a bmoweb container')
-    sys.exit(1)
-
-if 'PULSE_PORT_5672_TCP_ADDR' not in os.environ:
-    print('error: container invoked without link to a pulse container')
-    sys.exit(1)
-
-if 'AUTOLAND_PORT_80_TCP_ADDR' not in os.environ:
-    print('error: container invoked without link to an autoland container')
-    sys.exit(1)
-
 execfile('/venv/bin/activate_this.py', dict(__file__='/venv/bin/activate_this.py'))
 sys.path.insert(0, '/reviewboard/conf')
 os.environ['DJANGO_SETTINGS_MODULE'] = 'reviewboard.settings'
 
-bugzilla_url = 'http://%s:%s' % (os.environ['BMOWEB_PORT_80_TCP_ADDR'],
-                                 os.environ['BMOWEB_PORT_80_TCP_PORT'])
-
-autoland_url = 'http://%s:%s' % (os.environ['AUTOLAND_PORT_80_TCP_ADDR'],
-                                 os.environ['AUTOLAND_PORT_80_TCP_PORT'])
-
-ldap_url = 'ldap://%s:%s' % (os.environ['LDAP_PORT_389_TCP_ADDR'],
-                             os.environ['LDAP_PORT_389_TCP_PORT'])
+bugzilla_url = 'http://bmoweb'
+autoland_url = 'http://autoland'
+ldap_url = 'ldap://ldap:389'
 
 # siteconfig takes priority over settings_local.py. Ensure siteconfig
 # is up to date.
 #
 # This code mimics what is done in
 # reviewboard.admin.management.sites.migrate_settings(). Its existence is
 # unfortunate. If we could guarantee that settings_local.py never changes,
 # we wouldn't need this.
@@ -56,18 +39,18 @@ sc.set('logging_directory', '/reviewboar
 # reset this to Bugzilla's public IP, which is not available at this point.
 # It is later set via set-site-url by vcttesting.mozreview.MozReview.
 sc.set('auth_bz_xmlrpc_url', '%s/xmlrpc.cgi' % bugzilla_url)
 sc.save()
 
 # Define MozReview settings.
 settings = {}
 settings['enabled'] = True
-settings['pulse_host'] = os.environ['PULSE_PORT_5672_TCP_ADDR']
-settings['pulse_port'] = int(os.environ['PULSE_PORT_5672_TCP_PORT'])
+settings['pulse_host'] = 'pulse'
+settings['pulse_port'] = 5672
 settings['pulse_user'] = 'guest'
 settings['pulse_password'] = 'guest'
 settings['pulse_ssl'] = False
 settings['autoland_try_ui_enabled'] = True
 settings['autoland_url'] = autoland_url
 settings['autoland_user'] = 'autoland'
 settings['autoland_password'] = 'autoland'
 settings['autoland_testing'] = True
--- a/ansible/roles/openssh-lpk/tasks/main.yml
+++ b/ansible/roles/openssh-lpk/tasks/main.yml
@@ -1,15 +1,9 @@
 ---
-# This is to support executing in Docker.
-- name: Find LDAP URI
-  set_fact: ldap_uri=ldap://{{ ansible_env.LDAP_PORT_389_TCP_ADDR }}:{{ ansible_env.LDAP_PORT_389_TCP_PORT }}/
-  when: ansible_env.LDAP_PORT_389_TCP_ADDR is defined
-  tags: docker-startup
-
 # We don't use the yum module here because it is too slow.
 - name: Regular OpenSSH cannot be installed
   command: /usr/bin/yum remove -y openssh openssh-clients openssh-server
 
 - name: Install packages related to LDAP auth
   command: /usr/bin/yum install -y authconfig nss-pam-ldapd openldap-clients pam_ldap
 
 - name: create directory for LDAP certificates
@@ -39,14 +33,12 @@
     - { path: openssh-lpk-server-5.4p1-1.x86_64.rpm, sha256: 41f59067d9d41fe04f27c0702b14ed9bced00203cab1e9af2be6e3e7299ef4ee }
 
 - name: Install RPMs related to LDAP auth
   command: /usr/bin/yum localinstall -y /var/tmp/openssh-lpk-5.4p1-1.x86_64.rpm /var/tmp/openssh-lpk-clients-5.4p1-1.x86_64.rpm /var/tmp/openssh-lpk-server-5.4p1-1.x86_64.rpm
 
 - name: Configure system authentication settings
   template: src=nslcd.conf.j2 dest=/etc/nslcd.conf
   notify: run authconfig
-  tags: docker-startup
 
 - name: Configure sshd
   template: src=sshd_config.j2 dest=/etc/ssh/sshd_config
   notify: restart sshd
-  tags: docker-startup
--- a/testing/vcttesting/docker.py
+++ b/testing/vcttesting/docker.py
@@ -1011,78 +1011,113 @@ class Docker(object):
             web_image = images['bmoweb']
             pulse_image = images['pulse']
             rbweb_image = images['rbweb']
             hgweb_image = images['hgweb']
             treestatus_image = images['treestatus']
 
         containers = self.state['containers'].setdefault(cluster, [])
 
+        network_name = 'mozreview-%s' % uuid.uuid4()
+        self.client.create_network(network_name, driver='bridge')
+
         with limited_threadpoolexecutor(10, max_workers) as e:
             if start_pulse:
+                pulse_host_config = self.client.create_host_config(
+                    port_bindings={5672: pulse_port})
                 f_pulse_create = e.submit(
                     self.client.create_container,
                     pulse_image,
+                    host_config=pulse_host_config,
+                    networking_config=self.network_config(network_name, 'pulse'),
                     labels=['pulse'])
 
             bmo_url = 'http://%s:%s/' % (self.docker_hostname, http_port)
 
+            bmoweb_host_config = self.client.create_host_config(
+                port_bindings={80: http_port})
             f_web_create = e.submit(
                 self.client.create_container,
                 web_image,
                 environment={'BMO_URL': bmo_url},
+                host_config=bmoweb_host_config,
+                networking_config=self.network_config(network_name, 'bmoweb'),
                 labels=['bmoweb'])
 
             if start_rbweb:
+                rbweb_host_config = self.client.create_host_config(
+                    port_bindings={80: rbweb_port})
                 f_rbweb_create = e.submit(
                     self.client.create_container,
                     rbweb_image,
                     command=['/run'],
                     entrypoint=['/entrypoint.py'],
+                    host_config=rbweb_host_config,
+                    networking_config=self.network_config(network_name, 'rbweb'),
                     ports=[80],
                     labels=['rbweb'])
 
             if start_ldap:
+                ldap_host_config = self.client.create_host_config(
+                    port_bindings={389: ldap_port})
                 f_ldap_create = e.submit(
                     self.client.create_container,
                     ldap_image,
+                    host_config=ldap_host_config,
+                    networking_config=self.network_config(network_name, 'ldap'),
                     labels=['ldap'])
 
             if start_hgrb:
+                hgrb_host_config = self.client.create_host_config(
+                    port_bindings={22: ssh_port, 80: hg_port})
                 f_hgrb_create = e.submit(
                     self.client.create_container,
                     hgrb_image,
                     ports=[22, 80],
-                    entrypoint=['/entrypoint.py'],
-                    command=['/usr/bin/supervisord', '-n'])
+                    command=['/usr/bin/supervisord', '-n'],
+                    host_config=hgrb_host_config,
+                    networking_config=self.network_config(network_name, 'hgrb'))
 
             if start_hgweb:
+                hgweb_host_config = self.client.create_host_config(
+                    port_bindings={80: hgweb_port})
                 f_hgweb_create = e.submit(
                     self.client.create_container,
                     hgweb_image,
                     ports=[80],
                     entrypoint=['/entrypoint-solo'],
                     command=['/usr/bin/supervisord', '-n'],
+                    host_config=hgweb_host_config,
+                    networking_config=self.network_config(network_name, 'hgweb'),
                     labels=['hgweb'])
 
             if start_autoland:
                 f_autolanddb_create = e.submit(
                     self.client.create_container,
                     autolanddb_image,
-                    labels=['autolanddb'])
+                    labels=['autolanddb'],
+                    networking_config=self.network_config(network_name, 'autolanddb'))
 
+                autoland_host_config = self.client.create_host_config(
+                    port_bindings={80: autoland_port})
                 f_autoland_create = e.submit(
                     self.client.create_container,
                     autoland_image,
+                    host_config=autoland_host_config,
+                    networking_config=self.network_config(network_name, 'autoland'),
                     labels=['autolandweb'])
 
             if start_treestatus:
+                treestatus_host_config = self.client.create_host_config(
+                    port_bindings={80: treestatus_port})
                 f_treestatus_create = e.submit(
                     self.client.create_container,
                     treestatus_image,
+                    host_config=treestatus_host_config,
+                    networking_config=self.network_config(network_name, 'treestatus'),
                     labels=['treestatus'])
 
             if start_autoland:
                 autolanddb_id = f_autolanddb_create.result()['Id']
                 containers.append(autolanddb_id)
                 f_start_autolanddb = e.submit(
                     self.client.start,
                     autolanddb_id)
@@ -1090,26 +1125,24 @@ class Docker(object):
             # RabbitMQ takes a while to start up. Start it before other
             # containers. (We probably could have a callback-driven mechanism
             # here to ensure no time is lost. But that is more complex.)
             if start_pulse:
                 pulse_id = f_pulse_create.result()['Id']
                 containers.append(pulse_id)
                 f_start_pulse = e.submit(
                     self.client.start,
-                    pulse_id,
-                    port_bindings={5672: pulse_port})
+                    pulse_id)
 
             if start_ldap:
                 ldap_id = f_ldap_create.result()['Id']
                 containers.append(ldap_id)
                 f_start_ldap = e.submit(
                     self.client.start,
-                    ldap_id,
-                    port_bindings={389: ldap_port})
+                    ldap_id)
 
             web_id = f_web_create.result()['Id']
             containers.append(web_id)
 
             if start_autoland:
                 f_start_autolanddb.result()
                 autolanddb_state = self.client.inspect_container(autolanddb_id)
                 autoland_id = f_autoland_create.result()['Id']
@@ -1132,70 +1165,54 @@ class Docker(object):
                 treestatus_id = f_treestatus_create.result()['Id']
                 containers.append(treestatus_id)
 
             # At this point, all containers have been created.
             self.save_state()
 
             f_start_web = e.submit(
                 self.client.start,
-                web_id,
-                port_bindings={80: http_port})
+                web_id)
             f_start_web.result()
             web_state = self.client.inspect_container(web_id)
 
             if start_pulse:
                 f_start_pulse.result()
                 pulse_state = self.client.inspect_container(pulse_id)
 
             if start_ldap:
                 f_start_ldap.result()
                 ldap_state = self.client.inspect_container(ldap_id)
 
             # TODO: Use futures for hgrb, hgweb and treestatus
             if start_hgrb:
-                self.client.start(hgrb_id,
-                                  links=[(ldap_state['Name'], 'ldap')],
-                                  port_bindings={22: ssh_port, 80: hg_port})
+                self.client.start(hgrb_id)
                 hgrb_state = self.client.inspect_container(hgrb_id)
 
             if start_hgweb:
-                self.client.start(hgweb_id,
-                                  port_bindings={80: hgweb_port})
+                self.client.start(hgweb_id)
                 hgweb_state = self.client.inspect_container(hgweb_id)
 
             if start_treestatus:
-                self.client.start(treestatus_id,
-                                  port_bindings={80: treestatus_port})
+                self.client.start(treestatus_id)
                 treestatus_state = self.client.inspect_container(treestatus_id)
 
             if start_autoland:
                 assert start_hgrb
                 assert start_treestatus
                 f_start_autoland = e.submit(
                     self.client.start,
-                    autoland_id,
-                    links=[(autolanddb_state['Name'], 'autolanddb'),
-                           (web_state['Name'], 'bmoweb'),
-                           (hgrb_state['Name'], 'hgrb'),
-                           (treestatus_state['Name'], 'treestatus')],
-                    port_bindings={80: autoland_port})
+                    autoland_id)
                 f_start_autoland.result()
                 autoland_state = self.client.inspect_container(autoland_id)
 
             if start_rbweb:
                 assert start_autoland
                 self.client.start(
-                    rbweb_id,
-                    links=[(web_state['Name'], 'bmoweb'),
-                           (pulse_state['Name'], 'pulse'),
-                           (hgrb_state['Name'], 'hgrb'),
-                           (autoland_state['Name'], 'autoland'),
-                           (ldap_state['Name'], 'ldap')],
-                    port_bindings={80: rbweb_port})
+                    rbweb_id)
                 rbweb_state = self.client.inspect_container(rbweb_id)
 
         bmoweb_hostname, bmoweb_hostport = \
             self._get_host_hostname_port(web_state, '80/tcp')
         bmo_url = 'http://%s:%d/' % (bmoweb_hostname, bmoweb_hostport)
 
         if start_pulse:
             rabbit_hostname, rabbit_hostport = \
@@ -1323,21 +1340,17 @@ class Docker(object):
                         networks.add(network['NetworkID'])
 
                 count += 1
                 e.submit(self.client.remove_container, container, force=True,
                          v=True)
 
         # There should only be 1, so don't use a ThreadPoolExecutor.
         for network in networks:
-            # TODO remove this lookup and check once using user-defined networks
-            # everywhere.
-            n = self.client.inspect_network(network)
-            if n['Name'] != 'bridge':
-                self.client.remove_network(network)
+            self.client.remove_network(network)
 
         print('stopped %d containers' % count)
 
         try:
             del self.state['containers'][cluster]
             self.save_state()
         except KeyError:
             pass