Bug 1288610 - Add functions for creating deterministic tar archives; r?glandium draft
author Gregory Szorc <gps@mozilla.com>
Mon, 25 Jul 2016 12:46:07 -0700
changeset 392514 ccb7bd86bf96562286c84acf9ff2ed87ca03318a
parent 392445 251fccc1f62bf0eac569ef4f6717fea61ebadb27
child 392515 86bceb55502aa5b22627a667b42e161dff5b9956
push id 24048
push user bmo:gps@mozilla.com
push date Mon, 25 Jul 2016 19:47:02 +0000
reviewers glandium
bugs 1288610
milestone 50.0a1
Bug 1288610 - Add functions for creating deterministic tar archives; r?glandium I have a need to create tar archives deterministically and reproducibly. Since we already have similar functionality in mozpack for producing zip/jar archives, I figured it made sense for this functionality to live in mozpack. I made the functionality as simple as possible: we only accept files from the filesystem and the set of files must be known in advance. No class to hold/buffer state: just a simple function that takes a mapping of files and writes to a stream. MozReview-Commit-ID: If0NTcA7wpc
python/mozbuild/mozpack/archive.py
python/mozbuild/mozpack/test/test_archive.py
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozpack/archive.py
@@ -0,0 +1,107 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import
+
+import bz2
+import gzip
+import stat
+import tarfile
+
+
+# Fixed timestamp written as the mtime of every tar member and into the
+# gzip header, so archive bytes do not depend on when inputs were modified.
+# 2016-01-01T00:00:00+0000
+DEFAULT_MTIME = 1451606400
+
+
+def create_tar_from_files(fp, files):
+    """Create a tar file deterministically.
+
+    ``files`` is a dict mapping names of files in the archive to local
+    filesystem paths. Members are written in sorted order of archive name,
+    so the iteration order of the dict does not influence the output.
+
+    ``fp`` is a file handle opened for writing in binary mode. The archive
+    is written to it.
+
+    Only regular files can be written. Symlinks are followed
+    (``dereference=True``), so a link to a regular file is archived as the
+    file it points to.
+
+    Raises ``ValueError`` if a path is not a regular file or if it has the
+    setuid or setgid bit set.
+
+    FUTURE accept mozpack.files classes for writing
+    FUTURE accept a filename argument (or create APIs to write files)
+    """
+    # Pin the archive format explicitly. tarfile.DEFAULT_FORMAT is not the
+    # same on every Python version (GNU on older releases, pax on newer
+    # ones) and the header bytes differ between formats, which would make
+    # the output depend on the interpreter producing it.
+    with tarfile.open(name='', mode='w', fileobj=fp, dereference=True,
+                      format=tarfile.GNU_FORMAT) as tf:
+        for archive_path, fs_path in sorted(files.items()):
+            ti = tf.gettarinfo(fs_path, archive_path)
+
+            if not ti.isreg():
+                raise ValueError('not a regular file: %s' % fs_path)
+
+            # Disallow setuid and setgid bits. This is an arbitrary restriction.
+            # However, since we set uid/gid to root:root, setuid and setgid
+            # would be a glaring security hole if the archive were
+            # uncompressed as root.
+            if ti.mode & (stat.S_ISUID | stat.S_ISGID):
+                raise ValueError('cannot add file with setuid or setgid set: '
+                                 '%s' % fs_path)
+
+            # Set uid, gid, username, and group as deterministic values so
+            # the identity of the user creating the archive does not leak
+            # into the output.
+            ti.uid = 0
+            ti.gid = 0
+            ti.uname = ''
+            ti.gname = ''
+
+            # Set mtime to a constant value.
+            ti.mtime = DEFAULT_MTIME
+
+            with open(fs_path, 'rb') as fh:
+                tf.addfile(ti, fh)
+
+
+def create_tar_gz_from_files(fp, files, filename=None, compresslevel=9):
+    """Write a deterministic gzip-compressed tar archive of ``files``.
+
+    Thin wrapper over ``create_tar_from_files`` that pipes the tar stream
+    through gzip.
+
+    ``fp`` should be a file handle opened for writing in binary mode. Once
+    this function returns, all data has been written to the handle.
+
+    ``filename`` is the name recorded in the gzip header (empty when not
+    given) and ``compresslevel`` is passed through to gzip.
+    """
+    header_name = filename if filename else ''
+
+    # The gzip header carries an mtime field. Force it to a known value
+    # so the compressed bytes are reproducible.
+    with gzip.GzipFile(filename=header_name, mode='wb', fileobj=fp,
+                       compresslevel=compresslevel,
+                       mtime=DEFAULT_MTIME) as gz:
+        create_tar_from_files(gz, files)
+
+
+class _BZ2Proxy(object):
+    """File object that proxies writes to a bz2 compressor.
+
+    Data passed to ``write()`` is compressed incrementally and the
+    compressed bytes are forwarded to the wrapped handle ``fp``.
+    ``close()`` flushes the compressor; it does not close ``fp``.
+    """
+    def __init__(self, fp, compresslevel=9):
+        self.fp = fp
+        # Pass the level positionally: on Python 3 BZ2Compressor does not
+        # accept keyword arguments.
+        self.compressor = bz2.BZ2Compressor(compresslevel)
+        # Number of compressed bytes emitted to ``fp`` so far.
+        self.pos = 0
+
+    def tell(self):
+        """Return the number of compressed bytes written to ``fp``."""
+        return self.pos
+
+    def write(self, data):
+        """Compress ``data`` and forward any produced output to ``fp``."""
+        data = self.compressor.compress(data)
+        self.pos += len(data)
+        self.fp.write(data)
+
+    def close(self):
+        """Flush remaining compressed data to ``fp``.
+
+        Must be called exactly once after the last ``write()``.
+        """
+        data = self.compressor.flush()
+        self.pos += len(data)
+        self.fp.write(data)
+
+
+def create_tar_bz2_from_files(fp, files, compresslevel=9):
+    """Create a tar.bz2 file deterministically from files.
+
+    This is a glorified wrapper around ``create_tar_from_files`` that
+    adds bzip2 compression.
+
+    ``fp`` should be a file handle opened for writing in binary mode and
+    ``files`` is the same mapping accepted by ``create_tar_from_files``.
+
+    This function is similar to ``create_tar_gz_from_files()``.
+    """
+    proxy = _BZ2Proxy(fp, compresslevel=compresslevel)
+    create_tar_from_files(proxy, files)
+    # Flush whatever the compressor is still buffering; without this the
+    # bz2 stream would be truncated.
+    proxy.close()
new file mode 100644
--- /dev/null
+++ b/python/mozbuild/mozpack/test/test_archive.py
@@ -0,0 +1,190 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import
+
+import hashlib
+import os
+import shutil
+import stat
+import tarfile
+import tempfile
+import unittest
+
+from mozpack.archive import (
+    DEFAULT_MTIME,
+    create_tar_from_files,
+    create_tar_gz_from_files,
+    create_tar_bz2_from_files,
+)
+
+from mozunit import main
+
+
+# rw-r--r--: owner may read and write, group and others may only read.
+MODE_STANDARD = stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IROTH
+
+
+def file_hash(path):
+    """Return the hex SHA-1 digest of the contents of the file at ``path``.
+
+    The file is consumed in 8192-byte chunks rather than read at once.
+    """
+    digest = hashlib.sha1()
+    with open(path, 'rb') as stream:
+        # iter() with a sentinel stops at the first empty read, i.e. EOF.
+        for chunk in iter(lambda: stream.read(8192), b''):
+            digest.update(chunk)
+
+    return digest.hexdigest()
+
+
+class TestArchive(unittest.TestCase):
+    # Tests for the deterministic tar writers in mozpack.archive. Each test
+    # works in its own temporary directory, which is removed afterwards.
+
+    def _create_files(self, root):
+        # Create file0..file9 under ``root``, each containing its own name,
+        # and return a dict mapping archive name to filesystem path.
+        files = {}
+        for i in range(10):
+            # Use native strings for names and paths: joining bytes onto
+            # the str returned by tempfile.mkdtemp() fails on Python 3.
+            name = 'file%d' % i
+            p = os.path.join(root, name)
+            with open(p, 'wb') as fh:
+                fh.write(name.encode('ascii'))
+            # Need to set permissions or umask may influence testing.
+            os.chmod(p, MODE_STANDARD)
+            files[name] = p
+
+        return files
+
+    def _verify_basic_tarfile(self, tf):
+        # Check that ``tf`` holds exactly the members made by
+        # _create_files(), in sorted order, with normalized metadata.
+        self.assertEqual(len(tf.getmembers()), 10)
+
+        names = ['file%d' % i for i in range(10)]
+        self.assertEqual(tf.getnames(), names)
+
+        for ti in tf.getmembers():
+            self.assertEqual(ti.uid, 0)
+            self.assertEqual(ti.gid, 0)
+            self.assertEqual(ti.uname, '')
+            self.assertEqual(ti.gname, '')
+            self.assertEqual(ti.mode, MODE_STANDARD)
+            self.assertEqual(ti.mtime, DEFAULT_MTIME)
+
+    def test_dirs_refused(self):
+        # A directory is not a regular file and must be rejected.
+        d = tempfile.mkdtemp()
+        try:
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                with self.assertRaisesRegexp(ValueError, 'not a regular'):
+                    create_tar_from_files(fh, {'test': d})
+        finally:
+            shutil.rmtree(d)
+
+    def test_setuid_setgid_refused(self):
+        # Files carrying the setuid or setgid bit must be rejected.
+        d = tempfile.mkdtemp()
+        try:
+            uid = os.path.join(d, 'setuid')
+            gid = os.path.join(d, 'setgid')
+            with open(uid, 'a'):
+                pass
+            with open(gid, 'a'):
+                pass
+
+            os.chmod(uid, MODE_STANDARD | stat.S_ISUID)
+            os.chmod(gid, MODE_STANDARD | stat.S_ISGID)
+
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                with self.assertRaisesRegexp(ValueError, 'cannot add file with setuid'):
+                    create_tar_from_files(fh, {'test': uid})
+                with self.assertRaisesRegexp(ValueError, 'cannot add file with setuid'):
+                    create_tar_from_files(fh, {'test': gid})
+        finally:
+            shutil.rmtree(d)
+
+    def test_create_tar_basic(self):
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                create_tar_from_files(fh, files)
+
+            # Output should be deterministic.
+            self.assertEqual(file_hash(tp), 'cd16cee6f13391abd94dfa435d2633b61ed727f1')
+
+            with tarfile.open(tp, 'r') as tf:
+                self._verify_basic_tarfile(tf)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_executable_preserved(self):
+        # The owner-execute bit of the input must survive archiving.
+        d = tempfile.mkdtemp()
+        try:
+            p = os.path.join(d, 'exec')
+            with open(p, 'wb') as fh:
+                # The handle is binary, so write bytes rather than str.
+                fh.write(b'#!/bin/bash\n')
+            os.chmod(p, MODE_STANDARD | stat.S_IXUSR)
+
+            tp = os.path.join(d, 'test.tar')
+            with open(tp, 'wb') as fh:
+                create_tar_from_files(fh, {'exec': p})
+
+            self.assertEqual(file_hash(tp), '357e1b81c0b6cfdfa5d2d118d420025c3c76ee93')
+
+            with tarfile.open(tp, 'r') as tf:
+                m = tf.getmember('exec')
+                self.assertEqual(m.mode, MODE_STANDARD | stat.S_IXUSR)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_create_tar_gz_basic(self):
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            gp = os.path.join(d, 'test.tar.gz')
+            with open(gp, 'wb') as fh:
+                create_tar_gz_from_files(fh, files)
+
+            self.assertEqual(file_hash(gp), 'acb602239c1aeb625da5e69336775609516d60f5')
+
+            with tarfile.open(gp, 'r:gz') as tf:
+                self._verify_basic_tarfile(tf)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_tar_gz_name(self):
+        # A custom gzip header filename and compression level are accepted
+        # and still give a fixed hash.
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            gp = os.path.join(d, 'test.tar.gz')
+            with open(gp, 'wb') as fh:
+                create_tar_gz_from_files(fh, files, filename='foobar', compresslevel=1)
+
+            self.assertEqual(file_hash(gp), 'fd099f96480cc1100f37baa8e89a6b820dbbcbd3')
+
+            with tarfile.open(gp, 'r:gz') as tf:
+                self._verify_basic_tarfile(tf)
+
+        finally:
+            shutil.rmtree(d)
+
+    def test_create_tar_bz2_basic(self):
+        d = tempfile.mkdtemp()
+        try:
+            files = self._create_files(d)
+
+            bp = os.path.join(d, 'test.tar.bz2')
+            with open(bp, 'wb') as fh:
+                create_tar_bz2_from_files(fh, files)
+
+            self.assertEqual(file_hash(bp), '1827ad00dfe7acf857b7a1c95ce100361e3f6eea')
+
+            with tarfile.open(bp, 'r:bz2') as tf:
+                self._verify_basic_tarfile(tf)
+        finally:
+            shutil.rmtree(d)
+
+
+# Run the tests through mozunit's runner when executed as a script.
+if __name__ == '__main__':
+    main()