Bug 1342974 - Add code to generate a Merkle tree summary of a release r=catlee,arroway
MozReview-Commit-ID: COav6N88SOt
new file mode 100644
--- /dev/null
+++ b/testing/mozharness/mozharness/mozilla/merkle.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+
+import struct
+
+def _round2(n):
+ k = 1
+ while k < n:
+ k <<= 1
+ return k >> 1
+
+def _leaf_hash(hash_fn, leaf):
+ return hash_fn(b'\x00' + leaf).digest()
+
+def _pair_hash(hash_fn, left, right):
+ return hash_fn(b'\x01' + left + right).digest()
+
class InclusionProof:
    """
    Represents a Merkle inclusion proof for purposes of serialization,
    deserialization, and verification of the proof. The format for inclusion
    proofs in RFC 6962-bis is as follows:

        opaque LogID<2..127>;
        opaque NodeHash<32..2^8-1>;

        struct {
            LogID log_id;
            uint64 tree_size;
            uint64 leaf_index;
            NodeHash inclusion_path<1..2^16-1>;
        } InclusionProofDataV2;

    In other words:
    - 1 + N octets of log_id (currently zero)
    - 8 octets of tree_size = self.n
    - 8 octets of leaf_index = m
    - 2 octets of path length, followed by
      * 1 + N octets of NodeHash per path element
    """

    # Pre-generated 'log ID'. Not used by Firefox; it is only needed because
    # there's a slot in the RFC 6962-bis format that requires a value at least
    # two bytes long (plus a length byte).
    LOG_ID = b'\x02\x00\x00'

    def __init__(self, tree_size, leaf_index, path_elements):
        # tree_size: number of leaves in the tree the proof was built from
        # leaf_index: index of the leaf this proof covers
        # path_elements: list of sibling node hashes, leaf-to-root order
        self.tree_size = tree_size
        self.leaf_index = leaf_index
        self.path_elements = path_elements

    @staticmethod
    def from_rfc6962_bis(serialized):
        """
        Parse a serialized InclusionProofDataV2 structure (see class docstring)
        and return an InclusionProof.  Raises Exception if the input is
        truncated or a path element overruns the declared path length.
        """
        start = 0
        read = 1
        if len(serialized) < start + read:
            raise Exception('Inclusion proof too short for log ID header')
        log_id_len, = struct.unpack('!B', serialized[start:start+read])
        start += read
        start += log_id_len  # Ignore the log ID itself

        read = 8 + 8 + 2
        if len(serialized) < start + read:
            raise Exception('Inclusion proof too short for middle section')
        tree_size, leaf_index, path_len = struct.unpack('!QQH', serialized[start:start+read])
        start += read

        path_elements = []
        # Absolute offset at which the declared inclusion path ends.
        end = 1 + log_id_len + 8 + 8 + 2 + path_len
        while start < end:
            read = 1
            if len(serialized) < start + read:
                raise Exception('Inclusion proof too short for element length')
            elem_len, = struct.unpack('!B', serialized[start:start+read])
            start += read

            read = elem_len
            if len(serialized) < start + read:
                raise Exception('Inclusion proof too short for element')
            if end < start + read:
                raise Exception('Inclusion proof element exceeds declared length')
            path_elements.append(serialized[start:start+read])
            start += read

        return InclusionProof(tree_size, leaf_index, path_elements)

    def to_rfc6962_bis(self):
        """Serialize this proof as an RFC 6962-bis InclusionProofDataV2 blob."""
        inclusion_path = b''
        for step in self.path_elements:
            # Each element is a one-byte length followed by the hash itself.
            step_len = struct.pack('B', len(step))
            inclusion_path += step_len + step

        middle = struct.pack('!QQH', self.tree_size, self.leaf_index, len(inclusion_path))
        return self.LOG_ID + middle + inclusion_path

    def _expected_head(self, hash_fn, leaf, leaf_index, tree_size):
        """Recompute the tree head implied by this proof for the given leaf."""
        node = _leaf_hash(hash_fn, leaf)

        # Compute indicators of which direction the pair hashes should be done.
        # Derived from the PATH logic in draft-ietf-trans-rfc6962-bis
        lr = []
        while tree_size > 1:
            k = _round2(tree_size)
            left = leaf_index < k
            lr = [left] + lr

            if left:
                tree_size = k
            else:
                tree_size = tree_size - k
                leaf_index = leaf_index - k

        # A malformed proof with the wrong number of elements fails here
        # rather than producing a bogus head.
        assert(len(lr) == len(self.path_elements))
        for i, elem in enumerate(self.path_elements):
            if lr[i]:
                node = _pair_hash(hash_fn, node, elem)
            else:
                node = _pair_hash(hash_fn, elem, node)

        return node

    def verify(self, hash_fn, leaf, leaf_index, tree_size, tree_head):
        """Return True iff this proof connects leaf at leaf_index to tree_head."""
        return self._expected_head(hash_fn, leaf, leaf_index, tree_size) == tree_head
+
class MerkleTree:
    """
    Implements a Merkle tree on a set of data items following the
    structure defined in RFC 6962-bis. This allows us to create a
    single hash value that summarizes the data (the 'head'), and an
    'inclusion proof' for each element that connects it to the head.

    https://tools.ietf.org/html/draft-ietf-trans-rfc6962-bis-24
    """

    def __init__(self, hash_fn, data):
        """
        hash_fn: a hashlib-style hash constructor, e.g. hashlib.sha256
        data: a sequence of byte strings to summarize
        """
        self.n = len(data)
        self.hash_fn = hash_fn

        # We cache intermediate node values in a dictionary keyed by
        # (start, end) tuples, where nodes[m, n] holds the hash of the
        # subtree covering data[m:n]. This corresponds to the 'D[m:n]'
        # notation in RFC 6962-bis. In particular, the leaves are stored in
        # nodes[i, i+1] and the head is nodes[0, n].
        self.nodes = {}
        for i in range(self.n):
            self.nodes[i, i+1] = _leaf_hash(self.hash_fn, data[i])

    def _node(self, start, end):
        """Return (computing and caching as needed) the node over data[start:end]."""
        if (start, end) in self.nodes:
            return self.nodes[start, end]

        # Split at the largest power of two below the range size, per the
        # RFC 6962-bis tree structure, then hash the two subtree heads.
        k = _round2(end - start)
        left = self._node(start, start + k)
        right = self._node(start + k, end)
        node = _pair_hash(self.hash_fn, left, right)

        self.nodes[start, end] = node
        return node

    def head(self):
        """Return the Merkle tree head over all of the data."""
        if self.n == 0:
            # Without this guard, _node(0, 0) would recurse forever, since
            # _round2(0) == 0 splits the empty range into itself.
            raise ValueError('Cannot compute the head of an empty tree')
        return self._node(0, self.n)

    def _relative_proof(self, target, start, end):
        """Return the inclusion path for data[target] within data[start:end]."""
        n = end - start
        k = _round2(n)

        if n == 1:
            return []
        elif target - start < k:
            # Target is in the left subtree; the right subtree head joins the path.
            return self._relative_proof(target, start, start + k) + [self._node(start + k, end)]
        else:
            # Target is in the right subtree; the left subtree head joins the path.
            return self._relative_proof(target, start + k, end) + [self._node(start, start + k)]

    def inclusion_proof(self, leaf_index):
        """Return an InclusionProof connecting data[leaf_index] to the head."""
        path_elements = self._relative_proof(leaf_index, 0, self.n)
        return InclusionProof(self.n, leaf_index, path_elements)
--- a/testing/mozharness/scripts/release/generate-checksums.py
+++ b/testing/mozharness/scripts/release/generate-checksums.py
@@ -1,23 +1,25 @@
from multiprocessing.pool import ThreadPool
import os
from os import path
import re
import sys
import posixpath
+import hashlib
sys.path.insert(1, os.path.dirname(os.path.dirname(sys.path[0])))
from mozharness.base.python import VirtualenvMixin, virtualenv_config_options
from mozharness.base.script import BaseScript
from mozharness.base.vcs.vcsbase import VCSMixin
from mozharness.mozilla.checksums import parse_checksums_file
from mozharness.mozilla.signing import SigningMixin
from mozharness.mozilla.buildbot import BuildbotMixin
+from mozharness.mozilla.merkle import MerkleTree
class ChecksumsGenerator(BaseScript, VirtualenvMixin, SigningMixin, VCSMixin, BuildbotMixin):
config_options = [
[["--stage-product"], {
"dest": "stage_product",
"help": "Name of product used in file server's directory structure, eg: firefox, mobile",
}],
[["--version"], {
@@ -74,24 +76,26 @@ class ChecksumsGenerator(BaseScript, Vir
],
"virtualenv_path": "venv",
'buildbot_json_path': 'buildprops.json',
},
all_actions=[
"create-virtualenv",
"collect-individual-checksums",
"create-big-checksums",
+ "create-summary",
"sign",
"upload",
"copy-info-files",
],
default_actions=[
"create-virtualenv",
"collect-individual-checksums",
"create-big-checksums",
+ "create-summary",
"sign",
"upload",
],
)
self.checksums = {}
self.bucket = None
self.bucket_name = self._get_bucket_name()
@@ -147,16 +151,25 @@ class ChecksumsGenerator(BaseScript, Vir
def _get_file_prefix(self):
return "pub/{}/candidates/{}-candidates/build{}/".format(
self.config["stage_product"], self.config["version"], self.config["build_number"]
)
def _get_sums_filename(self, format_):
return "{}SUMS".format(format_.upper())
+ def _get_summary_filename(self, format_):
+ return "{}SUMMARY".format(format_.upper())
+
+ def _get_hash_function(self, format_):
+ if format_ in ("sha256", "sha384", "sha512"):
+ return getattr(hashlib, format_)
+ else:
+ self.fatal("Unsupported format {}".format(format_))
+
def _get_bucket(self):
if not self.bucket:
self.activate_virtualenv()
from boto.s3.connection import S3Connection
self.info("Connecting to S3")
conn = S3Connection()
self.debug("Successfully connected to S3")
@@ -211,16 +224,40 @@ class ChecksumsGenerator(BaseScript, Vir
if not set(self.config["formats"]) <= set(info["hashes"]):
self.fatal("Missing necessary format for file {}".format(f))
self.debug("Adding checksums for file: {}".format(f))
self.checksums[f] = info
break
else:
self.debug("Ignoring checksums for file: {}".format(f))
def create_summary(self):
    """
    This step computes a Merkle tree over the checksums for each format
    and writes a file containing the head of the tree and inclusion proofs
    for each file.
    """
    for fmt in self.config["formats"]:
        hash_fn = self._get_hash_function(fmt)
        # Sort the filenames so the tree (and thus the head and every
        # proof) is deterministic for a given set of checksums.
        files = sorted(self.checksums)
        data = [self.checksums[fn]["hashes"][fmt] for fn in files]

        tree = MerkleTree(hash_fn, data)
        # NOTE(review): str.encode("hex") is Python-2-only, matching the
        # rest of this script; use binascii.hexlify if porting to Python 3.
        head = tree.head().encode("hex")
        proofs = [tree.inclusion_proof(i).to_rfc6962_bis().encode("hex")
                  for i in range(len(files))]

        summary = self._get_summary_filename(fmt)
        self.info("Creating summary file: {}".format(summary))

        # First line is the tree head; each following line pairs a proof
        # with the file it covers.
        content = "{} TREE_HEAD\n".format(head)
        for proof, fn in zip(proofs, files):
            content += "{} {}\n".format(proof, fn)

        self.write_to_file(summary, content)
+
def create_big_checksums(self):
    """Write one <FORMAT>SUMS file per configured format listing every checksum."""
    for fmt in self.config["formats"]:
        sums = self._get_sums_filename(fmt)
        self.info("Creating big checksums file: {}".format(sums))
        lines = [
            "{} {}\n".format(self.checksums[fn]["hashes"][fmt], fn)
            for fn in sorted(self.checksums)
        ]
        with open(sums, "w+") as output_file:
            output_file.write("".join(lines))
new file mode 100644
--- /dev/null
+++ b/testing/mozharness/test/test_mozilla_merkle.py
@@ -0,0 +1,109 @@
+import unittest
+import hashlib
+import random
+from mozharness.mozilla.merkle import MerkleTree, InclusionProof
+
# Pre-computed tree on 7 inputs
#
#        ______F_____
#       /            \
#     __D__          _E_
#    /     \        /   \
#   A       B      C     |
#  / \     / \    / \    |
# 0   1   2   3  4   5   6
hash_fn = hashlib.sha256

# Leaf input values.  NOTE: str.decode('hex') is Python-2-only, matching the
# code under test; these would be bytes.fromhex(...) on Python 3.
data = [
    'fbc459361fc111024c6d1fd83d23a9ff'.decode('hex'),
    'ae3a44925afec860451cd8658b3cadde'.decode('hex'),
    '418903fe6ef29fc8cab93d778a7b018b'.decode('hex'),
    '3d1c53c00b2e137af8c4c23a06388c6b'.decode('hex'),
    'e656ebd8e2758bc72599e5896be357be'.decode('hex'),
    '81aae91cf90be172eedd1c75c349bf9e'.decode('hex'),
    '00c262edf8b0bc345aca769e8733e25e'.decode('hex'),
]

# Expected leaf hashes: sha256(0x00 || data[i]).
leaves = [
    '5cb551f87797381a24a5359a986e2cef25b1f2113b387197fe48e8babc9ad5c7'.decode('hex'),
    '9899dc0be00306bda2a8e69cec32525ca6244f132479bcf840d8c1bc8bdfbff2'.decode('hex'),
    'fdd27d0393e32637b474efb9b3efad29568c3ec9b091fdda40fd57ec9196f06d'.decode('hex'),
    'c87292a6c8528c2a0679b6c1eefb47e4dbac7840d23645d5b7cb47cf1a8d365f'.decode('hex'),
    '2ff3bdac9bec3580b82da8a357746f15919414d9cbe517e2dd96910c9814c30c'.decode('hex'),
    '883e318240eccc0e2effafebdb0fd4fd26d0996da1b01439566cb9babef8725f'.decode('hex'),
    'bb13dfb7b202a95f241ea1715c8549dc048d9936ec747028002f7c795de72fcf'.decode('hex'),
]

# Expected interior-node hashes for the tree diagram above.
nodeA = '06447a7baa079cb0b4b6119d0f575bec508915403fdc30923eba982b63759805'.decode('hex')
nodeB = '3db98027c655ead4fe897bef3a4b361839a337941a9e624b475580c9d4e882ee'.decode('hex')
nodeC = '17524f8b0169b2745c67846925d55449ae80a8022ef8189dcf4cbb0ec7fcc470'.decode('hex')
nodeD = '380d0dc6fd7d4f37859a12dbfc7171b3cce29ab0688c6cffd2b15f3e0b21af49'.decode('hex')
nodeE = '3a9c2886a5179a6e1948876034f99d52a8f393f47a09887adee6d1b4a5c5fbd6'.decode('hex')
nodeF = 'd1a0d3947db4ae8305f2ac32985957e02659b2ea3c10da52a48d2526e9af3bbc'.decode('hex')

# Expected inclusion path (leaf-to-root sibling hashes) for each leaf index.
proofs = [
    [leaves[1], nodeB, nodeE],
    [leaves[0], nodeB, nodeE],
    [leaves[3], nodeA, nodeE],
    [leaves[2], nodeA, nodeE],
    [leaves[5], leaves[6], nodeD],
    [leaves[4], leaves[6], nodeD],
    [nodeC, nodeD],
]

# Known-answer serialization of the proof for leaf 5: log ID, tree_size=7,
# leaf_index=5, path length 0x63 = 3 * (1 + 32) octets of path elements.
known_proof5 = ('020000' + \
                '0000000000000007' + '0000000000000005' + \
                '0063' + \
                '20' + leaves[4].encode('hex') + \
                '20' + leaves[6].encode('hex') + \
                '20' + nodeD.encode('hex')).decode('hex')
+
+
class TestMerkleTree(unittest.TestCase):
    """Exercises MerkleTree and InclusionProof against the vectors above."""

    def testPreComputed(self):
        """Head and every inclusion proof must match the known vectors."""
        tree = MerkleTree(hash_fn, data)
        head = tree.head()
        # assertEqual: assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(head, nodeF)

        for i in range(len(data)):
            proof = tree.inclusion_proof(i)

            self.assertTrue(proof.verify(hash_fn, data[i], i, len(data), head))
            self.assertEqual(proof.leaf_index, i)
            self.assertEqual(proof.tree_size, tree.n)
            self.assertEqual(proof.path_elements, proofs[i])

    def testInclusionProofEncodeDecode(self):
        """Serialization must round-trip and match the known-answer blob."""
        tree = MerkleTree(hash_fn, data)

        # Inclusion proof encode/decode round trip test
        proof5 = tree.inclusion_proof(5)
        serialized5 = proof5.to_rfc6962_bis()
        deserialized5 = InclusionProof.from_rfc6962_bis(serialized5)
        reserialized5 = deserialized5.to_rfc6962_bis()
        self.assertEqual(serialized5, reserialized5)

        # Inclusion proof encode known answer test
        self.assertEqual(serialized5, known_proof5)

        # Inclusion proof decode known answer test
        known_deserialized5 = InclusionProof.from_rfc6962_bis(known_proof5)
        self.assertEqual(proof5.leaf_index, known_deserialized5.leaf_index)
        self.assertEqual(proof5.tree_size, known_deserialized5.tree_size)
        self.assertEqual(proof5.path_elements, known_deserialized5.path_elements)

    def testLargeTree(self):
        """Every proof in a large random tree must verify against its head."""
        TEST_SIZE = 5000
        ELEM_SIZE_BYTES = 16
        # range (not the Python-2-only xrange) behaves identically here and
        # also works on Python 3.
        data = [bytearray(random.getrandbits(8) for _ in range(ELEM_SIZE_BYTES))
                for _ in range(TEST_SIZE)]
        tree = MerkleTree(hash_fn, data)
        head = tree.head()

        for i in range(len(data)):
            proof = tree.inclusion_proof(i)

            self.assertTrue(proof.verify(hash_fn, data[i], i, len(data), head))
            self.assertEqual(proof.leaf_index, i)
            self.assertEqual(proof.tree_size, tree.n)