mozreview: encode diffs as base64 over JSON API (bug 1249004); r?smacleod draft
authorGregory Szorc <gps@mozilla.com>
Fri, 19 Feb 2016 18:22:43 -0800
changeset 7291 f59d229b0d7c0082b5ce5943cf7f2254b17cd63d
parent 7289 5e7da07ff6ee24e424fe86cd4180b3c870ef3404
push id650
push usergszorc@mozilla.com
push dateSat, 20 Feb 2016 02:24:12 +0000
reviewerssmacleod
bugs1249004
mozreview: encode diffs as base64 over JSON API (bug 1249004); r?smacleod Mercurial represents diffs as byte sequences (str in Python). Python's json encoder (like JSON) likes speaking in terms of Unicode. Under the hood, it is very aggressive about converting str to unicode and vice-versa, often applying a default encoding of UTF-8. Since diffs can contain arbitrary byte sequences that aren't valid UTF-8, the JSON encoding of a diff could fail. This was actually happening in the wild. It is possible to coerce Python+JSON into representing arbitrary byte sequences. But it requires a lot of hoop jumping on both ends (e.g. ensure_ascii=False, which the docs recommend against). It is much simpler to base64 encode byte sequences *before* they are JSON encoded. So that's what this commit does. The "diff" commit JSON key has been changed to "diff_b64" to reflect base64 encoded values. The Mercurial server is the only consumer of the batch submission API, so we don't have to worry about backwards compatibility. So we've dropped support for the "diff" key. While I was here, I noticed that the "parent_diff" key is no longer used, so I removed support for it. Tests demonstrating preservation of binary data have been added. Because YAML encodes binary data, a new mach debug command was added to dump the raw bytes from Review Board's stored diffs. This itself required hacks because mach will transparently UTF-8 encode sys.stdout. Oy. In the end, we show that the raw diff stored in Review Board contains the byte sequences captured by the filesystem/Mercurial. Encoding is hard. MozReview-Commit-ID: HDfbFy2RElf
hgext/reviewboard/hgrb/proto.py
hgext/reviewboard/tests/test-unicode.t
pylib/mozreview/mozreview/resources/batch_review_request.py
testing/vcttesting/reviewboard/mach_commands.py
--- a/hgext/reviewboard/hgrb/proto.py
+++ b/hgext/reviewboard/hgrb/proto.py
@@ -291,26 +291,34 @@ def _processpushreview(repo, req, ldap_u
         else:
             base_commit_id = base_ctx.hex()
 
         summary = encoding.fromlocal(ctx.description().splitlines()[0])
         commits['individual'].append({
             'id': node,
             'precursors': precursors.get(node, []),
             'message': encoding.fromlocal(ctx.description()),
-            'diff': diff,
+            # Diffs are arbitrary byte sequences. json.dump() will try to
+            # interpret str as UTF-8, which could fail. Instead of trying
+            # to coerce the str to a unicode or use ensure_ascii=False (which
+            # is a giant pain), just base64 encode the diff in the JSON.
+            'diff_b64': diff.encode('base64'),
             'bug': str(reviewid.bug),
             'base_commit_id': base_commit_id,
             'first_public_ancestor': first_public_ancestor,
             'reviewers': list(commitparser.parse_rquestion_reviewers(summary)),
             'requal_reviewers': list(commitparser.parse_requal_reviewers(summary))
         })
 
-    commits['squashed']['diff'] = ''.join(patch.diff(repo, node1=base_parent_node,
-        node2=repo[nodes[-1]].node(), opts=diffopts)) + '\n'
+    squashed_diff = b''.join(patch.diff(repo,
+                                        node1=base_parent_node,
+                                        node2=repo[nodes[-1]].node(),
+                                        opts=diffopts)) + '\n'
+
+    commits['squashed']['diff_b64'] = squashed_diff.encode('base64')
     commits['squashed']['base_commit_id'] = base_ctx.hex()
 
     rburl = repo.ui.config('reviewboard', 'url', None).rstrip('/')
     repoid = repo.ui.configint('reviewboard', 'repoid', None)
     privileged_rb_username = repo.ui.config('reviewboard', 'username', None)
     privileged_rb_password = repo.ui.config('reviewboard', 'password', None)
 
     if ldap_username:
--- a/hgext/reviewboard/tests/test-unicode.t
+++ b/hgext/reviewboard/tests/test-unicode.t
@@ -118,12 +118,174 @@ The globbing is patching over a bug in m
     component: TestComponent
     depends_on: []
     platform: All
     product: TestProduct
     resolution: ''
     status: UNCONFIRMED
     summary: First Bug
 
+Put some wonky byte sequences in the diff
+
+  $ bugzilla create-bug TestProduct TestComponent 'Bug 2'
+  >>> with open('foo', 'wb') as fh:
+  ...     fh.write(b'hello world \xff\xff\x7e\n')
+  $ hg commit -m 'Bug 2 - base'
+
+こんにちは世界 from above
+
+  >>> with open('foo', 'wb') as fh:
+  ...     fh.write(b'hello world \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c\n')
+  $ hg commit -m 'Bug 2 - tip'
+
+  $ hg --config bugzilla.username=author@example.com --config bugzilla.apikey=${authorkey} push -r 2::
+  pushing to ssh://$DOCKER_HOSTNAME:$HGPORT6/test-repo
+  searching for changes
+  remote: adding changesets
+  remote: adding manifests
+  remote: adding file changes
+  remote: added 2 changesets with 2 changes to 1 files
+  remote: recorded push in pushlog
+  submitting 2 changesets for review
+  
+  changeset:  2:78025579528e
+  summary:    Bug 2 - base
+  review:     http://$DOCKER_HOSTNAME:$HGPORT1/r/4 (draft)
+  
+  changeset:  3:6204fc917b21
+  summary:    Bug 2 - tip
+  review:     http://$DOCKER_HOSTNAME:$HGPORT1/r/5 (draft)
+  
+  review id:  bz://2/mynick
+  review url: http://$DOCKER_HOSTNAME:$HGPORT1/r/3 (draft)
+  (review requests lack reviewers; visit review url to assign reviewers)
+  (visit review url to publish these review requests so others can see them)
+
+  $ rbmanage dumpreview 4
+  id: 4
+  status: pending
+  public: false
+  bugs: []
+  commit: null
+  submitter: author+6
+  summary: ''
+  description: ''
+  target_people: []
+  extra_data: {}
+  commit_extra_data:
+    p2rb: true
+    p2rb.identifier: bz://2/mynick
+    p2rb.is_squashed: false
+  diffs: []
+  approved: false
+  approval_failure: The review request is not public.
+  draft:
+    bugs:
+    - '2'
+    commit: null
+    summary: Bug 2 - base
+    description:
+    - Bug 2 - base
+    - ''
+    - 'MozReview-Commit-ID: 5ijR9k'
+    target_people: []
+    extra: {}
+    commit_extra_data:
+      p2rb: true
+      p2rb.commit_id: 78025579528e119adf8ccc61727fccc1e23bda1c
+      p2rb.first_public_ancestor: 3a9f6899ef84c99841f546030b036d0124a863cf
+      p2rb.identifier: bz://2/mynick
+      p2rb.is_squashed: false
+    diffs:
+    - id: 4
+      revision: 1
+      base_commit_id: 86ab97a5dd61e8ec7ff3c23212db732e3531af01
+      name: diff
+      extra: {}
+      patch:
+      - diff --git a/foo b/foo
+      - '--- a/foo'
+      - +++ b/foo
+      - '@@ -1,1 +1,1 @@'
+      - -initial
+      - !!binary |
+        K2hlbGxvIHdvcmxkIP//fg==
+      - ''
+
+  $ rbmanage dumpreview 5
+  id: 5
+  status: pending
+  public: false
+  bugs: []
+  commit: null
+  submitter: author+6
+  summary: ''
+  description: ''
+  target_people: []
+  extra_data: {}
+  commit_extra_data:
+    p2rb: true
+    p2rb.identifier: bz://2/mynick
+    p2rb.is_squashed: false
+  diffs: []
+  approved: false
+  approval_failure: The review request is not public.
+  draft:
+    bugs:
+    - '2'
+    commit: null
+    summary: Bug 2 - tip
+    description:
+    - Bug 2 - tip
+    - ''
+    - 'MozReview-Commit-ID: APOgLo'
+    target_people: []
+    extra: {}
+    commit_extra_data:
+      p2rb: true
+      p2rb.commit_id: 6204fc917b213cf88051df32860d62ca91ae1422
+      p2rb.first_public_ancestor: 3a9f6899ef84c99841f546030b036d0124a863cf
+      p2rb.identifier: bz://2/mynick
+      p2rb.is_squashed: false
+    diffs:
+    - id: 5
+      revision: 1
+      base_commit_id: 78025579528e119adf8ccc61727fccc1e23bda1c
+      name: diff
+      extra: {}
+      patch:
+      - diff --git a/foo b/foo
+      - '--- a/foo'
+      - +++ b/foo
+      - '@@ -1,1 +1,1 @@'
+      - !!binary |
+        LWhlbGxvIHdvcmxkIP//fg==
+      - "+hello world \u3053\u3093\u306B\u3061\u306F\u4E16\u754C"
+      - ''
+
+The raw diff demonstrates the original bytes are preserved
+
+  $ rbmanage dump-raw-diff 4
+  ID: 4 (draft)
+  diff --git a/foo b/foo
+  --- a/foo
+  +++ b/foo
+  @@ -1,1 +1,1 @@
+  -initial
+  +hello world \xff\xff~ (esc)
+  
+  
+
+  $ rbmanage dump-raw-diff 5
+  ID: 5 (draft)
+  diff --git a/foo b/foo
+  --- a/foo
+  +++ b/foo
+  @@ -1,1 +1,1 @@
+  -hello world \xff\xff~ (esc)
+  +hello world \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c (esc)
+  
+  
+
 Cleanup
 
   $ mozreview stop
   stopped 9 containers
--- a/pylib/mozreview/mozreview/resources/batch_review_request.py
+++ b/pylib/mozreview/mozreview/resources/batch_review_request.py
@@ -96,28 +96,27 @@ class BatchReviewRequestResource(WebAPIR
     commit sets. This identifier is used to update review requests with a new
     set of diffs from a new push. Generally this identifier will represent
     some unit of work, such as a bug.
 
     The ``commits`` argument takes the following form::
 
         {
             'squashed': {
-                'diff': <squashed-diff-string>,
+                'diff_b64': <squashed-diff-string encoded as base64>,
                 'base_commit_id': <commit-id-to-apply-diff-to> (optional),
                 'first_public_ancestor': <commit of first public ancestor> (optional),
             },
             'individual': [
                 {
                     'id': <commit-id>,
                     'precursors': [<previous changeset>],
                     'message': <commit-message>,
-                    'diff': <diff>,
+                    'diff_b64': <diff encoded as base64>,
                     'bug': <bug-id>,
-                    'parent_diff': <diff-from-base-to-commit> (optional),
                     'base_commit_id': <commit-id-to-apply-diffs-to> (optional),
                     'first_public_ancestor': <commit of first public ancestor> (optional),
                     'reviewers': [<user1>, <user2>, ...] (optional),
                     'requal_reviewers': [<user1>, <user2>, ...] (optional),
                 },
                 {
                     ...
                 },
@@ -273,25 +272,25 @@ class BatchReviewRequestResource(WebAPIR
 
         for key in ('squashed', 'individual'):
             if key not in commits:
                 logger.error('commits field does not contain %s' % key)
                 return INVALID_FORM_DATA, {
                     'fields': {
                         'commits': ['Does not contain %s key' % key]}}
 
-        for key in ('base_commit_id', 'diff', 'first_public_ancestor'):
+        for key in ('base_commit_id', 'diff_b64', 'first_public_ancestor'):
             if key not in commits['squashed']:
                 logger.error('squashed key missing %s' % key)
                 return INVALID_FORM_DATA, {
                     'fields': {
                         'commits': ['Squashed commit does not contain %s key' % key]}}
 
         for commit in commits['individual']:
-            for key in ('id', 'message', 'bug', 'diff', 'precursors', 'first_public_ancestor'):
+            for key in ('id', 'message', 'bug', 'diff_b64', 'precursors', 'first_public_ancestor'):
                 if key not in commit:
                     logger.error('commit (%s) missing key %s' % (
                                  commit.get('id', '<id>') ,key))
                     return INVALID_FORM_DATA, {
                         'fields': {
                             'commits': ['Individual commit (%s) missing key %s' % (
                                 commit.get('id', '<id>'), key)]
                         }}
@@ -395,17 +394,19 @@ class BatchReviewRequestResource(WebAPIR
             # since it can be quite time consuming.
             # Calling create_from_data() instead of create_from_upload() skips
             # diff size validation. We allow unlimited diff sizes, so no biggie.
             logger.info('%s: generating squashed diffset for %d' % (
                         identifier, squashed_rr.id))
             diffset = DiffSet.objects.create_from_data(
                 repository=repo,
                 diff_file_name='diff',
-                diff_file_contents=commits['squashed']['diff'],
+                # The original value is a unicode instance. Python 3 can't
+                # .encode() a unicode instance, so go to str first.
+                diff_file_contents=commits['squashed']['diff_b64'].encode('ascii').decode('base64'),
                 parent_diff_file_name=None,
                 parent_diff_file_contents=None,
                 diffset_history=None,
                 basedir='',
                 request=request,
                 base_commit_id=commits['squashed'].get('base_commit_id'),
                 save=True,
                 )
@@ -940,19 +941,19 @@ def update_review_request(local_site, re
     for user in sorted(reviewer_users):
         draft.target_people.add(user)
         logger.debug('adding reviewer %s to #%d' % (user.username, rr.id))
 
     try:
         diffset = DiffSet.objects.create_from_data(
             repository=rr.repository,
             diff_file_name='diff',
-            diff_file_contents=commit['diff'],
+            diff_file_contents=commit['diff_b64'].encode('ascii').decode('base64'),
             parent_diff_file_name='diff',
-            parent_diff_file_contents=commit.get('parent_diff'),
+            parent_diff_file_contents=None,
             diffset_history=None,
             basedir='',
             request=request,
             base_commit_id=commit.get('base_commit_id'),
             save=True,
         )
         update_diffset_history(rr, diffset)
         diffset.save()
--- a/testing/vcttesting/reviewboard/mach_commands.py
+++ b/testing/vcttesting/reviewboard/mach_commands.py
@@ -228,16 +228,46 @@ class ReviewBoardCommands(object):
         description='Print a representation of a review request.')
     @CommandArgument('rrid', help='Review request id to dump')
     def dumpreview(self, rrid):
         client = self._get_client()
         root = client.get_root()
         r = root.get_review_request(review_request_id=rrid)
         print(serialize_review_requests(client, r))
 
+    @Command('dump-raw-diff', category='reviewboard',
+             description='Dump the raw content of a diff from the server')
+    @CommandArgument('rrid', help='Review request id of diffs to dump')
+    def dump_raw_diff(self, rrid):
+        from rbtools.api.errors import APIError
+
+        client = self._get_client()
+        root = client.get_root()
+        rr = root.get_review_request(review_request_id=rrid)
+
+        # mach wraps sys.stdout with a transparent UTF-8 writer. This
+        # will interfere with our printing of raw data. So bypass it.
+        stdout = os.fdopen(sys.stdout.fileno(), 'w')
+
+        for diff in rr.get_diffs():
+            stdout.write(b'ID: %d\n' % diff.id)
+            stdout.write(diff.get_patch().data)
+            stdout.write(b'\n')
+
+        try:
+            draft = rr.get_draft()
+            for diff in draft.get_draft_diffs():
+                stdout.write(b'ID: %d (draft)\n' % diff.id)
+                stdout.write(diff.get_patch().data)
+                stdout.write(b'\n')
+        except APIError:
+            pass
+
+        stdout.close()
+
     @Command('add-reviewer', category='reviewboard',
         description='Add a reviewer to a review request')
     @CommandArgument('rrid', help='Review request id to modify')
     @CommandArgument('--user', action='append',
         help='User from whom to ask for review')
     def add_reviewer(self, rrid, user):
         from rbtools.api.errors import APIError