mozreview: encode diffs as base64 over JSON API (
bug 1249004); r?smacleod
Mercurial represents diffs as byte sequences (str in Python).
Python's json encoder (like JSON) likes speaking in terms of Unicode.
Under the hood, it is very aggressive about converting str to unicode
and vice-versa, often applying a default encoding of UTF-8.
Since diffs can contain arbitrary byte sequences that aren't valid
UTF-8, the JSON encoding of a diff could fail. This was actually
happening in the wild.
It is possible to coerce Python+JSON into representing arbitrary byte
sequences. But it requires a lot of hoop jumping on both ends (e.g.
ensure_ascii=False, which the docs recommend against). It is much
simpler to base64 encode byte sequences *before* they are JSON
encoded. So that's what this commit does.
The "diff" commit JSON key has been changed to "diff_b64" to reflect
base64 encoded values. The Mercurial server is the only consumer of
the batch submission API, so we don't have to worry about backwards
compatibility. So we've dropped support for the "diff" key.
While I was here, I noticed that the "parent_diff" key is no longer
used, so I removed support for it.
Tests demonstrating preservation of binary data have been added.
Because YAML encodes binary data, a new mach debug command was added to
dump the raw bytes from Review Board's stored diffs. This itself
required hacks because mach will transparently UTF-8 encode sys.stdout.
Oy. In the end, we show that the raw diff stored in Review Board
contains the byte sequences captured by the filesystem/Mercurial.
Encoding is hard.
MozReview-Commit-ID: HDfbFy2RElf
--- a/hgext/reviewboard/hgrb/proto.py
+++ b/hgext/reviewboard/hgrb/proto.py
@@ -291,26 +291,34 @@ def _processpushreview(repo, req, ldap_u
else:
base_commit_id = base_ctx.hex()
summary = encoding.fromlocal(ctx.description().splitlines()[0])
commits['individual'].append({
'id': node,
'precursors': precursors.get(node, []),
'message': encoding.fromlocal(ctx.description()),
- 'diff': diff,
+ # Diffs are arbitrary byte sequences. json.dump() will try to
+ # interpret str as UTF-8, which could fail. Instead of trying
+ # to coerce the str to a unicode or use ensure_ascii=False (which
+ # is a giant pain), just base64 encode the diff in the JSON.
+ 'diff_b64': diff.encode('base64'),
'bug': str(reviewid.bug),
'base_commit_id': base_commit_id,
'first_public_ancestor': first_public_ancestor,
'reviewers': list(commitparser.parse_rquestion_reviewers(summary)),
'requal_reviewers': list(commitparser.parse_requal_reviewers(summary))
})
- commits['squashed']['diff'] = ''.join(patch.diff(repo, node1=base_parent_node,
- node2=repo[nodes[-1]].node(), opts=diffopts)) + '\n'
+ squashed_diff = b''.join(patch.diff(repo,
+ node1=base_parent_node,
+ node2=repo[nodes[-1]].node(),
+ opts=diffopts)) + '\n'
+
+ commits['squashed']['diff_b64'] = squashed_diff.encode('base64')
commits['squashed']['base_commit_id'] = base_ctx.hex()
rburl = repo.ui.config('reviewboard', 'url', None).rstrip('/')
repoid = repo.ui.configint('reviewboard', 'repoid', None)
privileged_rb_username = repo.ui.config('reviewboard', 'username', None)
privileged_rb_password = repo.ui.config('reviewboard', 'password', None)
if ldap_username:
--- a/hgext/reviewboard/tests/test-unicode.t
+++ b/hgext/reviewboard/tests/test-unicode.t
@@ -118,12 +118,174 @@ The globbing is patching over a bug in m
component: TestComponent
depends_on: []
platform: All
product: TestProduct
resolution: ''
status: UNCONFIRMED
summary: First Bug
+Put some wonky byte sequences in the diff
+
+ $ bugzilla create-bug TestProduct TestComponent 'Bug 2'
+ >>> with open('foo', 'wb') as fh:
+ ... fh.write(b'hello world \xff\xff\x7e\n')
+ $ hg commit -m 'Bug 2 - base'
+
+こんにちは世界 from above
+
+ >>> with open('foo', 'wb') as fh:
+ ... fh.write(b'hello world \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c\n')
+ $ hg commit -m 'Bug 2 - tip'
+
+ $ hg --config bugzilla.username=author@example.com --config bugzilla.apikey=${authorkey} push -r 2::
+ pushing to ssh://$DOCKER_HOSTNAME:$HGPORT6/test-repo
+ searching for changes
+ remote: adding changesets
+ remote: adding manifests
+ remote: adding file changes
+ remote: added 2 changesets with 2 changes to 1 files
+ remote: recorded push in pushlog
+ submitting 2 changesets for review
+
+ changeset: 2:78025579528e
+ summary: Bug 2 - base
+ review: http://$DOCKER_HOSTNAME:$HGPORT1/r/4 (draft)
+
+ changeset: 3:6204fc917b21
+ summary: Bug 2 - tip
+ review: http://$DOCKER_HOSTNAME:$HGPORT1/r/5 (draft)
+
+ review id: bz://2/mynick
+ review url: http://$DOCKER_HOSTNAME:$HGPORT1/r/3 (draft)
+ (review requests lack reviewers; visit review url to assign reviewers)
+ (visit review url to publish these review requests so others can see them)
+
+ $ rbmanage dumpreview 4
+ id: 4
+ status: pending
+ public: false
+ bugs: []
+ commit: null
+ submitter: author+6
+ summary: ''
+ description: ''
+ target_people: []
+ extra_data: {}
+ commit_extra_data:
+ p2rb: true
+ p2rb.identifier: bz://2/mynick
+ p2rb.is_squashed: false
+ diffs: []
+ approved: false
+ approval_failure: The review request is not public.
+ draft:
+ bugs:
+ - '2'
+ commit: null
+ summary: Bug 2 - base
+ description:
+ - Bug 2 - base
+ - ''
+ - 'MozReview-Commit-ID: 5ijR9k'
+ target_people: []
+ extra: {}
+ commit_extra_data:
+ p2rb: true
+ p2rb.commit_id: 78025579528e119adf8ccc61727fccc1e23bda1c
+ p2rb.first_public_ancestor: 3a9f6899ef84c99841f546030b036d0124a863cf
+ p2rb.identifier: bz://2/mynick
+ p2rb.is_squashed: false
+ diffs:
+ - id: 4
+ revision: 1
+ base_commit_id: 86ab97a5dd61e8ec7ff3c23212db732e3531af01
+ name: diff
+ extra: {}
+ patch:
+ - diff --git a/foo b/foo
+ - '--- a/foo'
+ - +++ b/foo
+ - '@@ -1,1 +1,1 @@'
+ - -initial
+ - !!binary |
+ K2hlbGxvIHdvcmxkIP//fg==
+ - ''
+
+ $ rbmanage dumpreview 5
+ id: 5
+ status: pending
+ public: false
+ bugs: []
+ commit: null
+ submitter: author+6
+ summary: ''
+ description: ''
+ target_people: []
+ extra_data: {}
+ commit_extra_data:
+ p2rb: true
+ p2rb.identifier: bz://2/mynick
+ p2rb.is_squashed: false
+ diffs: []
+ approved: false
+ approval_failure: The review request is not public.
+ draft:
+ bugs:
+ - '2'
+ commit: null
+ summary: Bug 2 - tip
+ description:
+ - Bug 2 - tip
+ - ''
+ - 'MozReview-Commit-ID: APOgLo'
+ target_people: []
+ extra: {}
+ commit_extra_data:
+ p2rb: true
+ p2rb.commit_id: 6204fc917b213cf88051df32860d62ca91ae1422
+ p2rb.first_public_ancestor: 3a9f6899ef84c99841f546030b036d0124a863cf
+ p2rb.identifier: bz://2/mynick
+ p2rb.is_squashed: false
+ diffs:
+ - id: 5
+ revision: 1
+ base_commit_id: 78025579528e119adf8ccc61727fccc1e23bda1c
+ name: diff
+ extra: {}
+ patch:
+ - diff --git a/foo b/foo
+ - '--- a/foo'
+ - +++ b/foo
+ - '@@ -1,1 +1,1 @@'
+ - !!binary |
+ LWhlbGxvIHdvcmxkIP//fg==
+ - "+hello world \u3053\u3093\u306B\u3061\u306F\u4E16\u754C"
+ - ''
+
+The raw diff demonstrates the original bytes are preserved
+
+ $ rbmanage dump-raw-diff 4
+ ID: 4 (draft)
+ diff --git a/foo b/foo
+ --- a/foo
+ +++ b/foo
+ @@ -1,1 +1,1 @@
+ -initial
+ +hello world \xff\xff~ (esc)
+
+
+
+ $ rbmanage dump-raw-diff 5
+ ID: 5 (draft)
+ diff --git a/foo b/foo
+ --- a/foo
+ +++ b/foo
+ @@ -1,1 +1,1 @@
+ -hello world \xff\xff~ (esc)
+ +hello world \xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf\xe4\xb8\x96\xe7\x95\x8c (esc)
+
+
+
Cleanup
$ mozreview stop
stopped 9 containers
--- a/pylib/mozreview/mozreview/resources/batch_review_request.py
+++ b/pylib/mozreview/mozreview/resources/batch_review_request.py
@@ -96,28 +96,27 @@ class BatchReviewRequestResource(WebAPIR
commit sets. This identifier is used to update review requests with a new
set of diffs from a new push. Generally this identifier will represent
some unit of work, such as a bug.
The ``commits`` argument takes the following form::
{
'squashed': {
- 'diff': <squashed-diff-string>,
+ 'diff_b64': <squashed-diff-string encoded as base64>,
'base_commit_id': <commit-id-to-apply-diff-to> (optional),
'first_public_ancestor': <commit of first public ancestor> (optional),
},
'individual': [
{
'id': <commit-id>,
'precursors': [<previous changeset>],
'message': <commit-message>,
- 'diff': <diff>,
+ 'diff_b64': <diff encoded as base64>,
'bug': <bug-id>,
- 'parent_diff': <diff-from-base-to-commit> (optional),
'base_commit_id': <commit-id-to-apply-diffs-to> (optional),
'first_public_ancestor': <commit of first public ancestor> (optional),
'reviewers': [<user1>, <user2>, ...] (optional),
'requal_reviewers': [<user1>, <user2>, ...] (optional),
},
{
...
},
@@ -273,25 +272,25 @@ class BatchReviewRequestResource(WebAPIR
for key in ('squashed', 'individual'):
if key not in commits:
logger.error('commits field does not contain %s' % key)
return INVALID_FORM_DATA, {
'fields': {
'commits': ['Does not contain %s key' % key]}}
- for key in ('base_commit_id', 'diff', 'first_public_ancestor'):
+ for key in ('base_commit_id', 'diff_b64', 'first_public_ancestor'):
if key not in commits['squashed']:
logger.error('squashed key missing %s' % key)
return INVALID_FORM_DATA, {
'fields': {
'commits': ['Squashed commit does not contain %s key' % key]}}
for commit in commits['individual']:
- for key in ('id', 'message', 'bug', 'diff', 'precursors', 'first_public_ancestor'):
+ for key in ('id', 'message', 'bug', 'diff_b64', 'precursors', 'first_public_ancestor'):
if key not in commit:
logger.error('commit (%s) missing key %s' % (
commit.get('id', '<id>') ,key))
return INVALID_FORM_DATA, {
'fields': {
'commits': ['Individual commit (%s) missing key %s' % (
commit.get('id', '<id>'), key)]
}}
@@ -395,17 +394,19 @@ class BatchReviewRequestResource(WebAPIR
# since it can be quite time consuming.
# Calling create_from_data() instead of create_from_upload() skips
# diff size validation. We allow unlimited diff sizes, so no biggie.
logger.info('%s: generating squashed diffset for %d' % (
identifier, squashed_rr.id))
diffset = DiffSet.objects.create_from_data(
repository=repo,
diff_file_name='diff',
- diff_file_contents=commits['squashed']['diff'],
+ # The original value is a unicode instance. Python 3 can't
+ # .encode() a unicode instance, so go to str first.
+ diff_file_contents=commits['squashed']['diff_b64'].encode('ascii').decode('base64'),
parent_diff_file_name=None,
parent_diff_file_contents=None,
diffset_history=None,
basedir='',
request=request,
base_commit_id=commits['squashed'].get('base_commit_id'),
save=True,
)
@@ -940,19 +941,19 @@ def update_review_request(local_site, re
for user in sorted(reviewer_users):
draft.target_people.add(user)
logger.debug('adding reviewer %s to #%d' % (user.username, rr.id))
try:
diffset = DiffSet.objects.create_from_data(
repository=rr.repository,
diff_file_name='diff',
- diff_file_contents=commit['diff'],
+ diff_file_contents=commit['diff_b64'].encode('ascii').decode('base64'),
parent_diff_file_name='diff',
- parent_diff_file_contents=commit.get('parent_diff'),
+ parent_diff_file_contents=None,
diffset_history=None,
basedir='',
request=request,
base_commit_id=commit.get('base_commit_id'),
save=True,
)
update_diffset_history(rr, diffset)
diffset.save()
--- a/testing/vcttesting/reviewboard/mach_commands.py
+++ b/testing/vcttesting/reviewboard/mach_commands.py
@@ -228,16 +228,46 @@ class ReviewBoardCommands(object):
description='Print a representation of a review request.')
@CommandArgument('rrid', help='Review request id to dump')
def dumpreview(self, rrid):
client = self._get_client()
root = client.get_root()
r = root.get_review_request(review_request_id=rrid)
print(serialize_review_requests(client, r))
+ @Command('dump-raw-diff', category='reviewboard',
+ description='Dump the raw content of a diff from the server')
+ @CommandArgument('rrid', help='Review request id of diffs to dump')
+ def dump_raw_diff(self, rrid):
+ from rbtools.api.errors import APIError
+
+ client = self._get_client()
+ root = client.get_root()
+ rr = root.get_review_request(review_request_id=rrid)
+
+ # mach wraps sys.stdout with a transparent UTF-8 writer. This
+ # will interfere with our printing of raw data. So bypass it.
+ stdout = os.fdopen(sys.stdout.fileno(), 'w')
+
+ for diff in rr.get_diffs():
+ stdout.write(b'ID: %d\n' % diff.id)
+ stdout.write(diff.get_patch().data)
+ stdout.write(b'\n')
+
+ try:
+ draft = rr.get_draft()
+ for diff in draft.get_draft_diffs():
+ stdout.write(b'ID: %d (draft)\n' % diff.id)
+ stdout.write(diff.get_patch().data)
+ stdout.write(b'\n')
+ except APIError:
+ pass
+
+ stdout.close()
+
@Command('add-reviewer', category='reviewboard',
description='Add a reviewer to a review request')
@CommandArgument('rrid', help='Review request id to modify')
@CommandArgument('--user', action='append',
help='User from whom to ask for review')
def add_reviewer(self, rrid, user):
from rbtools.api.errors import APIError