vcssync: functionality for preparing a Git ref for integration (bug 1357597) draft
author: Gregory Szorc <gps@mozilla.com>
Wed, 19 Apr 2017 15:49:10 -0700
changeset 10856 cf1a3a0439719aa4e7d8273bbe702a8c9e41b01f
parent 10855 ef7942ef042c74c6825e40bb27d5a9363db80aa7
child 10857 f867f6d5354ce934fa0bba4fe6853e9fa0ccc96f
push id: 1638
push user: bmo:gps@mozilla.com
push date: Sat, 22 Apr 2017 00:35:48 +0000
bugs: 1357597
vcssync: functionality for preparing a Git ref for integration (bug 1357597) Before importing a GitHub pull request into a Mercurial repository, we first need to fetch the commit data so we can operate on it. We also may need to "normalize" the commits so they conform to Mozilla standards. For example, we don't like merges or "fixup" commits in the final history of the Firefox repo, so if we see a pull request with either of these, we probably just want to squash it down to a single commit. Or more practically, we may wish to prune certain files or directories that aren't part of the destination repository. You could also conceive of more advanced functionality, such as automatically filing a bug and adding a reference to it in the commit message. Because "pull requests" are a specific implementation of a generic concept that is "incorporate [Git] changes from X into Y," we invent a generic term for the work we're doing that doesn't overload the "pull request" terminology: integration. This pre-import "integration" work constitutes sufficient complexity to justify having it exist as a standalone function. This commit establishes that function. Building on top of the work to identify a merge base and to squash commits, we implement a function that fetches Git commits needed to process a request to incorporate changes between 2 refs. It then rewrites the "incoming" commits as appropriate for the (eventual) destination repository. The rewriting is driven by 2 callbacks: 1 chooses the "shape" of the commits (currently either "preserve" or "squash") and the other allows rewriting of commits themselves. The latter can be used to change the commit message, etc. After returning successfully, the function ensures that a local Git repository contains commits for the originally requested "base" and "head" refs. A future consumer of this function will take the "integration" commit(s) and import them to Mercurial, possibly after other transforms, such as re-parenting them. 
While the code is intended to be used for GitHub pull requests, it is intentionally sufficiently generic to allow uses outside of GitHub. All you need are 2 Git refs and some optional callbacks to control rewriting and you can prepare any set of Git commits for whatever is ahead. MozReview-Commit-ID: Hl0RQ5aTPAy
vcssync/mozvcssync/cli.py
vcssync/mozvcssync/gitrewrite/integrate.py
vcssync/setup.py
vcssync/tests/test-integrate-git-ref-linear.t
--- a/vcssync/mozvcssync/cli.py
+++ b/vcssync/mozvcssync/cli.py
@@ -15,16 +15,19 @@ import hglib
 
 from .git2hg import (
     linearize_git_repo_to_hg,
 )
 from .gitrewrite import (
     RewriteError,
     commit_metadata_rewriter,
 )
+from .gitrewrite.integrate import (
+    prepare_ref_for_integration,
+)
 from .gitrewrite.linearize import (
     linearize_git_repo,
 )
 from .gitrewrite.squash import (
     squash_git_ref,
 )
 from .overlay import (
     overlay_hg_repos,
@@ -228,16 +231,67 @@ def squash_git():
     try:
         squash_git_ref(repo, args.base_ref, args.squash_ref,
                        commit_rewriter=rewriter)
     except RewriteError as e:
         logger.error('abort: %s' % str(e))
         sys.exit(1)
 
 
+def integrate_git_ref():
+    """CLI entry point: fetch two Git refs and prepare one for integration.
+
+    Parses ``base_url base_ref head_url head_ref repo_path`` plus an optional
+    ``--shape-strategy`` from the command line, then delegates to
+    ``prepare_ref_for_integration``. Exits with status 1 if the rewrite is
+    rejected with a ``RewriteError``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument('base_url', help='URL of Git repo to integrate to')
+    parser.add_argument('base_ref', help='Ref in Git repo to integrate to')
+    parser.add_argument('head_url', help='URL of Git repo to integrate from')
+    parser.add_argument('head_ref', help='Ref in Git repo to integrate from')
+    parser.add_argument('repo_path', help='Path to local Git repo to operate '
+                                          'on')
+    parser.add_argument('--shape-strategy', choices=('preserve', 'squash'),
+                        default='preserve',
+                        help='How to change the "shape" of incoming commits')
+
+    args = parser.parse_args()
+    configure_logging()
+
+    # Fetched refs are stored under refs/local/<name> in the local repo.
+    local_base_ref = b'local/%s' % args.base_ref
+    local_head_ref = b'local/%s' % args.head_ref
+
+    def shape_strategy(merge_info):
+        # The strategy comes straight from the command line; the merge
+        # properties are not consulted.
+        return args.shape_strategy
+
+    def rewriter(merge_info, strategy, commit_map, source_commit, dest_commit):
+        # Testing aid: squashed commits get a fixed, recognizable message.
+        if strategy == 'squash':
+            dest_commit.message = b'rewritten squashed commit'
+
+    try:
+        prepare_ref_for_integration(args.base_url, args.base_ref,
+                                    local_base_ref,
+                                    args.head_url, args.head_ref,
+                                    local_head_ref,
+                                    args.repo_path,
+                                    shape_strategy_fn=shape_strategy,
+                                    commit_rewriter=rewriter)
+    except RewriteError as e:
+        # Mirror squash_git(): report the failure and exit non-zero instead
+        # of dumping a traceback.
+        logger.error('abort: %s' % str(e))
+        sys.exit(1)
+
+
 def overlay_hg_repos_cli():
     # Unbuffer stdout.
     sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 1)
 
     parser = argparse.ArgumentParser()
     parser.add_argument('--hg', help='hg executable to use')
     parser.add_argument('--into', required=True,
                         help='Subdirectory into which changesets will be '
new file mode 100644
--- /dev/null
+++ b/vcssync/mozvcssync/gitrewrite/integrate.py
@@ -0,0 +1,198 @@
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+from __future__ import absolute_import, unicode_literals
+
+import logging
+import os
+import subprocess
+
+import dulwich.repo
+
+from . import (
+    prune_directories,
+    RewriteError,
+)
+from .squash import (
+    squash_commits_from_merge_info,
+)
+from ..gitutil import (
+    calculate_merge_properties,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+def prepare_ref_for_integration(base_url, base_ref, local_base_ref,
+                                head_url, head_ref, local_head_ref,
+                                repo_path,
+                                base_revision=None,
+                                head_revision=None,
+                                exclude_dirs=None,
+                                shape_strategy_fn=None,
+                                commit_rewriter=None):
+    """Prepare a Git ref for integration with another Git ref.
+
+    It is a common workflow to request a set of Git commits be integrated with
+    another Git commit. GitHub and other tools call this a "pull request."
+    Essentially, a Git repo+ref is being requested to be pulled/merged into
+    another repo+ref. This is commonly done by performing a ``git merge``.
+
+    Many projects don't do literal ``git merge`` based integration. Instead,
+    they rewrite commits first. This function exists to facilitate that
+    rewriting.
+
+    This function fetches the branches ``base_ref`` from ``base_url`` and
+    ``head_ref`` from ``head_url`` into the local refs
+    ``refs/<local_base_ref>`` and ``refs/<local_head_ref>`` of the Git
+    repository at ``repo_path``, creating a bare repository there if the path
+    does not exist. It then calculates an ideal merge base from *head* into
+    *base* and determines the set of commits between the merge base and
+    *head* that aren't in *base*. These commits are then rewritten (if
+    requested).
+
+    If ``base_revision`` and/or ``head_revision`` are defined, operations
+    operate on exactly these revisions. Otherwise, the value of the ref at the
+    time of the operation will be used (this value can change over time and lead
+    to non-deterministic behavior).
+
+    ``exclude_dirs``, if given, is passed to ``prune_directories`` to rewrite
+    the tree of every integrated commit.
+
+    ``shape_strategy_fn`` is a callable that determines how the DAG
+    "shape" of commits in ``head_ref`` should be treated. The function
+    receives as arguments the merge properties from
+    ``calculate_merge_properties``. If this argument is not passed or the
+    callable returns ``preserve``, commits will be carried forward as-is. If it
+    returns ``squash``, commits are squashed into a single commit. If it returns
+    a non-truthy value or raises an exception, the rewrite is denied. This
+    callback can be used to automatically squash commits when they don't
+    conform to a commit authoring strategy, for example.
+
+    ``commit_rewriter`` is a function called after "shape shifting" the
+    integration set before the commit is saved. The callback can be used to
+    rewrite commit messages, etc. The function receives as arguments the
+    derived merge properties, the resolved shape strategy, a dict mapping of
+    old commit IDs to new commit IDs, the original commit, and the commit that
+    is being rewritten. The rewritten commit object should be modified in place
+    if desired.
+
+    Returns a dict with the keys ``base_commit``, ``head_commit``,
+    ``merge_info``, ``integrate_commit`` (the last commit of the integration
+    set *after* rewriting), ``shape_strategy`` and ``commit_map`` (source
+    commit ID to rewritten commit ID).
+
+    Raises ``RewriteError`` if no shape strategy is resolved or there are no
+    commits to integrate, and ``ValueError`` if the strategy is unknown.
+    """
+    real_local_base_ref = b'refs/%s' % local_base_ref
+    real_local_head_ref = b'refs/%s' % local_head_ref
+
+    repo_path = os.path.abspath(repo_path)
+
+    if not os.path.exists(repo_path):
+        subprocess.check_call([b'git', b'init', b'--bare', repo_path])
+
+    # TODO consider checking for base_revision and head_revision local
+    # presence and avoiding the I/O.
+    for url, ref, local_ref in ((base_url, base_ref, local_base_ref),
+                                (head_url, head_ref, local_head_ref)):
+        logger.warning('fetching %s:%s into %s:%s',
+                       url, ref, repo_path, local_ref)
+        # The leading "+" forces the local ref to be updated even when the
+        # update is not a fast-forward.
+        subprocess.check_call(
+            [b'git', b'fetch', b'--no-tags', url,
+             b'+refs/heads/%s:refs/%s' % (ref, local_ref)],
+            cwd=repo_path)
+
+    repo = dulwich.repo.Repo(repo_path)
+
+    base_commit = repo[base_revision or real_local_base_ref]
+    head_commit = repo[head_revision or real_local_head_ref]
+
+    logger.warning('using base commit %s; head commit %s',
+                   base_commit.id, head_commit.id)
+
+    merge_info = calculate_merge_properties(repo, head_commit.id,
+                                            base_commit.id)
+
+    logger.warning('identified %s as most appropriate merge base',
+                   merge_info['merge_base'].id)
+
+    logger.warning('found %d commits to integrate',
+                   len(merge_info['incoming']))
+    for commit in merge_info['incoming']:
+        # A commit may have an empty message; don't crash just to log it.
+        message_lines = commit.message.splitlines()
+        logger.warning('%s %s', commit.id,
+                       message_lines[0] if message_lines else b'')
+
+    if shape_strategy_fn:
+        strategy = shape_strategy_fn(merge_info)
+    else:
+        strategy = 'preserve'
+    if not strategy:
+        raise RewriteError('no rewrite strategy defined')
+
+    if strategy == 'preserve':
+        incoming = merge_info['incoming']
+    elif strategy == 'squash':
+        logger.warning('squashing %d commits', len(merge_info['incoming']))
+        # We don't pass a commit rewriter because we have our own rewriting
+        # facility. This may create an orphaned commit object in the store.
+        # It shouldn't matter.
+        incoming = [squash_commits_from_merge_info(repo, merge_info)]
+    else:
+        raise ValueError('unknown rewrite strategy: %s' % strategy)
+
+    if not incoming:
+        # Nothing to rewrite means there is no integration commit to return;
+        # fail with a clear error instead of an IndexError.
+        raise RewriteError('no commits to integrate')
+
+    # Perform commit rewriting post shape shifting.
+
+    commit_map = {}
+    integrate_commit = None
+
+    for source_commit in incoming:
+        dest_commit = source_commit.copy()
+
+        # Re-parent onto already rewritten commits so the rewritten chain
+        # stays connected.
+        dest_commit.parents = [commit_map.get(p, p) for p in
+                               dest_commit.parents]
+
+        if exclude_dirs:
+            dest_commit.tree = prune_directories(repo.object_store,
+                                                 source_commit.tree,
+                                                 exclude_dirs).id
+
+        if commit_rewriter:
+            # Pass a copy of the map; callback mutations don't affect ours.
+            commit_rewriter(merge_info, strategy, dict(commit_map),
+                            source_commit, dest_commit)
+
+        commit_map[source_commit.id] = dest_commit.id
+        # The last commit processed is the tip of the integration set.
+        integrate_commit = dest_commit
+
+        if source_commit.id == dest_commit.id:
+            logger.warning('commit %s remains unchanged', source_commit.id)
+            continue
+
+        repo.object_store.add_object(dest_commit)
+        logger.warning('source commit %s rewritten to %s',
+                       source_commit.id, dest_commit.id)
+
+    return {
+        'base_commit': base_commit,
+        'head_commit': head_commit,
+        'merge_info': merge_info,
+        'integrate_commit': integrate_commit,
+        'shape_strategy': strategy,
+        'commit_map': commit_map,
+    }
--- a/vcssync/setup.py
+++ b/vcssync/setup.py
@@ -9,16 +9,17 @@ console_scripts = [
     'servo-overlay=mozvcssync.servo:overlay_cli',
     'servo-pulse-listen=mozvcssync.servo:pulse_daemon',
 ]
 
 # These commands are really only useful for testing. So don't expose them by
 # default.
 if 'VCSSYNC_ENABLE_TESTING_COMMANDS' in os.environ:
     console_scripts.extend([
+        'integrate-git-ref=mozvcssync.cli:integrate_git_ref',
         'squash-git-ref=mozvcssync.cli:squash_git',
     ])
 
 setup(
     name='mozvcssync',
     version='0.1',
     description='Synchronize changes across VCS repositories',
     url='https://mozilla-version-control-tools.readthedocs.io/',
new file mode 100644
--- /dev/null
+++ b/vcssync/tests/test-integrate-git-ref-linear.t
@@ -0,0 +1,93 @@
+  $ . $TESTDIR/vcssync/tests/helpers.sh
+
+Create our base Git repo
+
+  $ standardgitrepo base > /dev/null 2>&1
+
+Make new commits on a clone, simulating work
+
+  $ git clone -q base head
+  $ cd head
+  $ git checkout -q -b feature-branch
+  $ echo 0 > foo
+  $ git commit -q --all -m 'local commit 1'
+  $ echo 1 > foo
+  $ git commit -q --all -m 'local commit 2'
+  $ echo 2 > foo
+  $ git commit -q --all -m 'local commit 3'
+
+  $ git log --graph --format=oneline --all -n 4
+  * 6837841cb20bb06e905436a3920420c4be751bdc local commit 3
+  * 24bd7a1e72cd67f62a60d98babc5ac8c55a37f18 local commit 2
+  * 80bdefcfe37bb05d997ff2dcaa265a1fd7d2623a local commit 1
+  * a447b9b0ff25bf17daab1c7edae4a998eca0adac dummy commit 1 after merge
+
+  $ cd ..
+
+Request to integrate the new commits should pick our 3 local commits
+
+  $ integrate-git-ref `pwd`/base master `pwd`/head feature-branch grepo
+  Initialized empty Git repository in $TESTTMP/grepo/
+  fetching $TESTTMP/base:master into $TESTTMP/grepo:local/master
+  From $TESTTMP/base
+   * [new branch]      master     -> refs/local/master
+  fetching $TESTTMP/head:feature-branch into $TESTTMP/grepo:local/feature-branch
+  From $TESTTMP/head
+   * [new branch]      feature-branch -> refs/local/feature-branch
+  using base commit a447b9b0ff25bf17daab1c7edae4a998eca0adac; head commit 6837841cb20bb06e905436a3920420c4be751bdc
+  identified a447b9b0ff25bf17daab1c7edae4a998eca0adac as most appropriate merge base
+  found 3 commits to integrate
+  80bdefcfe37bb05d997ff2dcaa265a1fd7d2623a local commit 1
+  24bd7a1e72cd67f62a60d98babc5ac8c55a37f18 local commit 2
+  6837841cb20bb06e905436a3920420c4be751bdc local commit 3
+  commit 80bdefcfe37bb05d997ff2dcaa265a1fd7d2623a remains unchanged
+  commit 24bd7a1e72cd67f62a60d98babc5ac8c55a37f18 remains unchanged
+  commit 6837841cb20bb06e905436a3920420c4be751bdc remains unchanged
+
+Create commits in the base repo, causing DAG divergence
+
+  $ cd base
+  $ echo 0 > foo
+  $ git commit -q --all -m 'upstream commit 1'
+  $ echo 1 > foo
+  $ git commit -q --all -m 'upstream commit 2'
+  $ cd ..
+
+Should still identify the same merge base and result in same operation
+
+  $ integrate-git-ref `pwd`/base master `pwd`/head feature-branch grepo
+  fetching $TESTTMP/base:master into $TESTTMP/grepo:local/master
+  From $TESTTMP/base
+     a447b9b..422b237  master     -> refs/local/master
+  fetching $TESTTMP/head:feature-branch into $TESTTMP/grepo:local/feature-branch
+  using base commit 422b237d26bd553394edb6992b74380b69e54d3a; head commit 6837841cb20bb06e905436a3920420c4be751bdc
+  identified a447b9b0ff25bf17daab1c7edae4a998eca0adac as most appropriate merge base
+  found 3 commits to integrate
+  80bdefcfe37bb05d997ff2dcaa265a1fd7d2623a local commit 1
+  24bd7a1e72cd67f62a60d98babc5ac8c55a37f18 local commit 2
+  6837841cb20bb06e905436a3920420c4be751bdc local commit 3
+  commit 80bdefcfe37bb05d997ff2dcaa265a1fd7d2623a remains unchanged
+  commit 24bd7a1e72cd67f62a60d98babc5ac8c55a37f18 remains unchanged
+  commit 6837841cb20bb06e905436a3920420c4be751bdc remains unchanged
+
+Now ask for commits to be squashed
+
+  $ integrate-git-ref `pwd`/base master `pwd`/head feature-branch grepo --shape-strategy squash
+  fetching $TESTTMP/base:master into $TESTTMP/grepo:local/master
+  fetching $TESTTMP/head:feature-branch into $TESTTMP/grepo:local/feature-branch
+  using base commit 422b237d26bd553394edb6992b74380b69e54d3a; head commit 6837841cb20bb06e905436a3920420c4be751bdc
+  identified a447b9b0ff25bf17daab1c7edae4a998eca0adac as most appropriate merge base
+  found 3 commits to integrate
+  80bdefcfe37bb05d997ff2dcaa265a1fd7d2623a local commit 1
+  24bd7a1e72cd67f62a60d98babc5ac8c55a37f18 local commit 2
+  6837841cb20bb06e905436a3920420c4be751bdc local commit 3
+  squashing 3 commits
+  source commit 93915f82a20f48358bb9f5917fff592855cfc06e rewritten to af98a7004ea61618dee80f4d2a9108356580204b
+
+  $ GIT_DIR=grepo git cat-file -p af98a7004ea61618dee80f4d2a9108356580204b
+  tree c3f19f98812e09a59b7df4cf5c694bb19b942f18
+  parent a447b9b0ff25bf17daab1c7edae4a998eca0adac
+  author test <test@example.com> 0 +0000
+  committer test <test@example.com> 0 +0000
+  
+  rewritten squashed commit (no-eol)