vcssync: extract git commit rewriting to standalone function (
bug 1357236); r?glob
linearize_git_repo() accepts a bunch of arguments and contains code
for rewriting Git commit object metadata (author field, summary line,
commit message, etc). This rewriting is not critical to the operation
of this function. It exists there for hysterical raisins.
As the inline TODO in that function states, we want to extract this
metadata rewriting to its own function so we have separation of
concerns and can perform similar rewriting from other consumers
(such as a Git pull request importer).
This commit extracts the Git commit metadata rewriting to its own
reusable function.
The new commit_metadata_rewriter() function accepts a repo and
behavioral arguments and returns a function that is called to
process a pair of old and new Git commit objects. This function
is then passed to linearize_git_repo() and called on each processed
commit.
This change is a bit larger than I would have liked. There was
significant churn around callers of linearize_git_repo(), including
from the CLI layer. Some of this was due to needing to construct a
dulwich.repo.Repo instance. Other changes were around handling of
argument passing. The new code relies much less on **kwargs and
I think that is a good thing because it is too easy for **kwargs
to hide subtle bugs like unused arguments. Unfortunately, it does
make the change a bit harder to review.
MozReview-Commit-ID: Azqmx5EkmBe
--- a/vcssync/mozvcssync/cli.py
+++ b/vcssync/mozvcssync/cli.py
@@ -5,23 +5,25 @@
from __future__ import absolute_import, unicode_literals
import argparse
import logging
import os
import subprocess
import sys
+import dulwich.repo
import hglib
from .git2hg import (
linearize_git_repo_to_hg,
)
from .gitrewrite import (
RewriteError,
+ commit_metadata_rewriter,
)
from .gitrewrite.linearize import (
linearize_git_repo,
)
from .overlay import (
overlay_hg_repos,
PushRaceError,
PushRemoteFail,
@@ -61,27 +63,28 @@ LINEARIZE_GIT_ARGS = [
'merge commits')),
(('--github-username',), dict(help='Username to use for GitHub API '
'requests')),
(('--github-token',), dict(help='GitHub API token to use for GitHub API '
'requests')),
]
-def get_git_linearize_kwargs(args):
+def get_commit_rewriter_kwargs(args):
+ """Obtain keyword arguments to pass to ``commit_metadata_rewriter()``"""
kwargs = {}
# Credentials are sensitive. Allow them to come from environment,
# which isn't exposed in e.g. `ps`.
for env in ('GITHUB_USERNAME', 'GITHUB_TOKEN'):
v = os.environ.get(env)
if v is not None:
kwargs[env.lower()] = v
- for k in ('exclude_dirs', 'summary_prefix', 'reviewable_key',
+ for k in ('summary_prefix', 'reviewable_key',
'remove_reviewable', 'source_repo_key',
'source_revision_key', 'normalize_github_merge_message',
'committer_action', 'use_p2_author',
'github_username', 'github_token'):
v = getattr(args, k)
if v is not None:
kwargs[k] = v
@@ -116,23 +119,28 @@ def linearize_git():
help='URL of repository being converted')
parser.add_argument('git_repo', help='Path to Git repository to linearize')
parser.add_argument('ref', help='ref to linearize')
args = parser.parse_args()
configure_logging()
- kwargs = get_git_linearize_kwargs(args)
+ kwargs = get_commit_rewriter_kwargs(args)
if args.source_repo:
kwargs['source_repo'] = args.source_repo
+ repo = dulwich.repo.Repo(args.git_repo)
+ rewriter = commit_metadata_rewriter(repo, **kwargs)
+
try:
- linearize_git_repo(args.git_repo, args.ref, **kwargs)
+ linearize_git_repo(repo, args.ref,
+ exclude_dirs=args.exclude_dirs,
+ commit_rewriter=rewriter)
except RewriteError as e:
logger.error('abort: %s' % str(e))
def linearize_git_to_hg():
parser = argparse.ArgumentParser()
for args, kwargs in LINEARIZE_GIT_ARGS:
parser.add_argument(*args, **kwargs)
@@ -167,31 +175,32 @@ def linearize_git_to_hg():
args = parser.parse_args()
if args.hg:
hglib.HGPATH = args.hg
configure_logging()
- kwargs = get_git_linearize_kwargs(args)
- for k in ('similarity', 'find_copies_harder', 'skip_submodules',
- 'move_to_subdir', 'git_push_url', 'hg_push_url',
- 'shamap_s3_upload_url'):
- v = getattr(args, k)
- if v is not None:
- kwargs[k] = v
+ rewriter_args = get_commit_rewriter_kwargs(args)
try:
linearize_git_repo_to_hg(
args.git_repo_url,
args.git_ref,
args.git_repo_path,
args.hg_repo_path,
- **kwargs)
+ git_push_url=args.git_push_url,
+ hg_push_url=args.hg_push_url,
+ move_to_subdir=args.move_to_subdir,
+ find_copies_harder=args.find_copies_harder,
+ skip_submodules=args.skip_submodules,
+ similarity=args.similarity,
+ shamap_s3_upload_url=args.shamap_s3_upload_url,
+ git_commit_rewriter_args=rewriter_args)
except RewriteError as e:
logger.error('abort: %s' % str(e))
sys.exit(1)
except subprocess.CalledProcessError:
sys.exit(1)
def overlay_hg_repos_cli():
--- a/vcssync/mozvcssync/git2hg.py
+++ b/vcssync/mozvcssync/git2hg.py
@@ -5,35 +5,39 @@
from __future__ import absolute_import, unicode_literals
import hashlib
import logging
import os
import subprocess
import tempfile
+import dulwich.repo
import hglib
+from .gitrewrite import (
+ commit_metadata_rewriter,
+)
from .gitrewrite.linearize import (
linearize_git_repo,
)
logger = logging.getLogger(__name__)
def linearize_git_repo_to_hg(git_source_url, ref, git_repo_path, hg_repo_path,
git_push_url=None,
hg_push_url=None,
move_to_subdir=None,
find_copies_harder=False,
skip_submodules=False,
similarity=50,
shamap_s3_upload_url=None,
- **kwargs):
+ git_commit_rewriter_args=None):
"""Linearize a Git repo to an hg repo by squashing merges.
Many Git repositories (especially those on GitHub) have an excessive
number of merge commits and don't practice "every commit is
good/bisectable." When converting these repositories to Mercurial, it is
often desirable to ignore the non-first-parent ancestry so the result has
more readable history.
@@ -48,16 +52,20 @@ def linearize_git_repo_to_hg(git_source_
This directory will be created if necessary.
If ``git_push_url`` is specified, the local clone (including converted
commits) will be pushed to that URL.
If ``hg_push_url`` is specified, the converted Mercurial repo will be
pushed to that URL.
+ ``git_commit_rewriter_args`` is a dict of arguments to pass to
+ ``gitrewrite.commit_metadata_rewriter()`` to construct a function for
+ rewriting Git commits.
+
The conversion works in phases:
1) Git commits are rewritten into a new ref.
2) ``hg convert`` converts the rewritten Git commits to Mercurial.
See the docs in ``/docs/vcssync.rst`` for reasons why.
Returns a dict describing the conversion result. The dict has the following
@@ -83,29 +91,34 @@ def linearize_git_repo_to_hg(git_source_
hg_repo_path = os.path.abspath(hg_repo_path)
# Create Git repo, if necessary.
if not os.path.exists(git_repo_path):
subprocess.check_call([b'git', b'init', b'--bare', git_repo_path])
# We don't need to set up a remote because we use an explicit refspec
# during fetch.
+ git_repo = dulwich.repo.Repo(git_repo_path)
+
subprocess.check_call([b'git', b'fetch', b'--no-tags', git_source_url,
b'heads/%s:heads/%s' % (ref, ref)],
cwd=git_repo_path)
if git_push_url:
subprocess.check_call([b'git', b'push', b'--mirror', git_push_url],
cwd=git_repo_path)
+ rewriter = commit_metadata_rewriter(git_repo,
+ source_repo=git_source_url,
+ **git_commit_rewriter_args)
+
git_state = linearize_git_repo(
- git_repo_path,
+ git_repo,
b'heads/%s' % ref,
- source_repo=git_source_url,
- **kwargs)
+ commit_rewriter=rewriter)
if git_push_url:
subprocess.check_call([b'git', b'push', b'--mirror', git_push_url],
cwd=git_repo_path)
rev_map = os.path.join(hg_repo_path, b'.hg', b'shamap')
def maybe_push_hg():
--- a/vcssync/mozvcssync/gitrewrite/__init__.py
+++ b/vcssync/mozvcssync/gitrewrite/__init__.py
@@ -291,8 +291,141 @@ def rewrite_commit_message(message, summ
message = b'%s %s' % (summary_prefix, message)
message = b'%s\n' % message.rstrip()
assert isinstance(message, str)
result['message'] = message
return result
+
+
+def commit_metadata_rewriter(
+ repo,
+ summary_prefix=None,
+ reviewable_key=None,
+ remove_reviewable=False,
+ normalize_github_merge_message=False,
+ source_repo_key=None,
+ source_repo=None,
+ source_revision_key=None,
+ committer_action='keep',
+ author_map=None,
+ use_p2_author=False,
+ github_username=None,
+ github_token=None):
+ """Obtain a function used to rewrite Git commit object metadata.
+
+ The returned function is called with 2 arguments: the old and new Git commit
+ objects. The function is expected to mutate the new Git commit object.
+
+ The function is attached to a ``dulwich.repo.Repo`` instance and a set of
+ behavioral options defined via arguments. Those arguments are as follows:
+
+ ``summary_prefix`` allows prefixing the summary line of the commit
+ message with a string. A space separates the prefix from the original
+ message.
+
+ ``reviewable_key`` if set will replace Reviewable.io Markdown in the
+ commit message with a string of the form ``<reviewable_key>: <URL>``.
+
+ ``remove_reviewable`` will remove Reviewable.io Markdown in the commit
+ message.
+
+ ``source_repo_key`` and ``source_repo`` rewrite the commit message to
+ contain metadata listing the source repository in the form
+ ``<source_repo_key>: <source_repo>``. ``source_repo`` should presumably
+ be a URL.
+
+ ``source_revision_key`` if specified will rewrite the commit message
+ to contain a line of the form ``<source_revision_key>: COMMIT`` where
+ ``COMMIT`` is the original Git commit ID.
+
+ ``committer_action`` specifies how to handle the ``committer`` field in
+ the Git commit object. Possible values are ``keep`` (the default) to
+ not modify the field, ``use-author`` to copy the ``author`` field to the
+ ``committer`` field, or ``use-committer`` to copy the ``committer``
+ field to the ``author`` field.
+
+ ``author_map`` is a dict mapping old author/committer values to new
+ ones.
+
+ ``use_p2_author`` indicates whether to use the author of the 2nd parent
+ on merge commits. By default, the author of the merge commit is used.
+ """
+ if committer_action not in ('keep', 'use-author', 'use-committer'):
+ raise ValueError('committer_action must be one of keep, use-author, '
+ 'or use-committer')
+
+ author_map = author_map or {}
+
+ github_client = None
+ if github_username and github_token:
+ github_client = github3.login(username=github_username,
+ token=github_token)
+
+ github_org, github_repo = None, None
+ github_cache_dir = os.path.join(repo.path, 'github-cache')
+
+ if source_repo and source_repo.startswith(b'https://github.com/'):
+ orgrepo = source_repo[len(b'https://github.com/'):]
+ github_org, github_repo = orgrepo.split(b'/')
+
+ if github_client and github_repo and not os.path.exists(github_cache_dir):
+ os.mkdir(github_cache_dir)
+
+
+ def rewrite_commit(source_commit, dest_commit):
+ if use_p2_author and len(source_commit.parents) == 2:
+ c = repo[source_commit.parents[1]]
+ author = c.author
+ committer = c.committer
+ else:
+ author = source_commit.author
+ committer = source_commit.committer
+
+ dest_commit.author = author_map.get(author, author)
+ dest_commit.committer = author_map.get(committer, committer)
+
+ if committer_action == 'use-author':
+ dest_commit.committer = dest_commit.author
+ dest_commit.commit_time = dest_commit.author_time
+ dest_commit.commit_timezone = dest_commit.author_timezone
+ elif committer_action == 'use-committer':
+ dest_commit.author = dest_commit.committer
+ dest_commit.author_time = dest_commit.commit_time
+ dest_commit.author_timezone = dest_commit.commit_timezone
+ else:
+ assert committer_action == 'keep'
+
+ if summary_prefix or reviewable_key or remove_reviewable or normalize_github_merge_message:
+ message_result = rewrite_commit_message(
+ dest_commit.message,
+ summary_prefix=summary_prefix,
+ reviewable_key=reviewable_key,
+ remove_reviewable=remove_reviewable,
+ normalize_github_merge=normalize_github_merge_message,
+ github_client=github_client,
+ github_org=github_org,
+ github_repo=github_repo,
+ github_cache_dir=github_cache_dir,
+ )
+
+ dest_commit.message = message_result['message']
+
+ # Record source repository and revision annotations in commit message
+ # if requested.
+ if source_repo_key or source_revision_key:
+ lines = dest_commit.message.rstrip().splitlines()
+
+ # Insert a blank line if previous line isn't a "metadata" line.
+ if not re.match('^[a-zA-Z-]+: \S+$', lines[-1]) or len(lines) == 1:
+ lines.append(b'')
+
+ if source_repo_key:
+ lines.append(b'%s: %s' % (source_repo_key, source_repo))
+ if source_revision_key:
+ lines.append(b'%s: %s' % (source_revision_key,
+ source_commit.id))
+
+ dest_commit.message = b'%s\n' % b'\n'.join(lines)
+
+ return rewrite_commit
--- a/vcssync/mozvcssync/gitrewrite/linearize.py
+++ b/vcssync/mozvcssync/gitrewrite/linearize.py
@@ -6,44 +6,29 @@
This takes a repository with merge commits and rewrites commits to remove
the merges, yielding a clean, linear history.
"""
from __future__ import absolute_import, print_function, unicode_literals
import logging
-import os
-import re
import subprocess
-import dulwich.repo
-import github3
-
from . import (
prune_directories,
- rewrite_commit_message,
RewriteError,
)
logger = logging.getLogger(__name__)
-def linearize_git_repo(git_repo, ref, exclude_dirs=None,
- summary_prefix=None,
- reviewable_key=None,
- remove_reviewable=False,
- normalize_github_merge_message=False,
- source_repo_key=None, source_repo=None,
- source_revision_key=None,
- committer_action='keep',
- author_map=None,
- use_p2_author=False,
- github_username=None, github_token=None):
+def linearize_git_repo(repo, ref, exclude_dirs=None,
+ commit_rewriter=None):
"""Linearize a ref in a Git repository.
The commits in the ref will be rewritten to only include first parent
ancestry. i.e. all merge commits will be removed. All commits from the
non-first-parent ancestry will be dropped.
As a side-effect of conversion, the refs ``refs/convert/source/<ref>`` and
``refs/convert/dest/<ref>`` will be written containing pointers to the
@@ -53,45 +38,19 @@ def linearize_git_repo(git_repo, ref, ex
The original ``ref`` is untouched.
Subsequent invocations of this function will perform an incremental
conversion and only convert commits introduced since the last conversion.
``exclude_dirs`` is an iterable of directories to exclude from history.
- ``summary_prefix`` allows prefixing the summary line of the commit message
- with a string. A space separates the prefix from the original message.
-
- ``reviewable_key`` if set will replace Reviewable.io Markdown in the
- commit message with a string of the form ``<reviewable_key>: <URL>``.
-
- ``remove_reviewable`` will remove Reviewable.io Markdown in the commit
- message.
-
- ``source_repo_key`` and ``source_repo`` rewrite the commit message to
- contain metadata listing the source repository in the form
- ``<source_repo_key>: <source_repo>``. ``source_repo`` should presumably
- be a URL.
-
- ``source_revision_key`` if specified will rewrite the commit message
- to contain a line of the form ``<source_revision_key>: COMMIT`` where
- ``COMMIT`` is the original Git commit ID.
-
- ``committer_action`` specifies how to handle the ``committer`` field in
- the Git commit object. Possible values are ``keep`` (the default) to
- not modify the field, ``use-author`` to copy the ``author`` field to the
- ``committer`` field, or ``use-committer`` to copy the ``committer`` field
- to the ``author`` field.
-
- ``author_map`` is a dict mapping old author/committer values to new
- ones.
-
- ``use_p2_author`` indicates whether to use the author of the 2nd parent
- on merge commits. By default, the author of the merge commit is used.
+ ``commit_rewriter`` is a callable that will be invoked for every commit
+ being rewritten. The called function can modify the Git commit object
+ as it sees fit.
Returns a dict describing what rewrites were performed. The dict has the
following keys:
source_commit
Git commit hash corresponding to ``ref``.
dest_commit
The converted commit hash corresponding to converted ``ref``.
@@ -100,23 +59,16 @@ def linearize_git_repo(git_repo, ref, ex
commits that were converted as part of this evaluation.
source_ref
The ref holding the original commit that was last converted (points
to ``source_commit``).
dest_ref
The ref holding the converted commit that was last converted (points
to ``dest_commit``).
"""
- if committer_action not in ('keep', 'use-author', 'use-committer'):
- raise ValueError('committer_action must be one of keep, use-author, '
- 'or use-committer')
-
- author_map = author_map or {}
-
- repo = dulwich.repo.Repo(git_repo)
head = repo[b'refs/%s' % ref].id
# Look for state from previous conversion.
source_ref = b'refs/convert/source/%s' % ref
dest_ref = b'refs/convert/dest/%s' % ref
if source_ref in repo.refs and dest_ref not in repo.refs:
raise Exception('convert source ref without dest ref %s' % dest_ref)
@@ -172,31 +124,16 @@ def linearize_git_repo(git_repo, ref, ex
logger.warn('no new commits to linearize; not doing anything')
return result
source_commits = list(reversed(source_commits))
logger.warn('linearizing %d commits from %s (%s to %s)' % (
len(source_commits), ref, source_commits[0].id, source_commits[-1].id))
- github_client = None
- if github_username and github_token:
- github_client = github3.login(username=github_username,
- token=github_token)
-
- github_org, github_repo = None, None
- github_cache_dir = os.path.join(git_repo, 'github-cache')
-
- if source_repo and source_repo.startswith(b'https://github.com/'):
- orgrepo = source_repo[len(b'https://github.com/'):]
- github_org, github_repo = orgrepo.split(b'/')
-
- if github_client and github_repo and not os.path.exists(github_cache_dir):
- os.mkdir(github_cache_dir)
-
last_tree = repo[dest_commit_id].tree if dest_commit_id else None
rewrite_count = 0
for i, source_commit in enumerate(source_commits):
logger.warn('%d/%d %s %s' % (
i + 1, len(source_commits), source_commit.id,
source_commit.message.splitlines()[0].decode('utf-8', 'replace')))
@@ -213,74 +150,21 @@ def linearize_git_repo(git_repo, ref, ex
#
# In some cases, retaining empty commits may be desirable. So this
# behavior could be controlled by a function argument if wanted.
if dest_commit.tree == last_tree:
logger.warn('dropping %s because no tree changes' %
source_commit.id)
continue
- if use_p2_author and len(source_commit.parents) == 2:
- c = repo[source_commit.parents[1]]
- author = c.author
- committer = c.committer
- else:
- author = source_commit.author
- committer = source_commit.committer
-
# Replace parents list with our single parent from the last conversion.
dest_commit.parents = [dest_commit_id] if dest_commit_id else []
- dest_commit.author = author_map.get(author, author)
- dest_commit.committer = author_map.get(committer, committer)
-
- if committer_action == 'use-author':
- dest_commit.committer = dest_commit.author
- dest_commit.commit_time = dest_commit.author_time
- dest_commit.commit_timezone = dest_commit.author_timezone
- elif committer_action == 'use-committer':
- dest_commit.author = dest_commit.committer
- dest_commit.author_time = dest_commit.commit_time
- dest_commit.author_timezone = dest_commit.commit_timezone
- else:
- assert committer_action == 'keep'
-
- # Basic commit message rewriting.
- # TODO consider factoring this into a callback to make it extensible.
- if summary_prefix or reviewable_key or remove_reviewable or normalize_github_merge_message:
- message_result = rewrite_commit_message(
- dest_commit.message,
- summary_prefix=summary_prefix,
- reviewable_key=reviewable_key,
- remove_reviewable=remove_reviewable,
- normalize_github_merge=normalize_github_merge_message,
- github_client=github_client,
- github_org=github_org,
- github_repo=github_repo,
- github_cache_dir=github_cache_dir,
- )
-
- dest_commit.message = message_result['message']
-
- # Record source repository and revision annotations in commit message
- # if requested.
- if source_repo_key or source_revision_key:
- lines = dest_commit.message.rstrip().splitlines()
-
- # Insert a blank line if previous line isn't a "metadata" line.
- if not re.match('^[a-zA-Z-]+: \S+$', lines[-1]) or len(lines) == 1:
- lines.append(b'')
-
- if source_repo_key:
- lines.append(b'%s: %s' % (source_repo_key, source_repo))
- if source_revision_key:
- lines.append(b'%s: %s' % (source_revision_key,
- source_commit.id))
-
- dest_commit.message = b'%s\n' % b'\n'.join(lines)
+ if commit_rewriter:
+ commit_rewriter(source_commit, dest_commit)
# Our commit object is fully transformed. Write it.
repo.object_store.add_object(dest_commit)
rewrite_count += 1
dest_commit_id = dest_commit.id
last_tree = dest_commit.tree
result['commit_map'][source_commit.id] = dest_commit_id
@@ -303,26 +187,26 @@ def linearize_git_repo(git_repo, ref, ex
else:
reflog_commands.append(b'create %s\0%s' % (dest_ref, dest_commit_id))
p = subprocess.Popen([b'git', b'update-ref',
b'--create-reflog',
b'-m', b'linearize %s' % ref,
b'--stdin', b'-z'],
stdin=subprocess.PIPE,
- cwd=git_repo)
+ cwd=repo.path)
p.stdin.write(b'\0'.join(reflog_commands))
p.stdin.close()
res = p.wait()
if res:
raise Exception('failed to update refs')
logger.warn('%d commits from %s converted; original: %s; rewritten: %s' % (
rewrite_count, ref, head, repo[dest_ref].id))
# Perform a garbage collection so we don't have potentially thousands
# of loose objects sitting around, as performance will suffer and Git
# will complain otherwise.
subprocess.check_call([b'git',
b'-c', b'gc.autodetach=false',
- b'gc', b'--auto'], cwd=git_repo)
+ b'gc', b'--auto'], cwd=repo.path)
return result