It seems that pygit2 does not cache the Patch and DiffStats attributes of a Diff object. Each access to these attributes re-triggers the git stats calculation. Is there a way to optimize this operation, possibly by caching the stats numbers?
I compared the Diff operation on a large commit using GitPython and pygit2, and the difference in generation time was substantial.
Here is an example comparing the performance of GitPython vs pygit2 for git diff and stats (several times slower using pygit2):
import os
import string
import random
import tempfile
from time import time
import pygit2
import git as gitpython
# Scratch directory that backs the repository used by the whole benchmark.
repo_dir = tempfile.TemporaryDirectory()

# Create a non-bare repository there with pygit2, with 'master' as initial head.
pygit2_repo = pygit2.init_repository(
    path=repo_dir.name,
    bare=False,
    initial_head='master',
)

# Open the very same on-disk repository through GitPython so that both
# libraries are measured against identical data.
gitpython_repo = gitpython.Repo(repo_dir.name)
def create_commit(repo, nb_files=20, nb_lines=100000, nb_char_per_line=50):
    """Write a batch of generated text files and commit them on HEAD.

    For every file, one random string of ``nb_char_per_line`` characters
    drawn from ``string.printable`` is repeated ``nb_lines`` times and joined
    with ``'\\n'``. Files are named ``file_<n>.txt`` inside ``repo.workdir``,
    so calling this function again overwrites the same paths with new
    content.

    Args:
        repo: repository object (the script passes a pygit2 repository)
            exposing ``workdir``, ``index``, ``is_empty``, ``head``,
            ``resolve_refish``, ``default_signature`` and ``create_commit``.
        nb_files: number of files to (re)write.
        nb_lines: number of repetitions of the random string per file.
        nb_char_per_line: length of the random string.

    Returns:
        The id of the newly created commit, converted to ``str``.
    """
    for x in range(nb_files):
        line = ''.join(random.choices(string.printable, k=nb_char_per_line))
        blob = '\n'.join([line] * nb_lines)
        with open(os.path.join(repo.workdir, 'file_%s.txt' % x), 'w') as f:
            f.write(blob)

    # Stage everything that was written and persist the index before
    # turning it into a tree object.
    repo.index.add_all()
    repo.index.write()
    tree = repo.index.write_tree()

    # The very first commit has no parent; later ones chain onto HEAD.
    parents = []
    if not repo.is_empty:
        parent, _ = repo.resolve_refish(refish=repo.head.name)
        # `.id` / `str(oid)` replace the deprecated `.oid` / `.hex`
        # spellings, which recent pygit2 releases no longer provide.
        parents = [parent.id]

    commit_id = repo.create_commit(
        "HEAD",
        repo.default_signature,
        repo.default_signature,
        "Commit message",
        tree,
        parents,
    )
    return str(commit_id)
if __name__ == '__main__':
    # Two commits that rewrite the same file names, so the second commit
    # differs from its parent in every file.
    first_commit = create_commit(pygit2_repo)
    second_commit = create_commit(pygit2_repo)

    # --- Diff generation -------------------------------------------------
    started = time()
    pygit2_commit = pygit2_repo.get(second_commit)
    pygit2_diff = pygit2_repo.diff(first_commit, second_commit)
    print('pygit2 diff generation %f' % (time() - started))

    started = time()
    gitpython_commit = gitpython_repo.commit(second_commit)
    gitpython_diff = gitpython_commit.diff(gitpython_commit.parents[0])
    print('gitpython diff generation %f' % (time() - started))

    # --- Total insertions / deletions ------------------------------------
    started = time()
    diff_stats = pygit2_diff.stats
    pygit2_total_deletions = diff_stats.deletions
    pygit2_total_insertions = diff_stats.insertions
    print('pygit2 stats lookup %f' % (time() - started))

    started = time()
    gitpython_total_deletions = gitpython_commit.stats.total['deletions']
    gitpython_total_insertions = gitpython_commit.stats.total['insertions']
    print('gitpython stats lookup %f' % (time() - started))

    # --- Per-file line statistics ----------------------------------------
    started = time()
    pygit2_line_stats = {
        p.delta.new_file.path: p.line_stats
        for p in pygit2_diff
    }
    print('pygit2 line stats %f' % (time() - started))

    started = time()
    gitpython_line_stats = gitpython_commit.stats.files
    print('gitpython line stats %f' % (time() - started))

    # --- Listing changed paths -------------------------------------------
    started = time()
    pygit2_diff_changes = [
        (delta.new_file.path, delta.old_file.path)
        for delta in pygit2_diff.deltas
    ]
    print('pygit2 diff deltas iteration %f' % (time() - started))

    started = time()
    gitpython_diff_changes = [
        (change.a_path, change.b_path)
        for change in gitpython_diff
    ]
    print('gitpython diff changes iteration %f' % (time() - started))
There is also the odd behavior of recalculating all changes when iterating over files from a Diff object.
Is it possible to pre-populate all changes within a Diff to improve speed?
On another note, is it possible to discard certain file differences at the git level when their changes are excessive, possibly via an argument passed to the diff function?
Thanks in advance!