pygit2 Slow performance on Diff Patch iteration and no cache on DiffStats

Question

It seems that pygit2 does not cache the Patch and DiffStats attributes of a Diff object: each access to these attributes re-triggers the git stats calculation. Is there a way to optimize this operation and possibly cache the stats numbers?

I compared the diff operation on a large commit using GitPython and pygit2, and the difference in generation time was substantial.

Here is an example comparing the performance of pygit2 vs GitPython for git diff and stats (pygit2 is several times slower):

import os
import string
import random
import tempfile
from time import time
import pygit2
import git as gitpython

# Throwaway directory that holds the single repository used by both libraries.
repo_dir = tempfile.TemporaryDirectory()
# Create the (non-bare) repository with pygit2 first; the GitPython handle
# below is opened on the same path, so it must come after this line.
pygit2_repo = pygit2.init_repository(path=repo_dir.name, bare=False, initial_head='master')
gitpython_repo = gitpython.Repo(repo_dir.name)


def create_commit(repo, nb_files=20, nb_lines=100000, nb_char_per_line=50):
    """Write ``nb_files`` text files into the work tree and commit them on HEAD.

    Each file holds one freshly generated random line of ``nb_char_per_line``
    printable characters, repeated ``nb_lines`` times. Files are named
    ``file_<n>.txt``, so a second call overwrites the files of the first one.

    Args:
        repo: pygit2 repository with a working directory.
        nb_files: Number of files to (re)write.
        nb_lines: Number of lines written to each file.
        nb_char_per_line: Number of random characters per line.

    Returns:
        The id of the newly created commit as a hexadecimal string.
    """
    for x in range(nb_files):
        # One random line per file, repeated nb_lines times.
        line = ''.join(random.choices(string.printable, k=nb_char_per_line))
        blob = '\n'.join([line] * nb_lines)
        with open(os.path.join(repo.workdir, 'file_%s.txt' % x), 'w') as f:
            f.write(blob)
    repo.index.add_all()
    repo.index.write()
    tree = repo.index.write_tree()

    # The very first commit has no parent; later ones chain onto current HEAD.
    parents = []
    if not repo.is_empty:
        parent, _ = repo.resolve_refish(refish=repo.head.name)
        # ``.id`` replaces the deprecated ``.oid`` accessor.
        parents = [parent.id]

    commit_id = repo.create_commit(
        "HEAD",
        repo.default_signature,
        repo.default_signature,
        "Commit message",
        tree,
        parents,
    )
    # ``str(oid)`` replaces the deprecated ``oid.hex`` and yields the same text.
    return str(commit_id)


# Benchmark: create two commits, then time equivalent diff/stats operations
# with pygit2 and GitPython, printing the elapsed wall-clock seconds of each.
if __name__ == '__main__':
    # Two commits touching the same files, so the second one modifies them all.
    first_commit = create_commit(pygit2_repo)
    second_commit = create_commit(pygit2_repo)

    # --- Diff generation -------------------------------------------------
    t0 = time()
    # NOTE(review): pygit2_commit is not used again in this script, but its
    # lookup is included in the measured time.
    pygit2_commit = pygit2_repo.get(second_commit)
    pygit2_diff = pygit2_repo.diff(first_commit, second_commit)
    print('pygit2    diff generation  %f' % (time()-t0))

    t0 = time()
    gitpython_commit = gitpython_repo.commit(second_commit)
    # NOTE(review): this diffs the second commit against its first parent,
    # whereas the pygit2 call above is given (first, second) - the direction
    # of the two diffs may differ; confirm if the results are compared.
    gitpython_diff = gitpython_commit.diff(gitpython_commit.parents[0])
    print('gitpython diff generation  %f' % (time()-t0))

    # --- Total insertions / deletions -------------------------------------
    t0 = time()
    # Fetch the stats object once and read both totals from it.
    s = pygit2_diff.stats
    pygit2_total_deletions = s.deletions
    pygit2_total_insertions = s.insertions
    print('pygit2    stats lookup  %f' % (time()-t0))

    t0 = time()
    # Here ``.stats`` is accessed twice, once per total.
    gitpython_total_deletions = gitpython_commit.stats.total['deletions']
    gitpython_total_insertions = gitpython_commit.stats.total['insertions']
    print('gitpython stats lookup  %f' % (time()-t0))

    # --- Per-file line statistics -----------------------------------------
    t0 = time()
    # Iterating the Diff yields one patch per changed file; map each new
    # file path to that patch's line_stats.
    pygit2_line_stats = {patch.delta.new_file.path: patch.line_stats for patch in pygit2_diff}
    print('pygit2    line stats  %f' % (time()-t0))

    t0 = time()
    gitpython_line_stats = gitpython_commit.stats.files
    print('gitpython line stats  %f' % (time()-t0))

    # --- Changed-path listing ---------------------------------------------
    t0 = time()
    # Uses ``deltas`` rather than iterating the Diff itself as done above.
    pygit2_diff_changes = [(d.new_file.path, d.old_file.path) for d in pygit2_diff.deltas]
    print('pygit2    diff deltas iteration   %f' % (time()-t0))

    t0 = time()
    # NOTE(review): tuple order here is (a_path, b_path) while the pygit2
    # list is (new, old); presumably the orders are swapped - verify before
    # comparing the two lists.
    gitpython_diff_changes = [(patch.a_path, patch.b_path) for patch in gitpython_diff]
    print('gitpython diff changes iteration  %f' % (time()-t0))

There is also the odd behavior of recalculating all changes when iterating over files from a Diff object.

Is it possible to pre-populate all changes within a Diff to improve speed?

On another note, is it possible to discard certain file differences at the git level when their changes are too large, perhaps via an argument passed to the diff function?

Thanks in advance!

pygit2 Slow performance on Diff Patch iteration and no cache on DiffStats

0 Answers0