I wrote the following code to extract commit information from a GitHub repos. I've used the tabulate
module to display the data in tabular form. I'm getting UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-5346: character maps to <undefined>
. I'm using visual studio code for compiling purposes.
from datetime import datetime
from pydriller import Repository
from tabulate import tabulate
from pydriller.metrics.process.code_churn import CodeChurn
repo_url = 'https://github.com/horsicq/XOpcodeCalc' # Replace with the repository URL or local path
# mining_parameters = {
# # 'path': '/path/to/relevant/files', # Optional: Specify the path to relevant files/folders
# 'since': datetime(2022, 1, 1), # Optional: Filter commits since a specific date
# 'to': datetime(2022, 12, 31), # Optional: Filter commits up to a specific date
# 'order': 'reverse' # Optional: Set the order of retrieved commits (default is 'reverse')
# }
table_data = []
for commit in Repository(repo_url).traverse_commits():
commit_id = commit.hash
committer_name = commit.committer
commit_date = commit.committer_date
commit_message = commit.msg
author_name = commit.author.name
author_email = commit.author.email
branch = commit.branches
in_main_branch_stat = commit.in_main_branch
merge_commit = commit.merge
# dmm_u_size = commit.dmm_unit_size
# dmm_u_complexity = commit.dmm_unit_complexity
# dmm_u_interfacing = commit.dmm_unit_interfacing
row = [commit_id,committer_name, commit_message, author_name, author_email, commit_date, branch, in_main_branch_stat, merge_commit]
table_data.append(row)
# print(f"Commit ID: {commit_id}")
# print(f"Committer: {committer_name}")
# print(f"Date: {commit_date}")
# print(f"Message: {commit_message}")
# print(f"Author: {author_name} <{author_email}>")
# print(f"Branch: {branch}")
# print(f"In main branch status: {in_main_branch_stat}")
# print(f"Merge commit condition: {merge_commit}")
# print(f"DMM Unit Size: {dmm_u_size}")
# print(f"DMM Unit Complexity: {dmm_u_complexity}")
# print(f"DMM Unit Interfacing: {dmm_u_interfacing}")
for modified_file in commit.modified_files:
file_path = modified_file.filename
change_type = modified_file.change_type
lines_added = modified_file.added_lines
lines_removed = modified_file.deleted_lines
No_methods = modified_file.methods # giving hexadecimal of location of methods
No_methods_before = modified_file.methods_before # giving hexadecimal of location of methods
changed_methods = modified_file.changed_methods # giving hexadecimal of location of methods
complexity = modified_file.complexity
# code_diff = modified_file.diff
nloc = modified_file.nloc
# source_code = modified_file.source_code
# source_code_before = modified_file.source_code_before
row = ['', file_path, change_type, lines_added, lines_removed, No_methods,No_methods_before, changed_methods, complexity]
table_data.append(row)
# print(f"File Path: {file_path}")
# print(f"Change Type: {change_type}")
# print(f"Lines Added: {lines_added}")
# print(f"Lines Removed: {lines_removed}")
# print(f"No_methods: {No_methods}")
# print(f"No_methods_before: {No_methods_before}")
# print(f"Changed_methods: {changed_methods}")
# print(f"Complexity: {complexity}")
# print(f"Code Diff: {code_diff}")
# print(f"Lines of Code: {nloc}")
# print(f"source_code_before: {source_code_before}")
# print(f"source_code: {source_code}")
# print('-' * 50)
# print('-' * 50)
table_headers = ['Commit ID', 'Committer Name', 'Message', 'Author Name', 'Author Email', 'Commit Date', 'Branch', 'In Main Branch Stat', 'Merge Commit', 'File Path', 'Change Type', 'Lines Added', 'Lines Removed', 'No. of Methods', 'No. of Methods Before', 'Changed Methods', 'Complexity']
table = tabulate(table_data, headers=table_headers, tablefmt='fancy_grid')
print(table)
OUTPUT:
[Running] python -u "c:\Users\pulki\Desktop\Pulkit_work\Coding_stuff\apirewuest\api.py"
Traceback (most recent call last):
File "c:\Users\pulki\Desktop\Pulkit_work\Coding_stuff\apirewuest\api.py", line 82, in <module>
print(table)
File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
return codecs.charmap_encode(input,self.errors,encoding_table)[0]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-5346: character maps to <undefined>
[Done] exited with code=1 in 195.387 seconds
Please can someone identify the error and let me know. Thanks
Also can someone suggest a better way to extract the data?
I used tabulate to display the data in tabular form but an encoding error is thrown