0

I wrote the following code to extract commit information from a GitHub repos. I've used the tabulate module to display the data in tabular form. I'm getting UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-5346: character maps to <undefined>. I'm using visual studio code for compiling purposes.

from datetime import datetime
from pydriller import Repository
from tabulate import tabulate
from pydriller.metrics.process.code_churn import CodeChurn

repo_url = 'https://github.com/horsicq/XOpcodeCalc'  # Replace with the repository URL or local path
# mining_parameters = {
#     # 'path': '/path/to/relevant/files',  # Optional: Specify the path to relevant files/folders
#     'since': datetime(2022, 1, 1),  # Optional: Filter commits since a specific date
#     'to': datetime(2022, 12, 31),  # Optional: Filter commits up to a specific date
#     'order': 'reverse'  # Optional: Set the order of retrieved commits (default is 'reverse')
# }
table_data = []

for commit in Repository(repo_url).traverse_commits():
    commit_id = commit.hash
    committer_name = commit.committer
    commit_date = commit.committer_date
    commit_message = commit.msg
    author_name = commit.author.name
    author_email = commit.author.email 
    branch = commit.branches
    in_main_branch_stat = commit.in_main_branch
    merge_commit = commit.merge
    # dmm_u_size = commit.dmm_unit_size
    # dmm_u_complexity = commit.dmm_unit_complexity
    # dmm_u_interfacing = commit.dmm_unit_interfacing

    
    row = [commit_id,committer_name, commit_message, author_name, author_email, commit_date, branch, in_main_branch_stat, merge_commit]
    table_data.append(row)
    
    # print(f"Commit ID: {commit_id}")
    # print(f"Committer: {committer_name}")
    # print(f"Date: {commit_date}")
    # print(f"Message: {commit_message}")
    # print(f"Author: {author_name} <{author_email}>")
    # print(f"Branch: {branch}")
    # print(f"In main branch status: {in_main_branch_stat}")
    # print(f"Merge commit condition: {merge_commit}")
    # print(f"DMM Unit Size: {dmm_u_size}")
    # print(f"DMM Unit Complexity: {dmm_u_complexity}")
    # print(f"DMM Unit Interfacing: {dmm_u_interfacing}")

    
    for modified_file in commit.modified_files:
        file_path = modified_file.filename
        change_type = modified_file.change_type
        lines_added = modified_file.added_lines
        lines_removed = modified_file.deleted_lines
        No_methods = modified_file.methods                  # giving hexadecimal of location of methods
        No_methods_before = modified_file.methods_before    # giving hexadecimal of location of methods
        changed_methods = modified_file.changed_methods     # giving hexadecimal of location of methods
        complexity = modified_file.complexity
        # code_diff = modified_file.diff
        nloc = modified_file.nloc
        # source_code = modified_file.source_code
        # source_code_before = modified_file.source_code_before

        row = ['', file_path, change_type, lines_added, lines_removed, No_methods,No_methods_before, changed_methods, complexity]
        table_data.append(row)
        # print(f"File Path: {file_path}")
        # print(f"Change Type: {change_type}")
        # print(f"Lines Added: {lines_added}")
        # print(f"Lines Removed: {lines_removed}")
        # print(f"No_methods: {No_methods}")
        # print(f"No_methods_before: {No_methods_before}")
        # print(f"Changed_methods: {changed_methods}")
        # print(f"Complexity: {complexity}")
        # print(f"Code Diff: {code_diff}")
        # print(f"Lines of Code: {nloc}")

        # print(f"source_code_before: {source_code_before}")
        # print(f"source_code: {source_code}")
    #     print('-' * 50)

    # print('-' * 50)

    table_headers = ['Commit ID', 'Committer Name', 'Message', 'Author Name', 'Author Email', 'Commit Date', 'Branch', 'In Main Branch Stat', 'Merge Commit', 'File Path', 'Change Type', 'Lines Added', 'Lines Removed', 'No. of Methods', 'No. of Methods Before', 'Changed Methods', 'Complexity']

table = tabulate(table_data, headers=table_headers, tablefmt='fancy_grid')
print(table)

OUTPUT:

[Running] python -u "c:\Users\pulki\Desktop\Pulkit_work\Coding_stuff\apirewuest\api.py"
Traceback (most recent call last):
  File "c:\Users\pulki\Desktop\Pulkit_work\Coding_stuff\apirewuest\api.py", line 82, in <module>
    print(table)
  File "C:\Python311\Lib\encodings\cp1252.py", line 19, in encode
    return codecs.charmap_encode(input,self.errors,encoding_table)[0]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
UnicodeEncodeError: 'charmap' codec can't encode characters in position 0-5346: character maps to <undefined>

[Done] exited with code=1 in 195.387 seconds

Please can someone identify the error and let me know. Thanks

Also can someone suggest a better way to extract the data?

I used tabulate to display the data in tabular form but an encoding error is thrown

0 Answers0