How count duplicates in one file and output it beautiful Python

Question

I have data with 3 columns. I need to compare this fields and if they are similar count them. If they are not similar then output information. I need to get this1. In below I put my code, but it doesn't count right and I don't know how to do this better

My data(file)

1082,hello,first
1082,hello,first
1082,hello,second
1082,yellow,third
1082,yellow,third
1082,yellow,third
1085,hello,fourth
1085,hello,fourth
1086,hello,fifth
1086,hello,fifth
1086,hello,fifth
1086,yellow,sixth
1086,yellow,sixth
1086,hello,seventh
1086,hello,seventh

I need to get this:

run.py

from create_excel import create_excel, settings_merge, create_header
from functions import separate_first_raw, separate_and_check_all_fields

input_file = "test.csv"
output_file = "test.xlsx"

my_file = open(input_file, "r", encoding="utf-8")
workbook = create_excel(output_file)
worksheet = workbook.add_worksheet()
merge_format = settings_merge(workbook)

raw = 1
column = 0
test = 0
raw_domain = 1
raw_site = 1
sum_category = 1
raw_category = 1
first = "true"

create_header(worksheet)

if my_file:
    current_line = my_file.readline()

    first_line_value = current_line.split(",")
    first_line_id = first_line_value[0]
    first_line_domain = first_line_value[1]
    first_line_value = first_line_value[2]

    # worksheet.write(1, 0, first_line_id)
    # worksheet.write(1, 1, first_line_domain)
    # worksheet.write(1, 4, first_line_value)
    raw = 2

    for line in my_file:
        previous_line = current_line
        print(f'previous_line {previous_line}')
        current_line = line
        print(f'current {current_line}')
        # print(f"Previous {previous_line}")
        # print(f"Current {current_line}")

        domain = separate_first_raw(previous_line, current_line, first)
        site = separate_and_check_all_fields(previous_line, current_line, 1, first)
        category = separate_and_check_all_fields(previous_line, current_line, 2, first)
#
#         # for i in domain:
#         #     print(i)
#         #     if i != 0:
#         #         worksheet.merge_range(raw_domain, 0, raw, 0, i[0], merge_format)
#         #         print(f"Id: {i[0]}  {raw_domain} and {raw}")
#         #         raw_domain = raw + 1
#         # for i in site:
#         #     if i != 0:
#         #         worksheet.write(raw, 1, i[1], merge_format)
#         #         raw_site = raw + 1
        for i in category:
            if i != 0:
                print(i)
                if first == "true":
                    worksheet.write(raw, 3, first_line_value, merge_format)
                    worksheet.write(raw, 2, sum_category, merge_format)
                else:
                    worksheet.write(raw, 3, i[2], merge_format)
                    worksheet.write(raw, 2, sum_category, merge_format)
                raw_category = raw + 1
                sum_category = 1
                raw += 1
                first = "false"
            else:
                sum_category += 1
# else:
#     print("Please input file")


workbook.close()

functions.py

def separate_first_raw(previous_line, current_line, test):
    something = []

    previous_list = previous_line.split(",")
    current_list = current_line.split(",")
    # if test == "true":
    #     something.append(previous_list)
    # else:
    if previous_list[0] != current_list[0]:
        something.append(current_list)
    else:
        something.append(0)

    return something


def separate_and_check_all_fields(previous_line, current_line, field, test):
    something = []

    previous_list = previous_line.split(",")
    current_list = current_line.split(",")
    if test == "true":
        something.append(previous_list)
    else:
        if previous_list[field] != current_list[field] or \
                previous_list[1] != current_list[1] or previous_list[0] != current_list[0]:
            something.append(current_list)
        else:
            something.append(0)

    return something

create_excel.py

import xlsxwriter


def create_excel(filename):
    workbook = xlsxwriter.Workbook(filename)

    return workbook


def create_header(worksheet):
    worksheet.write(0, 0, "id")
    worksheet.write(0, 1, "domain")
    worksheet.write(0, 2, "firewall")
    worksheet.write(0, 3, "category")


def settings_merge(workbook):
    merge_format = workbook.add_format({
        'bold': 1,
        'border': 1,
        'align': 'center',
        'valign': 'vcenter'})

    return merge_format

constantstranger · Accepted Answer · 2022-07-13T16:11:09.160

Here is a simple way using pandas to do what I believe you are attempting:

import pandas as pd
input_file = "test.csv"
output_file = "test.xlsx"
df = pd.read_csv(input_file, sep=',', names=['id','something','ordinal'])
print('','input:',df,sep='\n')
df = df.assign(count=0).groupby(list(df.columns)).count().reset_index(2)
print('','ouput:',df,sep='\n')
df.to_excel(output_file)

Explanation:

Use DataFrame.read_csv() to read from the csv file and name the columns 'id','something','ordinal'
Use assign() to add a column named count
Use groupby() and count() to count the number of rows for each unique value of 'id','something','ordinal'
Use reset_index() to move level 2 (ordinal) out of the MultiIndex and back to the columns
Use DataFrame.to_excel() to write the resulting dataframe with counts to the output filename.

Output:

input:
      id something  ordinal
0   1082     hello    first
1   1082     hello    first
2   1082     hello   second
3   1082    yellow    third
4   1082    yellow    third
5   1082    yellow    third
6   1085     hello   fourth
7   1085     hello   fourth
8   1086     hello    fifth
9   1086     hello    fifth
10  1086     hello    fifth
11  1086    yellow    sixth
12  1086    yellow    sixth
13  1086     hello  seventh
14  1086     hello  seventh

ouput:
                ordinal  count
id   something
1082 hello        first      2
     hello       second      1
     yellow       third      3
1085 hello       fourth      2
1086 hello        fifth      3
     hello      seventh      2
     yellow       sixth      2

Contents of text.csv:

1082,hello,first
1082,hello,first
1082,hello,second
1082,yellow,third
1082,yellow,third
1082,yellow,third
1085,hello,fourth
1085,hello,fourth
1086,hello,fifth
1086,hello,fifth
1086,hello,fifth
1086,yellow,sixth
1086,yellow,sixth
1086,hello,seventh
1086,hello,seventh

Contents of test.xlsx:

Thank you, but is it possible in pandas order by max count? For example: 1082 yellow third 3 hello first 2 hello second 1 1085 hello fourth 2 1086 hello fifth 3 hello seventh 2 yellow sixth 2 — Danila0987654, Jul 13 '22 at 17:52
Yes, it's possible, but that was not part of your original question. If you decide to put that query into a new question, feel free to drop a link to it here and I will be happy to take a look. — constantstranger, Jul 13 '22 at 19:42

How count duplicates in one file and output it beautiful Python

1 Answers1