How to find the most common combinations of values in 2 or more columns for rows in a CSV file. example:
event,rack,role,dc
network,north,mobile,africa
network,east,mobile,asia
oom,south,desktop,europe
cpu,east,web,northamerica
oom,north,mobile,europe
cpu,south,web,northamerica
cpu,west,web,northamerica
I have tried to create lists for some of the possible combinations I'm looking at, and then use the most_common() method in Collections.Counter to find the common patterns. but I need an algorithm to find common records for any possible combination of 2 or more columns.
My code so far:
import csv
from collections import Counter
class Alert:
def __init__(self, event, rack, role, dc):
self.event = event
self.rack = rack
self.role = role
self.dc = dc
def __str__(self):
return(",".join([self.event, self.rack, self.role, self.dc]))
alerts = []
with open('data.csv', mode='r') as csv_file:
csv_reader = csv.DictReader(csv_file)
for row in csv_reader:
alert = Alert(row['event'], row['rack'], row['role'], row['dc'])
alerts.append(alert)
dcevent= []
dceventrole = []
dcrole = []
dcrolerack = []
for alert in alerts:
dcevent.append(alert.dc + '-' + alert.event)
dceventrole.append(alert.dc+'-'+alert.event+'-'+alert.role)
dcrole.append(alert.dc+'-'+alert.role)
dcrolerack.append(alert.dc+'-'+alert.role+'-'+alert.rack)
masterlist = Counter(dcevent).most_common() + Counter(dceventrole).most_common() + Counter(dcrole).most_common() + Counter(dcrolerack).most_common()
for item in sorted(masterlist, key=lambda x: x[1], reverse=True):
print(item)
This is the output for the records mentioned above:
('northamerica-web-cpu', 3) # there are 3 rows matching the values northamerica,web and cpu
('northamerica-web', 3) # there are 3 rows matching just the values northamerica and web
('northamerica-cpu', 3) # there are 3 rows matching northamerica and cpu
('europe-oom', 2) # there are 2 rows matching europe and oom
('africa-mobile-network', 1)
('asia-mobile-network', 1)
('europe-desktop-oom', 1)
('europe-mobile-oom', 1)
('africa-mobile-north', 1)
('asia-mobile-east', 1)
('europe-desktop-south', 1)
('northamerica-web-east', 1)
('europe-mobile-north', 1)
('northamerica-web-south', 1)
('northamerica-web-west', 1)
('africa-mobile', 1)
('asia-mobile', 1)
('europe-desktop', 1)
('europe-mobile', 1)
('africa-network', 1)
('asia-network', 1)