I've been trying to implement the Apriori algorithm on this data. I was able to get the associations and the confidence for both the pairs and the triples, but I'm having trouble formatting the output and extracting the correct elements.
I ran the algorithm on some test data (just a subset of the original dataset). Currently the output looks like this:
[[frozenset({'GRO73461'}), frozenset({'ELE17451'}), 1.0],
 [frozenset({'GRO99222'}), frozenset({'ELE17451'}), 0.8125],
 [frozenset({'DAI22896'}), frozenset({'GRO73461', 'ELE17451'}), 0.8],
 [frozenset({'GRO73461'}), frozenset({'DAI22896', 'ELE17451'}), 0.8],
 [frozenset({'ELE17451'}), frozenset({'GRO99222'}), 0.5],
 [frozenset({'ELE17451'}), frozenset({'GRO73461'}), 0.38461538461538464]]
As you can see, it's kind of a mess. The list is ordered by confidence in descending order. I want to separate the frequent pairs from the frequent triples and arrange the output so that it looks like this:
OUTPUT A
FRO11987 FRO12685 0.4325
FRO11987 ELE11375 0.4225
FRO11987 GRO94758 0.4125
FRO11987 SNA80192 0.4025
FRO11987 FRO18919 0.4015
OUTPUT B
FRO11987 FRO12685 DAI95741 0.4325
FRO11987 ELE11375 GRO73461 0.4225
FRO11987 GRO94758 ELE26917 0.4125
FRO11987 SNA80192 ELE28189 0.4025
FRO11987 FRO18919 GRO68850 0.4015
The above shows the top 5 frequent pairs and the top 5 frequent triples, ranked by confidence.
The main thing I'm struggling with is telling the frequent pairs apart from the frequent triples, and then extracting the items from the frozensets so they end up in the format above.
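For a single rule I think the extraction should look something like this (just a sketch, where rule stands for one (antecedent, consequent, confidence) tuple as produced by my generateRules below):

antecedent, consequent, confidence = rule
items = sorted(antecedent) + sorted(consequent)  # flatten both frozensets into plain item strings
print(' '.join(items), round(confidence, 4))

but I don't know the right way to tell a pair rule from a triple rule. Here is my code: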
import pandas as pd
from operator import itemgetter

def loadDataSet(data=None):
    # note: error_bad_lines was deprecated in newer pandas; on_bad_lines='skip' replaces it there
    return pd.read_csv(data, sep=' ', error_bad_lines=False)
def createCandidateSet(data):
    # build the initial 1-item candidate sets (C1)
    C1 = []
    for transaction in data:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))
def scanData(dataset, Ck, support):
    # count how many transactions contain each candidate itemset
    ssCount = {}
    for tID in dataset:
        for candidate in Ck:
            if candidate.issubset(tID):
                if candidate not in ssCount:
                    ssCount[candidate] = 1
                else:
                    ssCount[candidate] += 1
    res = []
    supportData = {}
    for key in ssCount:
        # support here is an absolute count; divide by len(dataset) for a proportion instead
        currSupport = ssCount[key]
        if currSupport >= support:
            res.insert(0, key)
        supportData[key] = currSupport
    return res, supportData
def aprioriHelper(Lk, k):  # creates candidate itemsets of size k
    res = []
    freqItemLen = len(Lk)
    for i in range(freqItemLen):
        for j in range(i + 1, freqItemLen):
            # merge two (k-1)-itemsets only if their first k-2 items match,
            # which guarantees the union has exactly k items
            L1 = list(Lk[i])[:k - 2]
            L2 = list(Lk[j])[:k - 2]
            L1.sort()
            L2.sort()
            if L1 == L2:
                res.append(Lk[i] | Lk[j])
    return res
def apriori(dataset, minSupport=100):
    C1 = createCandidateSet(dataset)
    D = list(map(set, dataset))
    L1, supportData = scanData(D, C1, minSupport)
    L = [L1]
    k = 2
    while len(L[k - 2]) > 0:
        Ck = aprioriHelper(L[k - 2], k)
        Lk, supportK = scanData(D, Ck, minSupport)  # scan dataset to get frequent itemsets, now the itemsets are bigger
        supportData.update(supportK)
        L.append(Lk)
        k += 1
    return L, supportData
def generateRules(L, supportData, conf=0.7):  # supportData maps each itemset to its support, comes from scanData
    rules = []  # list of (antecedent, consequent, confidence) tuples
    for i in range(1, len(L)):  # only itemsets with >= 2 items can form rules
        for freq in L[i]:
            association = [frozenset([item]) for item in freq]
            if i > 1:
                rulesFromConsequences(freq, association, supportData, rules, conf)
            else:
                calculateConfidence(freq, association, supportData, rules, conf)
    return rules
def calculateConfidence(freq, association, supportData, rules, conf=0.7):
    filteredAssociations = []
    for consequence in association:
        # confidence(I -> J) = support(I U J) / support(I)
        confidence = supportData[freq] / supportData[freq - consequence]
        if confidence >= conf:
            rules.append((freq - consequence, consequence, confidence))
            filteredAssociations.append(consequence)
    return filteredAssociations
def rulesFromConsequences(freq, association, supportData, rules, conf=0.7):
    # generate further rules as the frequent itemsets get larger
    a_len = len(association[0])
    if len(freq) > (a_len + 1):  # room to grow the consequent and still keep a non-empty antecedent
        association_p1 = aprioriHelper(association, a_len + 1)  # candidate consequents one item larger
        association_p1 = calculateConfidence(freq, association_p1, supportData, rules, conf)
        if len(association_p1) > 1:  # need at least two surviving consequents to merge again
            rulesFromConsequences(freq, association_p1, supportData, rules, conf)
def main():
    dataset = [line.split() for line in open('datatest.txt')]
    L, supportData = apriori(dataset, minSupport=8)
    rules = generateRules(L, supportData, conf=0)
    rules = sorted(rules, key=itemgetter(2), reverse=True)  # sort by confidence, descending
    triples = []
    doubles = []
    i = 0
    while len(triples) < 5 and i < len(rules):
        # a rule came from a triple if antecedent and consequent total 3 items
        if len(rules[i][0]) + len(rules[i][1]) == 3:
            triples.append(rules[i])
        i += 1
    j = 0
    while len(doubles) < 5 and j < len(rules):
        # ...and from a pair if they total 2 items
        if len(rules[j][0]) + len(rules[j][1]) == 2:
            doubles.append(rules[j])
        j += 1

if __name__ == '__main__':
    main()
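And in case it helps, this is roughly the printing step I have in mind once doubles and triples are filled correctly (a sketch, assuming both lists hold (antecedent, consequent, confidence) tuples as above):

print('OUTPUT A')
for antecedent, consequent, confidence in doubles:
    print(' '.join(sorted(antecedent) + sorted(consequent)), confidence)
print('OUTPUT B')
for antecedent, consequent, confidence in triples:
    print(' '.join(sorted(antecedent) + sorted(consequent)), confidence)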
Any advice on the issue is appreciated. If you have any questions about the code or my thought process, please let me know. Apologies in advance for any careless mistakes.
Thank you for reading.