I've been trying to develop an Apriori algorithm using this data. I was able to get the associations and the confidence for both the pairs and the triples, but I'm having trouble formatting the output and extracting the correct elements.

I ran the algorithm on this test data; it's just a subset of the original dataset. Currently the output looks like this:

[[frozenset({'GRO73461'}), frozenset({'ELE17451'}), 1.0],
 [frozenset({'GRO99222'}), frozenset({'ELE17451'}), 0.8125],
 [frozenset({'DAI22896'}), frozenset({'GRO73461', 'ELE17451'}), 0.8],
 [frozenset({'GRO73461'}), frozenset({'DAI22896', 'ELE17451'}), 0.8],
 [frozenset({'ELE17451'}), frozenset({'GRO99222'}), 0.5],
 [frozenset({'ELE17451'}), frozenset({'GRO73461'}), 0.38461538461538464]]

As you can see, it's kind of a mess. The list is ordered by confidence in descending order. I want to separate the frequent pairs from the frequent triples and arrange the output so that it looks like this:

OUTPUT A
FRO11987 FRO12685 0.4325
FRO11987 ELE11375 0.4225
FRO11987 GRO94758 0.4125
FRO11987 SNA80192 0.4025
FRO11987 FRO18919 0.4015
OUTPUT B
FRO11987 FRO12685 DAI95741 0.4325
FRO11987 ELE11375 GRO73461 0.4225
FRO11987 GRO94758 ELE26917 0.4125
FRO11987 SNA80192 ELE28189 0.4025
FRO11987 FRO18919 GRO68850 0.4015

Where the above shows the top 5 frequent pairs and the top 5 frequent triples, ranked by confidence.

The main area I'm having trouble with is distinguishing between the frequent pairs and triples, and then extracting the items from the frozensets so that they match the format above.
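
To make the target concrete, here is a minimal sketch of the kind of unpacking I have in mind (it assumes each rule is an (antecedent, consequent, confidence) tuple, which is what my generateRules below produces; formatRule is just an illustrative name):

def formatRule(rule):
    antecedent, consequent, confidence = rule
    items = sorted(antecedent) + sorted(consequent) #frozensets are unordered, so sort before flattening
    return '{} {}'.format(' '.join(items), confidence)

#a rule spans len(antecedent) + len(consequent) items in total:
#2 means it came from a frequent pair, 3 from a frequent triple

Here is my full code: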

import pandas as pd
from operator import itemgetter

def loadDataSet(data=None):
    #on_bad_lines='skip' replaces the deprecated error_bad_lines=False in newer pandas
    return pd.read_csv(data, sep=' ', on_bad_lines='skip')

def createCandidateSet(data):
    C1 = []
    for transaction in data:
        for item in transaction:
            if [item] not in C1:
                C1.append([item])
    C1.sort()
    return list(map(frozenset, C1))

def scanData(dataset, Ck, support):
    ssCount = {}

    for tID in dataset:
        for candidate in Ck:
            if candidate.issubset(tID):
                if candidate not in ssCount:
                    ssCount[candidate] = 1
                else:
                    ssCount[candidate] += 1
#    numItems = float(len(dataset))
    res = []
    supportData = {}
    for key in ssCount:
        #support is either a proportion or an integer: the occurrence of the itemset in relation to the dataset
#        currSupport = ssCount[key]/numItems
        currSupport = ssCount[key]
        if currSupport >= support:
            res.insert(0, key)
        supportData[key] = currSupport
    return res, supportData

def aprioriHelper(Lk, k): #creates candidate itemsets
    res = []
    freqItemLen = len(Lk)
    for i in range(freqItemLen):
        for j in range(i+1, freqItemLen):
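            #merge only when the first k-2 items match; this prefix test
            #ensures each k-itemset candidate is generated exactly once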
            L1 = list(Lk[i])[:k-2]
            L2 = list(Lk[j])[:k-2]
            L1.sort()
            L2.sort()
            if L1 == L2:
                res.append(Lk[i] | Lk[j])
    return res

def apriori(dataset, minSupport=100):
    C1 = createCandidateSet(dataset)
    D = list(map(set, dataset))
    L1, supportData = scanData(D, C1, minSupport)
    L = [L1]
    k = 2
    while (len(L[k-2]) > 0):
        Ck = aprioriHelper(L[k-2], k)
        Lk, supportK = scanData(D, Ck, minSupport) #scan dataset for frequent itemsets; they grow by one item each pass
        supportData.update(supportK)
        L.append(Lk)
        k+=1

    return L, supportData

def generateRules(L, supportData, conf=0.7): #supportData maps each itemset to its support count, from scanData
    rules = [] #holds tuples of (antecedent, consequent, confidence)
    for i in range(1, len(L)): #get itemsets with number of items >=2
        for freq in L[i]:
            association = [frozenset([item]) for item in freq]
            if i > 1:
                rulesFromConsequences(freq, association, supportData, rules, conf)
            else:
                calculateConfidence(freq, association, supportData, rules, conf)
    return rules

def calculateConfidence(freq, association, supportData, rules, conf=0.7):
    filteredAssociations = []
    for consequence in association:
        #confidence(I -> J) = Support(I U J)/Support(I)
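        #e.g. support({I, J}) = 8 and support({I}) = 10 gives confidence(I -> J) = 8/10 = 0.8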
        confidence = supportData[freq]/supportData[freq - consequence] #calculate confidence
        if confidence >= conf:
#            print(freq-consequence, ' ', consequence, ' ', confidence) #print out association rule and confidence
            rules.append((freq-consequence, consequence, confidence))
            filteredAssociations.append(consequence)
    return filteredAssociations

def rulesFromConsequences(freq, association, supportData, rules, conf=0.7):
    #generate more rules as the frequent itemsets grow larger
    a_len = len(association[0])
    if (len(freq) > (a_len+1)): #the itemset is large enough to move more items into the consequent
        association_p1 = aprioriHelper(association, a_len+1) #merge consequents into candidates of size a_len+1
        association_p1 = calculateConfidence(freq, association_p1, supportData, rules, conf)
        if len(association_p1) > 1: #need at least two surviving consequents in order to merge
            rulesFromConsequences(freq, association_p1, supportData, rules, conf) #recurse to build bigger consequents and get more rules


def main():
    dataset = [line.split() for line in open('datatest.txt')]
    L, supportData = apriori(dataset, minSupport=8)

    rules = generateRules(L, supportData, conf=0)
    rules = sorted(rules, key = itemgetter(2), reverse=True)
    triples = []
    doubles = []
    i = 0
    while len(triples) < 5:
        if i == len(rules):
            break
        if len(rules[i][1]) == 2: #2-item consequent: in this code, these rules come from triples
            triples.append(rules[i])
        i += 1

    j = 0
    while len(doubles) < 5:
        if j == len(rules):
            break
        if len(rules[j][1]) == 1: #1-item consequent: these rules come from pairs
            doubles.append(rules[j])
        j += 1

if __name__ == '__main__':
    main()
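
For what it's worth, this is roughly the printing step I picture at the end of main (just a sketch; it reuses the doubles and triples lists built above and unpacks each frozenset by sorting it):

    print('OUTPUT A')
    for antecedent, consequent, confidence in doubles:
        print(' '.join(sorted(antecedent) + sorted(consequent)), confidence)
    print('OUTPUT B')
    for antecedent, consequent, confidence in triples:
        print(' '.join(sorted(antecedent) + sorted(consequent)), confidence)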

Any advice on the issue is appreciated. If you have any questions about the code or my thought process, please let me know. Apologies in advance for any careless mistakes.

Thank you for reading

Srikar Murali