0
def stability_pairconf(X, est, n_samples, n_iter=20):
labels = []
intersect = []
jaccard = []

for x in range(n_iter):
    boot = resample(X, replace=True, n_samples=n_samples)
    #
    estimator = est.fit(boot.iloc[:,1:]) 
    #
    labels.append(boot['PatientID'])
    #
    labels.append(estimator.labels_)

labels = np.array(labels).transpose()
labels = pd.DataFrame(labels)
  
for col_pos_1 in range(n_iter):
    for col_pos_2 in range(col_pos_1+1, n_iter):
        if col_pos_1 == col_pos_2:
            continue

        intersect = list(set(labels[col_pos_1*2]) & set(labels[col_pos_2*2]))

        run_1_pred = labels[[col_pos_1*2, col_pos_1*2+1]].loc[labels[col_pos_1*2].isin(intersect)]
        run_2_pred = labels[[col_pos_2*2, col_pos_2*2+1]].loc[labels[col_pos_2*2].isin(intersect)]

        run_1_pred = run_1_pred.drop_duplicates().sort_values(by=col_pos_1*2)[col_pos_1*2+1]
        run_2_pred = run_2_pred.drop_duplicates().sort_values(by=col_pos_2*2)[col_pos_2*2+1]

        pairconf = pair_confusion_matrix(run_1_pred, run_2_pred)

        a = pairconf[1,1] # pairs grouped together in both
        b = pairconf[1,0]
        c = pairconf[0,1]
        d = pairconf[0,0] # pairs not grouped together in both

        tmp = a / (a + b + c)          
        jaccard.append(tmp)
      
scores = pd.DataFrame({'Jaccard':jaccard})
    
avgscores = {'Jaccard': np.mean(jaccard)}
    
return scores, avgscores

So run_1_pred and run_2_pred will be of unequal length depending on the boostrapped sample. This leads to issues with contingency. How can I calculate a contingency matrix with two clusterings with unequal clusters?

0 Answers0