def stability_pairconf(X, est, n_samples, n_iter=20):
labels = []
intersect = []
jaccard = []
for x in range(n_iter):
boot = resample(X, replace=True, n_samples=n_samples)
#
estimator = est.fit(boot.iloc[:,1:])
#
labels.append(boot['PatientID'])
#
labels.append(estimator.labels_)
labels = np.array(labels).transpose()
labels = pd.DataFrame(labels)
for col_pos_1 in range(n_iter):
for col_pos_2 in range(col_pos_1+1, n_iter):
if col_pos_1 == col_pos_2:
continue
intersect = list(set(labels[col_pos_1*2]) & set(labels[col_pos_2*2]))
run_1_pred = labels[[col_pos_1*2, col_pos_1*2+1]].loc[labels[col_pos_1*2].isin(intersect)]
run_2_pred = labels[[col_pos_2*2, col_pos_2*2+1]].loc[labels[col_pos_2*2].isin(intersect)]
run_1_pred = run_1_pred.drop_duplicates().sort_values(by=col_pos_1*2)[col_pos_1*2+1]
run_2_pred = run_2_pred.drop_duplicates().sort_values(by=col_pos_2*2)[col_pos_2*2+1]
pairconf = pair_confusion_matrix(run_1_pred, run_2_pred)
a = pairconf[1,1] # pairs grouped together in both
b = pairconf[1,0]
c = pairconf[0,1]
d = pairconf[0,0] # pairs not grouped together in both
tmp = a / (a + b + c)
jaccard.append(tmp)
scores = pd.DataFrame({'Jaccard':jaccard})
avgscores = {'Jaccard': np.mean(jaccard)}
return scores, avgscores
So run_1_pred
and run_2_pred
will be of unequal length depending on the boostrapped sample. This leads to issues with contingency. How can I calculate a contingency matrix with two clusterings with unequal clusters?