
I want to generate a Precision-Recall curve with 5-fold cross-validation, showing the standard deviation as in the example ROC curve code here.

The code below (adapted from How to Plot PR-Curve Over 10 folds of Cross Validation in Scikit-Learn) gives a PR curve for each fold of cross-validation along with the mean PR curve. I also wanted to show the region of one standard deviation above and below the mean PR curve in grey, but it gives the following error (details in the link below the code):

ValueError: operands could not be broadcast together with shapes (91,) (78,)

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC


X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
    random_state=10)

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []

precisions, recalls = [], []

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision, recall, _ = precision_recall_curve(ytest, pred_proba[:,1])
    lab = 'Fold %d AUC=%.4f' % (i+1, auc(recall, precision))
    plt.plot(recall, precision, alpha=0.3, label=lab)
    y_real.append(ytest)
    y_proba.append(pred_proba[:,1])
    precisions.append(precision)
    recalls.append(recall)

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))

plt.plot(recall, precision, lw=2, color='red', label=lab)

std_precision = np.std(precisions, axis=0)
precision_upper = np.minimum(np.mean(precisions, axis=0) + std_precision, 1)
precision_lower = np.maximum(np.mean(precisions, axis=0) - std_precision, 0)
plt.fill_between(recall, precision_upper, precision_lower, alpha=0.5, linewidth=0, color='grey')

Error reported and Plot generated
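
For reference, a quick check (this snippet is not part of the original code) shows where the mismatched shapes come from: precision_recall_curve returns one point per distinct probability threshold, so each fold produces precision/recall arrays of a different length, and they cannot be stacked element-wise with np.std(precisions, axis=0).

from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

# Count the PR points per fold; the counts differ from fold to fold, so the
# per-fold precision arrays cannot be averaged element-wise without first
# interpolating them onto a common recall grid.
for i, (train_index, test_index) in enumerate(KFold(n_splits=5, shuffle=True, random_state=10).split(X)):
    predictor.fit(X[train_index], y[train_index])
    pred_proba = predictor.predict_proba(X[test_index])
    precision, recall, _ = precision_recall_curve(y[test_index], pred_proba[:, 1])
    print('Fold %d: %d PR points' % (i + 1, len(precision)))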

Can you please suggest how to extend the code above to also show one standard deviation around the mean PR curve?

user1886130

2 Answers


I have got a working solution, but it would be helpful if anybody could comment on whether it is doing the right thing.

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC
from numpy import interp

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0,
    random_state=10)

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []

precision_array = []
threshold_array=[]
recall_array = np.linspace(0, 1, 100)

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, thresh = precision_recall_curve(ytest, pred_proba[:,1])
    precision_fold, recall_fold, thresh = precision_fold[::-1], recall_fold[::-1], thresh[::-1]  # reverse order of results
    thresh = np.insert(thresh, 0, 1.0)
    precision_array = interp(recall_array, recall_fold, precision_fold)
    threshold_array = interp(recall_array, recall_fold, thresh)
    pr_auc = auc(recall_array, precision_array)

    lab_fold = 'Fold %d AUC=%.4f' % (i+1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:,1])

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUC=%.4f' % (auc(recall, precision))

plt.plot(recall, precision, lw=2, color='red', label=lab)

plt.legend(loc='lower left', fontsize='small')

mean_precision = np.mean(precision_array)
std_precision = np.std(precision_array)
plt.fill_between(recall, precision + std_precision, precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.show()

PR diagram generated by the code above showing standard deviation

Dharman
user1886130
  • Hi @user1886130, I don't think this is doing what was intended, because the variable precision_array is being overwritten at every iteration of the for loop. So, when you compute the mean and std at the end, you are only computing them over the last iteration – Tommaso Di Noto Jun 14 '21 at 09:05
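
To see the consequence concretely, here is a small illustration (hypothetical values, not from the thread): after the loop, precision_array holds only the last fold's 100 interpolated values, so np.std(precision_array) collapses to a single scalar and the grey band has constant width instead of varying along the recall axis.

import numpy as np

# Hypothetical stand-in for what precision_array contains after the loop above:
# only the LAST fold's interpolated precision values, not one row per fold.
last_fold_precision = np.linspace(1.0, 0.6, 100)
print(np.std(last_fold_precision))        # a single scalar -> constant-width band

# What the shaded region should be based on instead: an (n_folds, 100) stack,
# where np.std(..., axis=0) gives a point-wise spread across folds.
stacked = np.stack([np.linspace(1.0, p, 100) for p in (0.55, 0.6, 0.65)])
print(np.std(stacked, axis=0).shape)      # (100,)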

Although it is a step in the right direction, the answer from @user1886130 is not entirely correct, since the variable precision_array is overwritten at each iteration inside the loop.

A cleaner and correct version is:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.metrics import precision_recall_curve, auc
from sklearn.model_selection import KFold
from sklearn.svm import SVC

X, y = make_blobs(n_samples=500, n_features=2, centers=2, cluster_std=10.0, random_state=10)

k_fold = KFold(n_splits=5, shuffle=True, random_state=10)
predictor = SVC(kernel='linear', C=1.0, probability=True, random_state=10)

y_real = []
y_proba = []
precision_array = []
recall_array = np.linspace(0, 1, 100)

for i, (train_index, test_index) in enumerate(k_fold.split(X)):
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    predictor.fit(Xtrain, ytrain)
    pred_proba = predictor.predict_proba(Xtest)
    precision_fold, recall_fold, _ = precision_recall_curve(ytest, pred_proba[:, 1])
    precision_fold, recall_fold = precision_fold[::-1], recall_fold[::-1]  # reverse so recall is increasing (required by np.interp)
    prec_array = np.interp(recall_array, recall_fold, precision_fold)  # interpolate onto the common recall grid
    pr_auc = auc(recall_array, prec_array)
    precision_array.append(prec_array)  # keep this fold's curve instead of overwriting it

    lab_fold = 'Fold %d AUPR=%.4f' % (i+1, pr_auc)
    plt.plot(recall_fold, precision_fold, alpha=0.3, label=lab_fold)
    y_real.append(ytest)
    y_proba.append(pred_proba[:, 1])

y_real = np.concatenate(y_real)
y_proba = np.concatenate(y_proba)
precision, recall, _ = precision_recall_curve(y_real, y_proba)
lab = 'Overall AUPR=%.4f' % (auc(recall, precision))

plt.plot(recall, precision, lw=2, color='red', label=lab)

plt.legend(loc='lower left', fontsize='small')

mean_precision = np.mean(precision_array, axis=0)
std_precision = np.std(precision_array, axis=0)
plt.fill_between(recall_array, mean_precision + std_precision, mean_precision - std_precision, alpha=0.3, linewidth=0, color='grey')
plt.title("PR curves; {} folds".format(k_fold.n_splits), weight="bold", fontsize=15)
plt.xlabel("Recall (Sensitivity)", fontsize=12)
plt.ylabel("Precision (PPV)", fontsize=12)
plt.show()

Plot generated by the code above: PR curves for each fold, the overall PR curve in red, and the grey band showing one standard deviation around the mean of the interpolated fold curves.
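
One optional addition (not part of the original answer): the grey band is centred on mean_precision, but only the pooled overall curve is drawn in red, so it can help to also plot the mean of the interpolated per-fold curves just before plt.show(), for example:

# Draw the mean of the per-fold interpolated curves so the shaded band is
# centred on a visible line; reuses recall_array and mean_precision from above.
plt.plot(recall_array, mean_precision, color='black', linestyle='--',
         label='Mean over folds AUPR=%.4f' % auc(recall_array, mean_precision))
plt.legend(loc='lower left', fontsize='small')   # refresh the legend to include the new line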

Tommaso Di Noto