I'm running the following code:
import numpy as np
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
def fbeta_score(precision, recall, beta=1.0):
    # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R);
    # defined as 0 where precision and recall are both 0
    num = (1 + beta**2) * precision * recall
    denom = beta**2 * precision + recall
    return np.divide(num, denom, out=np.zeros_like(num), where=denom > 0)
def accuracy_score(y_true, y_score, thresholds):
    # vectorized accuracy at every threshold at once: predict positive
    # wherever the score clears the threshold (matching scikit-learn's
    # ">= threshold" convention), then average the matches per column
    y_true = np.asarray(y_true)
    y_pred = y_score[:, np.newaxis] >= thresholds
    return (y_pred == y_true[:, np.newaxis]).mean(axis=0)
def logistic_regression(data, features):
    X = data.data[features]
    y = data.target
    # artificial 3-level grouping, used only for stratification
    group = np.tile([1, 2, 3], int(np.ceil(len(y) / 3)))[:len(y)]
    X_train, X_test, y_train, y_test, group_train, group_test = train_test_split(
        X, y, group, test_size=0.2, random_state=0, stratify=group)
    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=0)
    folds = list(cv.split(X_train, group_train))
    metrics = ['fpr', 'tpr', 'thresholds', 'precision', 'fbeta',
               'precision_thresholds', 'accuracy', 'baseline']
    results = {
        'train': {m: [] for m in metrics},
        'val':   {m: [] for m in metrics},
        'test':  {m: [] for m in metrics},
    }
    for train, test in folds:
        xtrain, ytrain = X_train.iloc[train], y_train.iloc[train]
        xval, yval = X_train.iloc[test], y_train.iloc[test]
        logreg = LogisticRegression(max_iter=10000).fit(xtrain, ytrain)
        dummy = DummyClassifier().fit(xtrain, ytrain)
        # evaluate on the fold's training split, its validation split,
        # and the held-out test set
        splits = [('train', xtrain, ytrain),
                  ('val', xval, yval),
                  ('test', X_test, y_test)]
        for ds, x, labels in splits:
            y_scores = logreg.predict_proba(x)[:, 1]
            fpr, tpr, thresholds = roc_curve(labels, y_scores)
            results[ds]['fpr'].append(fpr)
            results[ds]['tpr'].append(tpr)
            results[ds]['thresholds'].append(thresholds)
            precision, recall, precision_thresholds = precision_recall_curve(labels, y_scores)
            results[ds]['precision'].append(precision)
            # precision_recall_curve's last point (precision=1, recall=0)
            # has no matching threshold, so drop it
            results[ds]['fbeta'].append(fbeta_score(precision[:-1], recall[:-1]))
            results[ds]['precision_thresholds'].append(precision_thresholds)
            results[ds]['accuracy'].append(accuracy_score(labels, y_scores, precision_thresholds))
            results[ds]['baseline'].append(dummy.score(x, labels))
    return results
data = load_breast_cancer(as_frame=True)
features1 = ['mean radius', 'mean texture']
features2 = ['mean perimeter', 'mean area']
fs = [features1, features2]
results = [logistic_regression(data, features) for features in fs]
In this example I used the breast cancer data set, but my real data set is much larger and I'm testing many more feature combinations, so the run takes a very long time. Is there a way to optimize this, perhaps with Dask? Or is there another approach I could use to get similar results?
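For concreteness, this is the kind of per-fold parallelization I have in mind: a minimal, untested sketch using dask.delayed, where run_fold is a hypothetical helper standing in for the body of my inner loop (shown here computing only the validation ROC to keep it short):

import dask
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve

# hypothetical helper: one fold's fit-and-evaluate step
def run_fold(X_train, y_train, train, test):
    xtrain, ytrain = X_train.iloc[train], y_train.iloc[train]
    xval, yval = X_train.iloc[test], y_train.iloc[test]
    logreg = LogisticRegression(max_iter=10000).fit(xtrain, ytrain)
    fpr, tpr, thresholds = roc_curve(yval, logreg.predict_proba(xval)[:, 1])
    return {'fpr': fpr, 'tpr': tpr, 'thresholds': thresholds}

# build one lazy task per fold, then compute them all in parallel
tasks = [dask.delayed(run_fold)(X_train, y_train, train, test)
         for train, test in folds]
fold_results = dask.compute(*tasks)

Since the 500 folds (5 splits x 100 repeats) are independent, I assume something like this, or joblib.Parallel over the folds (scikit-learn already depends on joblib), should work, but I don't know whether it's the best approach or whether the whole setup could be restructured to avoid the repeated fitting.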