0

I'm running the following code:

import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.model_selection import RepeatedStratifiedKFold, train_test_split
from sklearn.metrics import roc_curve, precision_recall_curve
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer

def fbeta_score(precision, recall, beta=1.0):
    """Return the F-beta score for the given precision and recall.

    Computes ``(1 + beta**2) * P * R / (beta**2 * P + R)``. The inputs may
    be scalars or NumPy arrays; with arrays the score is element-wise.

    Args:
        precision: Precision value(s).
        recall: Recall value(s).
        beta: Weight of recall relative to precision; ``beta=1`` gives the
            F1 score.

    Returns:
        The F-beta score, with the same shape as the broadcast inputs.

    Note:
        Where precision and recall are both zero the denominator is zero:
        plain Python floats raise ``ZeroDivisionError`` and NumPy arrays
        yield ``nan`` for those entries.
    """
    # The definition weights precision by beta squared in the denominator,
    # not by beta itself; the two only coincide when beta == 1.
    beta_sq = beta ** 2
    return (1 + beta_sq) * (precision * recall) / (beta_sq * precision + recall)

def accuracy_score(y_true, y_score, thresholds):
    """Return the classification accuracy at each decision threshold.

    A sample is predicted positive when its score is greater than or equal
    to the threshold, matching the convention used by scikit-learn's
    ``precision_recall_curve`` and ``roc_curve``.

    Args:
        y_true: True binary labels (0/1), as a pandas Series or any
            array-like of length ``n``.
        y_score: Predicted scores/probabilities, array-like of length ``n``.
        thresholds: 1-D array-like of decision thresholds.

    Returns:
        A 1-D float array with one accuracy value per threshold.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)
    thresholds = np.asarray(thresholds)

    # Broadcast to (n_thresholds, n_samples): row t holds the predictions
    # made at thresholds[t]. An explicit >= comparison avoids np.round's
    # round-half-to-even behaviour, which would classify a score exactly
    # equal to the threshold as negative.
    y_pred = y_score[np.newaxis, :] >= thresholds[:, np.newaxis]
    return (y_pred == y_true).mean(axis=1)

def logistic_regression(data, features):
    """Fit and evaluate a logistic regression over repeated stratified folds.

    The data is split 80/20 into a training and a held-out test part. The
    training part is then split by ``RepeatedStratifiedKFold`` (5 splits,
    100 repeats). For every fold a ``LogisticRegression`` and a
    ``DummyClassifier`` baseline are fitted on the fold's training rows and
    evaluated on the fold's training rows, its validation rows, and the
    held-out test set.

    Args:
        data: Object exposing ``data`` (indexable by a list of column names)
            and ``target``; in this script, the result of
            ``load_breast_cancer(as_frame=True)``.
        features: List of column names to use as predictors.

    Returns:
        A dict with keys ``'train'``, ``'val'`` and ``'test'``. Each maps
        metric names (``'fpr'``, ``'tpr'``, ``'thresholds'``,
        ``'precision'``, ``'fbeta'``, ``'precision_thresholds'``,
        ``'accuracy'``, ``'baseline'``) to a list with one entry per fold.
    """
    X = data.data[features]
    y = data.target

    # Cyclic group labels 1, 2, 3, 1, ... used only for stratification.
    # Sized from the data so the function works for any number of rows.
    group = np.arange(len(X)) % 3 + 1

    X_train, X_test, y_train, y_test, group_train, _ = train_test_split(
        X, y, group, test_size=0.2, random_state=0, stratify=group
    )

    cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=100, random_state=0)

    metrics = ['fpr', 'tpr', 'thresholds', 'precision', 'fbeta',
               'precision_thresholds', 'accuracy', 'baseline']
    results = {
        'train': {m: [] for m in metrics},
        'val': {m: [] for m in metrics},
        'test': {m: [] for m in metrics},
    }

    for train_idx, val_idx in cv.split(X_train, group_train):
        xtrain = X_train.iloc[train_idx, :]
        ytrain = y_train.iloc[train_idx]
        xval = X_train.iloc[val_idx, :]
        yval = y_train.iloc[val_idx]

        logreg = LogisticRegression(max_iter=10000).fit(xtrain, ytrain)
        dummy = DummyClassifier().fit(xtrain, ytrain)

        # Pair each result bucket with its data explicitly instead of
        # relying on dict order lining up with positional lists.
        splits = {
            'train': (xtrain, ytrain),
            'val': (xval, yval),
            'test': (X_test, y_test),
        }

        for ds, (features_ds, labels) in splits.items():
            bucket = results[ds]
            y_preds = logreg.predict_proba(features_ds)[:, 1]

            fpr, tpr, thresholds = roc_curve(labels, y_preds)
            bucket['fpr'].append(fpr)
            bucket['tpr'].append(tpr)
            bucket['thresholds'].append(thresholds)

            precision, recall, precision_thresholds = precision_recall_curve(labels, y_preds)
            bucket['precision'].append(precision)
            # precision/recall carry one trailing element with no matching
            # threshold; drop it so fbeta aligns with precision_thresholds.
            bucket['fbeta'].append(fbeta_score(precision[:-1], recall[:-1]))
            bucket['precision_thresholds'].append(precision_thresholds)
            bucket['accuracy'].append(accuracy_score(labels, y_preds, precision_thresholds))

            bucket['baseline'].append(dummy.score(features_ds, labels))

    return results

# Feature combinations to evaluate; each entry is one set of predictor columns.
fs = [
    ['mean radius', 'mean texture'],
    ['mean perimeter', 'mean area'],
]
features1, features2 = fs

data = load_breast_cancer(as_frame=True)

# Run the full cross-validated evaluation once per feature combination,
# collecting the per-combination result dicts in order.
results = []
for features in fs:
    results += [logistic_regression(data, features)]

In this example I used the breast cancer data set, but the data set I'm actually working with is much larger and I'm evaluating many more feature combinations, so running it takes a very long time. Is there a way I can optimize this, perhaps using Dask? Or is there another approach I can use to get similar results?

Mattravel
  • 1,358
  • 1
  • 15
Melanie
  • 13
  • 5

0 Answers0