10-fold cross validation for a logistic regression in google colab python

Question

`y3_data` is the death variable (0 for alive, 1 for dead). `x3_data` holds my categorical variables, all of which are binary; for example, Diabetes is 0 for yes and 1 for no, and so on. I have around 6 variables in `x3_data` that have a significant p-value with `y3_data`.

I want the output to look like this:
     Training_Acc        Testing_Acc
0    76.0586319218241    75.32467532467533
1    75.2442996742671    74.67532467532467
2    73.9413680781759    81.16883116883116
3    77.72357723577235   71.89542483660131
4    76.7479674796748    73.20261437908496
Av   75.94316887794285   75.25337407690348

this is the code i wrote

    # Dependent variable (outcome): the DEATH column of `data`.
    y3_data = data['DEATH']
    # Independent variables (predictors).
    x3_data = df14
    from sklearn.model_selection import KFold
    from sklearn.model_selection import StratifiedKFold

    from imblearn.under_sampling import RandomUnderSampler

    rus = RandomUnderSampler(random_state=0)
    # NOTE(review): the under-sampled data is never used below -- the split and
    # both models work on the original x3_data / y3_data. Confirm whether the
    # resampled data was meant to be used instead.
    x3_resampled, y3_resampled = rus.fit_resample(x3_data, y3_data)

    # Training and test samples: 30 % of the rows are held out for testing.
    x3_training_data, x3_test_data, y3_training_data, y3_test_data = train_test_split(
        x3_data, y3_data, test_size=0.3
    )

    # Estimation result: fit sm.Logit on the training sample and print its summary.
    logit_model = sm.Logit(y3_training_data, x3_training_data)
    result3 = logit_model.fit()
    print(result3.summary2())

    # Model evaluation on the held-out test sample.
    logreg = LogisticRegression()
    logreg.fit(x3_training_data, y3_training_data)
    # Predict once and reuse the result for every metric below instead of
    # calling logreg.predict() again for each one.
    y_pred = logreg.predict(x3_test_data)
    print('Logistic regression model accuracy:{:.2f}'.format(logreg.score(x3_test_data, y3_test_data)))
    print("Logistic Regression F1 Score :", f1_score(y3_test_data, y_pred, average=None))
    sns.heatmap(confusion_matrix(y3_test_data, y_pred), annot=True, fmt=".0f")
    plt.title("Logistic Regression Confusion Matrix", fontsize=18, color="red");
    
    # 10-fold stratified cross-validation of the logistic regression.
    num_splits = 10
    kfold = StratifiedKFold(num_splits, shuffle=True, random_state=1)
    train_accs, test_accs = [], []  # per-fold accuracies, in percent

    # kfold.split() yields integer *positions* of the rows in each fold.
    # Plain `x3_data[train_index]` on a DataFrame looks those integers up as
    # column labels, which raises
    #   KeyError: "None of [Int64Index([...])] are in the [columns]"
    # so rows must be selected by position with .iloc. The same is done for
    # y3_data so the selection stays positional even if its index is not 0..n-1.
    for train_index, test_index in kfold.split(x3_data, y3_data):
        x3_training_data, x3_test_data = x3_data.iloc[train_index], x3_data.iloc[test_index]
        y3_training_data, y3_test_data = y3_data.iloc[train_index], y3_data.iloc[test_index]

        # Refit the model on this fold's training rows only.
        logreg.fit(x3_training_data, y3_training_data)
        y3_pred_train = logreg.predict(x3_training_data)
        y3_pred_test = logreg.predict(x3_test_data)

        train_accs.append(metrics.accuracy_score(y3_training_data, y3_pred_train) * 100)
        test_accs.append(metrics.accuracy_score(y3_test_data, y3_pred_test) * 100)

    # Report one row per fold, followed by the average over all folds.
    print("\t", "Training_Acc", "\t", "\t", "Testing_Acc")

    for i, (train_acc, test_acc) in enumerate(zip(train_accs, test_accs)):
        print(i, "\t", train_acc, "\t", test_acc)

    ave_train_acc = sum(train_accs) / num_splits
    ave_test_acc = sum(test_accs) / num_splits

    print("Av", "\t", ave_train_acc, "\t", ave_test_acc)

I keep getting an error on this line: `x3_training_data, x3_test_data = x3_data[train_index], x3_data[test_index]`

the error is

"None of [Int64Index([      0,       1,       3,       4,       5,       6,       7,\n                  8,       9,      10,\n ],\n           dtype='int64', length=943717)] are in the [columns]"

please help thank you

this is the full error

---------------------------------------------------------------------------

KeyError                                  Traceback (most recent call last)

<ipython-input-4-bf19b4c5d314> in <module>
     64 train_accs, test_accs = [], []  #create empty lists to store accurcy values
     65 for train_index, test_index in kfold.split(x3_data, y3_data):  #Generate indices to split data into training and test set.
---> 66     x3_training_data, x3_test_data = x3_data[train_index], x3_data[test_index]
     67     y3_training_data, y3_test_data = y3_data[train_index], y3_data[test_index]
     68     logreg.fit(x3_training_data,y3_training_data)

2 frames

/usr/local/lib/python3.8/dist-packages/pandas/core/indexing.py in _validate_read_indexer(self, key, indexer, axis)
   1372                 if use_interval_msg:
   1373                     key = list(key)
-> 1374                 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
   1375 
   1376             not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())

KeyError: "None of [Int64Index([      0,       1,       3,       4,       5,       6,       7,\n                  8,       9,      10,\n            ...\n            1048565, 1048566, 1048567, 1048568, 1048569, 1048570, 1048571,\n            1048572, 1048573, 1048574],\n           dtype='int64', length=943717)] are in the [columns]"

Hi, I posted the full error in the post, thank you. If you have other code that would help with the 10-fold cross-validation, I would appreciate it. — kjnk, Dec 24 '22 at 09:19
A reproducible example with few rows of dataframe would help to look into this issue — Ahsan Nawaz, Dec 25 '22 at 13:02
Try this answer: https://stackoverflow.com/a/64691642/9401054 — Ahsan Nawaz, Dec 25 '22 at 13:04

10-fold cross validation for a logistic regression in google colab python

0 Answers0