I have 16 CSV files, each containing around 11,250 rows with 19 features and one column for labels. I want to implement Leave-One-Group-Out cross-validation for feature selection algorithms such as Sequential Forward Selection and Mutual Information, but I don't know how to implement this cross-validation technique. My code is below:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.linear_model import LogisticRegression
import pandas as pd
import matplotlib.pyplot as plt
import glob
#################
# Reading csv files
#################
print("Reading multiple csv files")
print("")
csv_files = glob.iglob('D:/Project/csvfiles/*')
frames = []
print("Merging all the csv files")
print("")
# append all files together; DataFrame.append was removed in pandas 2.0, so collect and concat
for file in csv_files:
    df_temp = pd.read_csv(file)
    frames.append(df_temp)
dataframe = pd.concat(frames, ignore_index=True)
print("Merging complete")
print("")
# dataframe.to_csv("Dataset.csv")
X = dataframe.drop(['Unnamed: 0', 'Labels'], axis=1)
y = dataframe['Labels']
#############################
# Class for Feature Selection
#############################
class FeatureSelection:
    def __init__(self, dataframe, target):
        self.dataframe = dataframe
        self.target = target

    ###########################
    # Normalizing the dataframe
    ###########################
    def normalization(self):
        print("Performing Normalization")
        print("")
        # Min-max scale every feature column to the [0, 1] range
        for column in self.dataframe.columns:
            self.dataframe[column] = (self.dataframe[column] - self.dataframe[column].min()) / (self.dataframe[column].max() - self.dataframe[column].min())
        print("Normalization Completed")
        print("")
    ##############################
    # Sequential Forward Selection
    ##############################
    def sequential_forward_selection(self):
        print("Performing Sequential Forward Selection")
        print("")
        sfs = SFS(rfc(n_jobs=-1),
                  k_features='best',
                  forward=True,
                  floating=False,
                  verbose=2,
                  scoring='accuracy',  # any sklearn classification scoring string
                  cv=5)
        sfs = sfs.fit(self.dataframe, self.target)
        print('Dictionary: ', sfs.get_metric_dict())
        print('')
        from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
        fig1 = plot_sfs(sfs.get_metric_dict(confidence_interval=0.95), kind='std_err')
        plt.title('Sequential Forward Selection (with normalization)')
        plt.grid()
        plt.show()
        print('Best Features {} with Index number {}:'.format(sfs.k_feature_names_, sfs.k_feature_idx_))
        print('')
        df = pd.DataFrame.from_dict(sfs.get_metric_dict()).T
        print(df[["feature_idx", "avg_score"]])
        print("Sequential Forward Selection completed")
        print("")
    ####################
    # Mutual Information
    ####################
    def mutual_information(self):
        print("Performing Mutual Information")
        print("")
        mic = SelectKBest(score_func=mutual_info_classif, k=15)
        mic.fit(self.dataframe, self.target)
        feature_MI_score = pd.Series(mic.scores_, index=self.dataframe.columns)
        print(feature_MI_score.sort_values(ascending=False))
        feature_MI_score.sort_values(ascending=False).plot.bar(figsize=(10, 8))
        plt.show()
        print("")
        print("Mutual Information completed")
        print("")
if __name__ == "__main__":
    featureselection = FeatureSelection(X, y)
    featureselection.normalization()
    featureselection.sequential_forward_selection()
    featureselection.mutual_information()
My task is to use Leave-One-Group-Out cross-validation: take one file's data as the test set and the remaining 15 files as training data, then rotate until every file has been the test set exactly once, and find the best features from that.
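
From reading the scikit-learn docs, I think each CSV file would define one group, so the group labels could be built while the files are merged, and sklearn's LeaveOneGroupOut would then yield the 16 splits. Here is a rough sketch of what I have in mind (the names frames, group_id, and cv_splits are mine; as far as I can tell, mlxtend's SFS passes cv through to sklearn's cross_val_score, which accepts a pre-computed list of (train, test) index pairs):

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.model_selection import LeaveOneGroupOut
import numpy as np
import pandas as pd
import glob

# Build one group id per row while merging, so each of the 16 files is its own group
frames = []
groups = []
for group_id, file in enumerate(sorted(glob.glob('D:/Project/csvfiles/*'))):
    df_temp = pd.read_csv(file)
    frames.append(df_temp)
    groups.extend([group_id] * len(df_temp))  # same id for every row of this file
dataframe = pd.concat(frames, ignore_index=True)
groups = np.array(groups)

X = dataframe.drop(['Unnamed: 0', 'Labels'], axis=1)
y = dataframe['Labels']

# LeaveOneGroupOut yields 16 splits: each file is the test set exactly once
logo = LeaveOneGroupOut()
cv_splits = list(logo.split(X, y, groups))

sfs = SFS(rfc(n_jobs=-1),
          k_features='best',
          forward=True,
          floating=False,
          scoring='accuracy',
          cv=cv_splits)  # 16 leave-one-file-out splits instead of plain 5-fold
sfs = sfs.fit(X, y)
print(sfs.k_feature_names_)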
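
For mutual information there is no estimator to cross-validate, since SelectKBest is a filter method, so my understanding is that I would compute the scores on each training fold only and average them over the 16 folds, roughly like this (reusing cv_splits and the arrays from the sketch above):

from sklearn.feature_selection import mutual_info_classif

# Score features on each training fold only, then average across the 16 folds
mi_scores = np.zeros(X.shape[1])
for train_idx, test_idx in cv_splits:
    mi_scores += mutual_info_classif(X.iloc[train_idx], y.iloc[train_idx])
mi_scores /= len(cv_splits)
print(pd.Series(mi_scores, index=X.columns).sort_values(ascending=False))

Is this the right way to wire the groups into SFS and the mutual information ranking? Any help is appreciated.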