i want to use principal component analysis-mutual information (PCA-MI) to have data representation from source which has source relevance of (value from smartinsole) and ouput variable (value from force plate). PCA was used to determine the principal component of Ni provided that the cumulative variance is greater than 98% of the source information measured from 89 insole sensors. MI is generally used in the selection of input variables for predictive models because it is a good indicator of the relationship between input variables and output variables. here I want to get results like a flowchart as below
then I try to make code like below. but I can't generate like what's in the flowchart
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
# load the dataset
def load_dataset(filename):
# load the dataset as a pandas DataFrame
data = read_csv(filename, header=None)
# retrieve numpy array
dataset = data.values
y = dataset
return y
def load_dataset2(filename):
# load the dataset as a pandas DataFrame
data2 = read_csv(filename, header=None)
# retrieve numpy array
dataset2 = data2.values
X = dataset2
return X
# feature selection
def select_features(X_train, y_train, X_test):
# configure to select a subset of features
fs = SelectKBest(score_func=mutual_info_classif, k=4)
# learn relationship from training data
fs.fit(X_train, y_train)
# transform train input data
X_train_fs = fs.transform(X_train)
# transform test input data
X_test_fs = fs.transform(X_test)
return X_train_fs, X_test_fs, fs
# load the dataset
Insole = pd.read_csv('1119_Rwalk40s1_list.txt', header=None, low_memory=False)
SIData = np.asarray(Insole)
df = pd.read_csv('1119_Rwalk40s1.csv', low_memory=False)
columns = ['Fx','Fy','Fz','Mx','My','Mz']
selected_df = df[columns]
FCDatas = selected_df
SmartInsole = np.array(SIData)
FCData = np.array(FCDatas)
scaler_x = MinMaxScaler(feature_range=(0, 1))
scaler_x.fit(SmartInsole)
xscale = scaler_x.transform(SmartInsole)
scaler_y = MinMaxScaler(feature_range=(0, 1))
scaler_y.fit(FCData)
yscale = scaler_y.transform(FCData)
SIDataPCA = xscale
pca = PCA(n_components=89)
pca.fit(SIDataPCA)
SIdata_pca = pca.transform(SIDataPCA)
X = SIdata_pca
y = yscale
X = SIdata_pca
y = yscale
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# fit the model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
# evaluate the model
yhat = model.predict(X_test_fs)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))
how can I get the correct PCA-MI result data?