0

i want to use principal component analysis-mutual information (PCA-MI) to have data representation from source which has source relevance of (value from smartinsole) and ouput variable (value from force plate). PCA was used to determine the principal component of Ni provided that the cumulative variance is greater than 98% of the source information measured from 89 insole sensors. MI is generally used in the selection of input variables for predictive models because it is a good indicator of the relationship between input variables and output variables. here I want to get results like a flowchart as below enter image description here

then I try to make code like below. but I can't generate like what's in the flowchart

from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score



# load the dataset
def load_dataset(filename):
 # load the dataset as a pandas DataFrame
 data = read_csv(filename, header=None)
 # retrieve numpy array
 dataset = data.values
 y = dataset
 return y

def load_dataset2(filename):
 # load the dataset as a pandas DataFrame
 data2 = read_csv(filename, header=None)
 # retrieve numpy array
 dataset2 = data2.values
 X = dataset2
 return X
 
# feature selection
def select_features(X_train, y_train, X_test):
 # configure to select a subset of features
 fs = SelectKBest(score_func=mutual_info_classif, k=4)
 # learn relationship from training data
 fs.fit(X_train, y_train)
 # transform train input data
 X_train_fs = fs.transform(X_train)
 # transform test input data
 X_test_fs = fs.transform(X_test)
 return X_train_fs, X_test_fs, fs
 
# load the dataset
Insole = pd.read_csv('1119_Rwalk40s1_list.txt', header=None, low_memory=False)
SIData =  np.asarray(Insole)

df = pd.read_csv('1119_Rwalk40s1.csv', low_memory=False)
columns = ['Fx','Fy','Fz','Mx','My','Mz']
selected_df = df[columns]
FCDatas = selected_df

SmartInsole = np.array(SIData)
FCData = np.array(FCDatas)

scaler_x = MinMaxScaler(feature_range=(0, 1))
scaler_x.fit(SmartInsole)
xscale = scaler_x.transform(SmartInsole)

scaler_y = MinMaxScaler(feature_range=(0, 1))
scaler_y.fit(FCData)
yscale = scaler_y.transform(FCData)

SIDataPCA = xscale
pca = PCA(n_components=89)
pca.fit(SIDataPCA)
SIdata_pca = pca.transform(SIDataPCA)

X = SIdata_pca
y = yscale

X = SIdata_pca
y = yscale
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train, y_train, X_test)
# fit the model
model = LogisticRegression(solver='liblinear')
model.fit(X_train_fs, y_train)
# evaluate the model
yhat = model.predict(X_test_fs)
# evaluate predictions
accuracy = accuracy_score(y_test, yhat)
print('Accuracy: %.2f' % (accuracy*100))

how can I get the correct PCA-MI result data?

stack offer
  • 143
  • 8

0 Answers0