Instead of wrapping StandardScaler and RFECV in the same pipeline, wrap StandardScaler and RandomForestClassifier in a pipeline and pass that pipeline to RFECV as the estimator. This way, no training information will be leaked.
# Put the scaler inside the estimator handed to RFECV, so that scaling is
# re-fitted every time RFECV fits the estimator.
estimators = [
    ('standardize', StandardScaler()),
    ('clf', RandomForestClassifier()),
]
pipeline = Pipeline(steps=estimators)

# Recursive feature elimination with cross-validation, scored by accuracy.
rfecv = RFECV(pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
Update: About the error 'RuntimeError: The classifier does not expose "coef_" or "feature_importances_" attributes'
Yes, that's a known issue with scikit-learn's Pipeline. You can look at my other answer here for more details and use the new pipeline I created there.
Define a custom pipeline like this:
class Mypipeline(Pipeline):
    """Pipeline that forwards the importance attributes of its last step.

    RFECV looks for ``coef_`` or ``feature_importances_`` on the estimator
    it is given; these read-only properties expose the final step's values
    on the pipeline object itself.
    """

    @property
    def coef_(self):
        """Return ``coef_`` of the final estimator in the pipeline."""
        return self._final_estimator.coef_

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of the final estimator."""
        return self._final_estimator.feature_importances_
And use that:
# Build the pipeline from the subclass that exposes the final step's
# coef_ / feature_importances_, then run RFECV on it as before.
pipeline = Mypipeline(steps=estimators)
rfecv = RFECV(pipeline, scoring='accuracy')
rfecv_data = rfecv.fit(X, Y)
Update 2:
@brute, for your data and code, the algorithm completes within a minute on my PC. This is the complete code I used:
import numpy as np
import glob
from sklearn.utils import resample
# Load one '|'-delimited file per class, keep 1800 complete rows from each,
# and stack them into X with a matching integer label vector Y.

# glob returns paths in arbitrary order; sort them so that the label given
# to each file (by position, below) is the same on every run.
files = sorted(glob.glob('/home/Downloads/Untitled Folder/*'))
outs = []
for fi in files:
    data = np.genfromtxt(fi, delimiter='|', dtype=float)
    # Drop every row that contains at least one NaN.
    data = data[~np.isnan(data).any(axis=1)]
    # Subsample without replacement so each file contributes 1800 rows.
    data = resample(data, replace=False, n_samples=1800, random_state=0)
    outs.append(data)
X = np.vstack(outs)
print(X.shape)
# One label per loaded file (1, 2, ...), repeated for that file's 1800 rows.
# Deriving the count from `outs` keeps Y the same length as X for any
# number of input files.
Y = np.repeat(np.arange(1, len(outs) + 1), 1800)
print(Y.shape)
# Optional: shuffle samples and labels together.
#from sklearn.utils import shuffle
#X, Y = shuffle(X, Y, random_state=0)
from sklearn.feature_selection import RFECV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
# Bare classifier, not used by the RFECV pipeline below.
clf = RandomForestClassifier()

# Shuffled 10-fold splitter with a fixed seed.
kf = KFold(n_splits=10, shuffle=True, random_state=0)

# Pipeline steps: standardize the features, then classify.
estimators = [
    ('standardize', StandardScaler()),
    ('clf', RandomForestClassifier()),
]
class Mypipeline(Pipeline):
    """Pipeline that forwards the importance attributes of its last step.

    RFECV looks for ``coef_`` or ``feature_importances_`` on the estimator
    it is given; these read-only properties expose the final step's values
    on the pipeline object itself.
    """

    @property
    def coef_(self):
        """Return ``coef_`` of the final estimator in the pipeline."""
        return self._final_estimator.coef_

    @property
    def feature_importances_(self):
        """Return ``feature_importances_`` of the final estimator."""
        return self._final_estimator.feature_importances_
# Run RFECV on the importance-exposing pipeline and report how many
# features it kept.
pipeline = Mypipeline(estimators)
# verbose=10 makes RFECV print progress while it fits.
rfecv = RFECV(estimator=pipeline, scoring='accuracy', verbose=10)
rfecv_data = rfecv.fit(X, Y)
# Format a single string: print(a, b) would print a tuple repr under
# Python 2, whereas this line prints the same text on Python 2 and 3.
print('no. of selected features = %s' % rfecv_data.n_features_)
Update 3: For cross_val_predict
# Evaluate the selected features with cross-validated predictions.
# Imported here so the snippet runs on its own.
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

# Keep only the columns that RFECV selected.
X_new = rfecv.transform(X)
print(X_new.shape)
# Here change clf to pipeline,
# because RFECV has found features according to scaled data,
# which is not present when you pass clf
# NOTE(review): rfecv was fitted on all of X and Y, so the feature choice
# has already seen every sample scored below; this accuracy may be
# optimistic.
y_predicts = cross_val_predict(pipeline, X_new, Y, cv=kf)
accuracy = accuracy_score(Y, y_predicts)
# Single formatted string so the output is identical on Python 2 and 3.
print('accuracy = %s' % accuracy)