I have a scikit-learn pipeline that scales numeric features and encodes categorical features. It worked fine until I tried to add RandomUnderSampler from imblearn. My goal is to add an undersampling step, since my dataset is heavily imbalanced (roughly 1:1000).
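For reference, this is roughly what I expect the sampler to do on its own. This is only a minimal sketch with made-up data (assuming imblearn >= 0.4, where sampling_strategy and fit_resample are available); sampling_strategy=0.1 asks for about a 1:10 minority-to-majority ratio after resampling:

import numpy as np
from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification

# Toy data with a heavy class imbalance, just to illustrate the resampling
X, y = make_classification(n_samples=20000, weights=[0.999, 0.001],
                           random_state=0)

rus = RandomUnderSampler(sampling_strategy=0.1, random_state=0)
X_res, y_res = rus.fit_resample(X, y)

print(np.bincount(y))      # original class counts, e.g. roughly [19980, 20]
print(np.bincount(y_res))  # after undersampling, roughly [200, 20]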
I made sure to use the Pipeline class from imblearn rather than the one from sklearn. Below is the code I've tried.
Code that works (using the sklearn pipeline), without the undersampler step:
import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin


class TypeSelector(BaseEstimator, TransformerMixin):
    """Select the columns of a DataFrame that match a given dtype."""

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])


transformer = Pipeline([
    # Union of boolean, numeric and categorical features
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        # Select boolean features
        ('boolean', Pipeline([
            ('selector', TypeSelector('bool')),
        ])),
        # Select and scale numericals
        ('numericals', Pipeline([
            ('selector', TypeSelector(np.number)),
            ('scaler', StandardScaler()),
        ])),
        # Select and encode categoricals
        ('categoricals', Pipeline([
            ('selector', TypeSelector('category')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ]))
    ])),
])

pipe = Pipeline([
    ('prep', transformer),
    ('clf', RandomForestClassifier(n_estimators=500, class_weight='balanced'))
])
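For context, calling the working pipeline looks like this. Again just a minimal sketch with made-up data; the real DataFrame already carries bool, numeric and category dtypes, which is what TypeSelector relies on:

import numpy as np
import pandas as pd

# Tiny stand-in for the real data: one column per dtype handled above
X = pd.DataFrame({
    'flag':    pd.Series([True, False, True, False], dtype='bool'),
    'amount':  pd.Series([1.0, 2.5, 3.1, 0.4], dtype='float64'),
    'segment': pd.Series(['a', 'b', 'a', 'c'], dtype='category'),
})
y = np.array([0, 0, 1, 0])

pipe.fit(X, y)          # preprocess with 'prep', then fit the random forest
print(pipe.predict(X))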
Code that does not work (using the imblearn pipeline), with the undersampler step:
import numpy as np
import pandas as pd

from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from imblearn.pipeline import Pipeline as Pipeline_imb
from imblearn.under_sampling import RandomUnderSampler


class TypeSelector(BaseEstimator, TransformerMixin):
    """Select the columns of a DataFrame that match a given dtype."""

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)
        return X.select_dtypes(include=[self.dtype])


transformer = Pipeline_imb([
    # Union of boolean, numeric and categorical features
    ('features', FeatureUnion(n_jobs=1, transformer_list=[
        # Select boolean features
        ('boolean', Pipeline_imb([
            ('selector', TypeSelector('bool')),
        ])),
        # Select and scale numericals
        ('numericals', Pipeline_imb([
            ('selector', TypeSelector(np.number)),
            ('scaler', StandardScaler()),
        ])),
        # Select and encode categoricals
        ('categoricals', Pipeline_imb([
            ('selector', TypeSelector('category')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ]))
    ])),
])

pipe = Pipeline_imb([
    ('sampler', RandomUnderSampler(0.1)),
    ('prep', transformer),
    ('clf', RandomForestClassifier(n_estimators=500, class_weight='balanced'))
])
Here is the error I get:
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in __init__(self, steps, memory, verbose)
133 def __init__(self, steps, memory=None, verbose=False):
134 self.steps = steps
--> 135 self._validate_steps()
136 self.memory = memory
137 self.verbose = verbose
/usr/local/lib/python3.6/dist-packages/imblearn/pipeline.py in _validate_steps(self)
144 if isinstance(t, pipeline.Pipeline):
145 raise TypeError(
--> 146 "All intermediate steps of the chain should not be"
147 " Pipelines")
148
TypeError: All intermediate steps of the chain should not be Pipelines
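If it helps narrow things down, the nesting alone seems to be enough to hit this check: a Pipeline used as an intermediate step of an imblearn Pipeline. A minimal sketch (same imblearn version assumed; LogisticRegression is just a placeholder classifier) that should raise the same TypeError:

from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as Pipeline_imb
from imblearn.under_sampling import RandomUnderSampler

inner = Pipeline_imb([('scaler', StandardScaler())])

# 'prep' sits between the sampler and the classifier, so it is an intermediate
# step; imblearn's _validate_steps rejects it because it is a Pipeline instance.
pipe = Pipeline_imb([
    ('sampler', RandomUnderSampler(0.1)),
    ('prep', inner),
    ('clf', LogisticRegression()),
])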