0

I have a pipeline that contains two custom column transformers. One of them works, while the other raises a NotFittedError. Here is the pipeline code:

class SkipSimpleImputer(SimpleImputer):
    """``SimpleImputer`` that imputes only the ``MARITAL_STATUS`` column.

    Every other column is passed through untouched, and a frame without a
    ``MARITAL_STATUS`` column is returned as-is.

    The constructor is inherited from ``SimpleImputer`` on purpose: an
    ``__init__`` that only accepts ``**kwargs`` hides the parameters from
    ``get_params()``, so ``sklearn.base.clone`` (which ``ColumnTransformer``
    applies to its transformers) would rebuild the imputer with default
    settings and silently drop ``strategy`` / ``fill_value``.
    """

    def fit(self, X, y=None):
        """Fit the parent imputer on ``MARITAL_STATUS`` when it is present.

        Parameters
        ----------
        X : pandas.DataFrame
            Input frame; only its ``MARITAL_STATUS`` column is used.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        if 'MARITAL_STATUS' in X.columns:
            # Double brackets keep the selection 2-D, as SimpleImputer requires.
            super().fit(X[['MARITAL_STATUS']], y)
        return self

    def transform(self, X, y=None):
        """Return ``X`` with ``MARITAL_STATUS`` imputed, if that column exists.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to transform. It is not modified; a copy is returned when
            imputation takes place.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            ``X`` itself when ``MARITAL_STATUS`` is absent, otherwise a copy
            with that column imputed.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``MARITAL_STATUS`` is present here but was absent during
            ``fit`` (the parent imputer was never fitted).
        """
        if 'MARITAL_STATUS' not in X.columns:
            return X

        print('\t---- MARITAL STATUS found in skipsimpleimpute, all cols are: ', X.columns)
        # Work on a copy so the caller's frame is not mutated as a side effect.
        result = X.copy()
        result[['MARITAL_STATUS']] = super().transform(X[['MARITAL_STATUS']])
        return result
# Single-step pipeline wrapping columnDropperTransformer, configured with id_cols.
id_column_dropper = columnDropperTransformer(id_cols)
drop_cols = Pipeline(steps=[("columnDropper", id_column_dropper)])

    # Single-step pipeline wrapping missingRemover. The step keeps the name
    # "columnDropper" because step names are part of the pipeline's parameter keys.
    missing_remover = missingRemover()
    feature_remover = Pipeline(steps=[("columnDropper", missing_remover)])

    class CustomZeroImputer(SimpleImputer):
        """SimpleImputer subclass that imputes only the requested columns found in X.

        NOTE(review): as written, ``fit`` returns ``self`` without calling
        ``SimpleImputer.fit``, so the parent imputer is never fitted and the
        ``super().transform`` call in ``transform`` raises ``NotFittedError``.
        """
        def __init__(self, columns, **kwargs):
            # Candidate column names; transform() keeps only those present in X.
            self.columns = columns
            print('---- Got zeroImp request: ', columns)
            # All remaining keyword arguments are forwarded to SimpleImputer.
            super().__init__(**kwargs)
            
        def fit(self, X, y=None):
            """Return ``self`` unchanged; nothing is learned from ``X`` or ``y``."""
            return self

        def transform(self, X, y=None):
            """Impute the requested columns that exist in ``X`` and return ``X``.

            ``y`` is accepted but not used. The imputed values are written back
            into ``X`` itself, so the caller's frame is modified in place.
            """
            # Skip requested columns that are missing from this frame.
            cols_to_imp = [col for col in self.columns if col in X.columns]
            
            print('---- Imputing zero cols: ', cols_to_imp)
            # The parent transform requires a fitted imputer; see the class note.
            X[cols_to_imp] = super().transform(X[cols_to_imp])
            return X

    # Zero-fill stage: CustomZeroImputer over fill_zero_cols with a constant fill of 0.
    zero_imputer = CustomZeroImputer(fill_zero_cols, strategy='constant', fill_value=0)
    fill_na_zero_transformer = Pipeline(steps=[('zero_imputer', zero_imputer)])

    # Numeric stage: constant fill of -1 with missing-value indicators, then StandardScaler.
    numeric_steps = [
        ('imputer', SimpleImputer(strategy="constant", fill_value=-1, add_indicator=True)),
        ('scaler', StandardScaler()),
    ]
    numeric_transformer = Pipeline(steps=numeric_steps)

    # Categorical stage: SkipSimpleImputer with an 'Unknown' fill, then one-hot encoding
    # that ignores categories not seen during fit.
    categorical_steps = [
        ('categorical_imputer', SkipSimpleImputer(strategy="constant", fill_value='Unknown')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]
    categorical_transformer = Pipeline(steps=categorical_steps)

    # Route columns to the sub-pipelines by dtype: object columns to the
    # categorical pipeline, numeric columns to both the zero-fill and the
    # numeric pipeline.
    # NOTE(review): 'zero_impute' and 'numeric' select the same numeric columns.
    # ColumnTransformer runs each transformer on its own selection and
    # concatenates the outputs, so 'numeric' does not see the zero-filled
    # values and the numeric columns are emitted twice. Confirm this is intended.
    object_selector = make_column_selector(dtype_include=object)
    zero_fill_selector = make_column_selector(dtype_include=np.number)
    numeric_selector = make_column_selector(dtype_include=np.number)
    column_routes = [
        ('encode', categorical_transformer, object_selector),
        ('zero_impute', fill_na_zero_transformer, zero_fill_selector),
        ('numeric', numeric_transformer, numeric_selector),
    ]
    preprocess_ppl = ColumnTransformer(transformers=column_routes)
    # Full pipeline: the drop_cols and feature_remover steps, then dtype-based
    # preprocessing, then the customOLS estimator built around sm.OLS.
    ols_estimator = customOLS(sm.OLS)
    pipeline2 = Pipeline(steps=[
        ('dropper', drop_cols),
        ('remover', feature_remover),
        ("preprocessor", preprocess_ppl),
        ("estimator", ols_estimator),
    ])

    print('---- Fitting pipeline')
    pipeline2.fit(X_train, y_train)

The CustomZeroImputer only tries to impute the columns that exist in the intermediate data, because otherwise it raises a column-not-found error. The error I receive is:

NotFittedError: This CustomZeroImputer instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.

Obiii
  • 698
  • 1
  • 6
  • 26
  • 1
    Always provide the full error traceback. The `super().transform` call is probably at fault: the `SimpleImputer.transform` method will check if the estimator is fitted, which generally happens by just checking if an attribute with a trailing underscore exists. I'd probably suggest calling `super().fit(X[cols_to_imp])` in your `fit` method, or maybe better, not overriding the original `fit` at all. – Ben Reiniger Aug 12 '22 at 14:45

1 Answers1

0

There was a mistake on my end: the imputer has to be fitted first, so `fit` must call the parent's `fit` instead of just returning `self`:

class CustomZeroImputer(SimpleImputer):
    """``SimpleImputer`` restricted to a subset of DataFrame columns.

    Only the requested columns that actually exist in the frame seen by
    ``fit`` are imputed; every other column is passed through unchanged.

    Parameters
    ----------
    columns : list of str
        Candidate column names. Names missing from the fitted frame are
        skipped.
    missing_values, strategy, fill_value, copy, add_indicator
        Forwarded to ``SimpleImputer`` with the same defaults. They are
        spelled out in the signature (instead of ``**kwargs``) because
        scikit-learn discovers parameters by inspecting ``__init__``; with
        ``**kwargs`` they are invisible to ``get_params()`` and
        ``sklearn.base.clone`` (used by ``ColumnTransformer`` and the model
        selection tools) silently resets them to the defaults.
        NOTE: with ``add_indicator=True`` the parent returns extra indicator
        columns, which cannot be written back into the selected columns.

    Attributes
    ----------
    columns_ : list of str
        Requested columns found in the frame passed to ``fit``.
    """

    def __init__(self, columns, *, missing_values=float("nan"), strategy="mean",
                 fill_value=None, copy=True, add_indicator=False):
        # float("nan") is the same value as numpy.nan, SimpleImputer's default.
        self.columns = columns
        super().__init__(
            missing_values=missing_values,
            strategy=strategy,
            fill_value=fill_value,
            copy=copy,
            add_indicator=add_indicator,
        )

    def fit(self, X, y=None):
        """Fit the parent imputer on the requested columns present in ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training frame.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # Store the resolved columns in a fitted attribute; constructor
        # parameters such as ``columns`` must not be modified by ``fit``, or
        # refitting on other data would start from an already shrunken list.
        self.columns_ = [col for col in self.columns if col in X.columns]
        if self.columns_:
            super().fit(X[self.columns_], y)
        # With no matching column there is nothing to learn; fitting the
        # parent on a zero-column frame would raise instead.
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the fitted columns imputed.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame to transform; it must contain every column in ``columns_``.
            It is not modified.
        y : ignored
            Accepted for scikit-learn API compatibility.

        Returns
        -------
        pandas.DataFrame
            ``X`` itself when no column was selected during ``fit``,
            otherwise a copy with the selected columns imputed.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "columns_")
        if not self.columns_:
            return X

        # Copy so the caller's frame is not mutated as a side effect.
        result = X.copy()
        result[self.columns_] = super().transform(X[self.columns_])
        return result
Obiii
  • 698
  • 1
  • 6
  • 26