Using sklearn, I have a pipleline that works perfectly and basically looks and works like that :
model_1_KNeighborsClassifier = make_pipeline(preprocessor, KNeighborsClassifier())
model_1_KNeighborsClassifier.fit(X_train, y_train)
But if I do bagging using this pipeline :
model_bagging = BaggingClassifier(base_estimator=model_1_KNeighborsClassifier,n_estimators=10)
model_bagging.fit(X_train,y_train)
It doesn't work anymore :
File c:\Users\gui-r\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\utils\__init__.py:423, in _get_column_indices(X, key)
422 try:
--> 423 all_columns = X.columns
424 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Cell In[9], line 6
1 from sklearn.ensemble import BaggingClassifier
2 model_bagging = BaggingClassifier(base_estimator=model_1_KNeighborsClassifier,n_estimators=10)
----> 6 model_bagging.fit(X_train,y_train)
7 #model_bagging.score(X_test,y_test)
File c:\Users\gui-r\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py:1151, in _fit_context..decorator..wrapper(estimator, *args, **kwargs)
1144 estimator._validate_params()
1146 with config_context(
1147 skip_parameter_validation=(
1148 prefer_skip_nested_validation or global_skip_validation
1149 )
1150 ):
...
428 )
429 if isinstance(key, str):
430 columns = [key]
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
As if bagging cannot take processed data through the pipeline.
The entire code is the following :
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder,StandardScaler
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
import seaborn as sns
from sklearn.compose import make_column_transformer
from sklearn.ensemble import BaggingClassifier
titanic = sns.load_dataset('titanic')
y = titanic['survived']
X = titanic.drop('survived', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
numerical_features = [ 'age', 'fare']
categorical_features = ['sex', 'deck', 'alone']
other_features=['pclass']
numerical_pipeline = make_pipeline(SimpleImputer(strategy='mean'), StandardScaler())
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
other_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'))
preprocessor = make_column_transformer((numerical_pipeline, numerical_features),
(categorical_pipeline, categorical_features),
(other_pipeline, other_features),)
processed_data=preprocessor.fit_transform(titanic)
model_1_KNeighborsClassifier = make_pipeline(preprocessor, KNeighborsClassifier(algorithm='ball_tree',metric='manhattan',n_neighbors=11))
model_bagging = BaggingClassifier(base_estimator=model_1_KNeighborsClassifier,n_estimators=10)
""" here those 2 lines work :
model_1_KNeighborsClassifier.fit(X_train,y_train)
print(model_1_KNeighborsClassifier.score(X_test,y_test)) """
model_bagging.fit(X_train,y_train)
print(model_bagging.score(X_test,y_test))
Any idea on what's wrong ?
Again, the pipeline itself works