I am using a scikit-learn pipeline for a classification task, as shown below:
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder
import shap
# -----------------------------------------------------------------------------
# Data
# -----------------------------------------------------------------------------
# Fetch the Titanic dataset from OpenML as a (features DataFrame, target) pair.
X, y = fetch_openml("titanic", version=1, as_frame=True, return_X_y=True)

categorical_columns = ["pclass", "sex", "embarked"]
numerical_columns = ["age", "sibsp", "parch", "fare"]

# Keep only the seven modelled features, categorical ones first. Per the
# original note, the result has shape (1309, 7) and contains NaN values.
selected_columns = categorical_columns + numerical_columns
X = X[selected_columns]

# Hold out a test set, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)
# -----------------------------------------------------------------------------
# Data preprocessing
# -----------------------------------------------------------------------------
# Categorical features become integer codes; categories unseen during fit and
# missing values are both encoded as -1.
categorical_encoder = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
    encoded_missing_value=-1,
)

# Missing numerical values are replaced by the mean of their column.
numerical_imputer = SimpleImputer(strategy="mean")

# Each transformer is applied only to its own column group. Output feature
# names are not prefixed with the transformer name ("cat"/"num").
column_transformers = [
    ("cat", categorical_encoder, categorical_columns),
    ("num", numerical_imputer, numerical_columns),
]
preprocessing = ColumnTransformer(
    transformers=column_transformers,
    verbose_feature_names_out=False,
)
# -----------------------------------------------------------------------------
# Pipeline
# -----------------------------------------------------------------------------
# Chain preprocessing and the random forest so both are fitted together on
# the raw training frame.
forest = RandomForestClassifier(random_state=42)
rf = Pipeline(
    steps=[
        ("preprocess", preprocessing),
        ("classifier", forest),
    ]
)
rf.fit(X_train, y_train)

# Report the pipeline's score on the training split, then on the test split.
train_accuracy = rf.score(X_train, y_train)
print(f"RF train accuracy: {train_accuracy:.3f}")
test_accuracy = rf.score(X_test, y_test)
print(f"RF test accuracy: {test_accuracy:.3f}")
# -----------------------------------------------------------------------------
# Shap
# -----------------------------------------------------------------------------
# Pull the fitted steps out of the pipeline: the explainer is built on the
# bare classifier, so it must be fed already-preprocessed features.
fitted_preprocessor = rf.named_steps["preprocess"]
fitted_forest = rf.named_steps["classifier"]

explainer = shap.Explainer(
    fitted_forest,
    feature_names=fitted_preprocessor.get_feature_names_out(),
)
X_test_processed = fitted_preprocessor.transform(X_test)

# NOTE(review): for a classifier the resulting Explanation presumably carries
# a trailing per-class axis, i.e. (n_samples, n_features, n_classes) -- TODO
# confirm with `shap_values.shape`. If so, the plotting helpers need a single
# class slice (e.g. `shap_values[..., 1]`) rather than the full object.
shap_values = explainer(X_test_processed)
However, when I try to produce SHAP plots, I get the following errors:
shap.summary_plot(shap_values, X_test_processed)
- Error: TypeError: only integer scalar arrays can be converted to a scalar index
shap.summary_plot(shap_values, X_test_processed, plot_type="bar")
- Error : TypeError: only integer scalar arrays can be converted to a scalar index
shap.plots.beeswarm(shap_values)
- Error : ValueError: The beeswarm plot does not support plotting explanations with instances that have more than one dimension!
shap.plots.bar(shap_values)
- Error : IndexError: list index out of range
What am I doing wrong? Any ideas on how to solve this issue would be appreciated.