I need to understand the following error: "CatBoostError: The left argument is not fitted, only fitted models could be compared."

Question

I am trying to run a RandomizedSearchCV on various classification models through "For" loop for Hyperparameter tuning. There is no issue with running any other models except CatBoost. Also the issue with Catboost arises when I used Pipeline in defining function.

My Code:

#Building the models:

lr = LogisticRegression()
knn = KNeighborsClassifier()
svm = SVC()
dt = DecisionTreeClassifier(random_state=1)
bag = BaggingClassifier(random_state=1)
adb = AdaBoostClassifier(random_state=1)
gb = GradientBoostingClassifier(random_state=1)
rf = RandomForestClassifier(random_state=1)
xgb = XGBClassifier()
cgb = CatBoostClassifier()
lgb = LGBMClassifier()

#Defining a function:

def fun_exp(model, name, x_tr, x_te, y_tr, y_te):
    start = time.time()
    pipe = Pipeline([('scale', StandardScaler()), ('pca', PCA(n_components = 62)), (name, model)])
    rscv = RandomizedSearchCV(pipe, params, cv=10, random_state=1)
    rscv.fit(x_tr, y_tr)
    rscv_best_params = rscv.best_params_
    rscv_best_score = rscv.best_score_
    rscv_score_train = rscv.score(x_tr, y_tr)
    rscv_score_test = rscv.score(x_te, y_te)
    rscv_pred = rscv.predict(x_te)
    end = time.time()
    pickle.dump(rscv, open(name, 'wb'))

    rscv_duration = end-start
    return rscv_best_params, rscv_best_score, rscv_score_train, rscv_score_test, rscv_duration, rscv_pred

Running the above function in for loop & saving the result in a dictionary:

exp_result = {}

\#Fitting & Testing the model

  for model, name in zip([lr, knn, svm, dt, bag, adb, gb, rf, lgb, cgb,  xgb], ['Logistic Regression', 'KNeighbors', 'SVM', 'DecisionTree', 'Bagging', 'AdaBoost', 'GradientBoost', 'Random Forest', 'LightGBM', 'CatBoost', 'XGBoost']):

    if model == lr:
        params = {'Logistic Regression__solver': ['liblinear', 'lfbgs', 'sag', 'saga'], 'Logistic Regression__penalty':['elasticnet', 'l1', 'l2', 'none'], 'Logistic Regression__multi_class': ['auto', 'ovr', 'multinomial'], 'Logistic Regression__C':[0.1, 1, 10], 'Logistic Regression__tol': [0.00001, 0.0001, 0.001], 'Logistic Regression__class_weight': ['balanced', None]}
       
    if model == knn:
        params = {'KNeighbors__n_neighbors':np.arange(5,50,5), 'KNeighbors__weights': ['uniform', 'distance'], 'KNeighbors__algorithm':['auto', 'knn__ball_tree', 'kd_tree', 'brute'], 'KNeighbors__leaf_size': np.arange(10,51,10), 'KNeighbors__metric': ['minkowski', 'euclidean', 'manhattan']}
        
    if model == svm:
        params = {'SVM__gamma': [10, 1, 0.1, 0.01, 0.001, 0.0001], 'SVM__C': [1000, 100, 10, 1, 0.1, 0.01, 0.001], 'SVM__kernel': ['poly', 'rbf', 'sigmoid'], 'SVM__class_weight': ['balanced', None], 'SVM__decision_function_shape': ['ovo', 'ovr']}
        
    if model == dt:
        params = {'DecisionTree__criterion':['gini', 'entropy', 'log_loss'], 'DecisionTree__splitter':['best', 'random'], 'DecisionTree__max_depth':[None, np.arange(1,11)], 'DecisionTree__max_features': np.arange(8, 21, 2), 'DecisionTree__random_state':[1], 'DecisionTree__class_weight':['balanced', None]}
        
    if model==bag:
        params = {'Bagging__n_estimators': [10, 30, 50, 100, 500], 'Bagging__max_features': np.arange(8, 21, 2), 'Bagging__random_state':[1]}
       
    if model == adb:
        params = {'AdaBoost__n_estimators': [10, 30, 50, 100, 500], 'AdaBoost__learning_rate':[0.001, 0.01, 0.1, 1, 10], 'AdaBoost__algorithm':['SAMME.R', 'SAMME'], 'AdaBoost__random_state':[1]}
        
    if model == gb:
        params = {'GradientBoost__loss':['log_loss', 'exponential'], 'GradientBoost__learning_rate':[0.001, 0.01, 0.1, 1, 10], 'GradientBoost__n_estimators': [10, 30, 50, 100, 500], 'GradientBoost__max_depth':np.arange(1,11), 'GradientBoost__random_state':[1], 'GradientBoost__max_features': np.arange(8, 21, 2)}
       
    if model == rf:
        params = {'Random Forest__n_estimators': [10, 30, 50, 100, 500], 'Random Forest__criterion':['gini', 'entropy', 'log_loss'], 'Random Forest__max_depth':np.arange(1,11), 'Random Forest__max_features': np.arange(8, 21, 2), 'Random Forest__random_state':[1]}
        
    if model == lgb:
        params = {'LightGBM__boosting_type':['gbdt', 'rf'], 'LightGBM__num_leaves':np.arange(20, 40), 'LightGBM__max_depth':np.arange(1,11), 'LightGBM__learning_rate':[0.001, 0.01, 0.1, 1, 10], 'LightGBM__n_estimators': [10, 30, 50, 100, 500], 'LightGBM__class_weight': ['balanced', None], 'LightGBM__random_state':[1]}
    
    if model == cgb:
        params = {'CatBoost__learning_rate':[0.001, 0.01, 0.1, 1], 'CatBoost__n_estimators': [100, 500], 'CatBoost__max_depth':np.arange(1,11), 'CatBoost__random_state':[1], 'CatBoost__feature_border_type': ['Median', 'Uniform', 'UniformAndQuantiles', 'GreedyLogSum', 'MaxLogSum', 'MinEntropy']}
        
    if model == xgb:
        le = LabelEncoder()
        y_tr = le.fit_transform(y_tr)
        y_te = le.fit_transform(y_te)
        params = {'XGBoost__n_estimators': [10, 30, 50, 100, 500], 'XGBoost__max_depth':np.arange(1,11), 'XGBoost__max_leaves': np.arange(0, 150), 'XGBoost__learning_rate':[0.001, 0.01, 0.1, 1, 10], 'XGBoost__random_state':[1]}
        
    
    exp_result[name] = fun_exp(model, name, x_tr, x_te, y_tr, y_te)

I need to understand the following error: "CatBoostError: The left argument is not fitted, only fitted models could be compared."

0 Answers0