When comparing sklearn's GridSearchCV with xgboost's xgb.cv I get different results. Below I explain, step by step, what I am doing:
1) import libraries
import numpy as np
from sklearn import datasets
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import StratifiedKFold
2) set seed and folds
seed = 5
n_fold_inner = 5
skf_inner = StratifiedKFold(n_splits=n_fold_inner,random_state=seed, shuffle=True)
3) load dataset
X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
X = X.astype(np.float32)
# map labels from {-1, 1} to {0, 1}
labels, y = np.unique(y, return_inverse=True)
X_train, X_test = X[:2000], X[2000:]
y_train, y_test = y[:2000], y[2000:]
dtrain = xgb.DMatrix(X_train, label=y_train, missing=np.nan)
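As a sanity check (my assumption: with a fixed random_state and shuffle=True, repeated calls to split() yield identical folds, so GridSearchCV and xgb.cv should see the same partitions), the folds can be compared across two calls:
# sanity check (assumption: split() is deterministic here because random_state is fixed)
folds_a = list(skf_inner.split(X_train, y_train))
folds_b = list(skf_inner.split(X_train, y_train))
print('identical folds across calls:',
      all(np.array_equal(a[1], b[1]) for a, b in zip(folds_a, folds_b)))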
4) define the fixed xgboost parameters
fixed_parameters = {
'max_depth':3,
'min_child_weight':3,
'learning_rate':0.3,
'colsample_bytree':0.8,
'subsample':0.8,
'gamma':0,
'max_delta_step':0,
'colsample_bylevel':1,
'reg_alpha':0,
'reg_lambda':1,
'scale_pos_weight':1,
'base_score':0.5,
'seed':5,
'objective':'binary:logistic',
'silent': 1}
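Note that the sklearn wrapper and the native API spell a few of these differently (learning_rate vs eta, reg_alpha vs alpha, reg_lambda vs lambda; n_estimators corresponds to num_boost_round). A small helper I use to translate, shown only for illustration (to_native_params is my own name, not an xgboost function):
# hypothetical helper (not part of xgboost): rename wrapper-style keys to native ones;
# n_estimators is dropped because it maps to xgb.cv's num_boost_round argument instead
def to_native_params(sk_params):
    rename = {'learning_rate': 'eta', 'reg_alpha': 'alpha', 'reg_lambda': 'lambda'}
    return {rename.get(k, k): v for k, v in sk_params.items() if k != 'n_estimators'}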
5) parameters for which I do the grid search (only one, i.e. the number of estimators)
params_grid = {
'n_estimators':np.linspace(1, 20, 20).astype('int')
}
6) perform grid search
bst_grid = GridSearchCV(
    estimator=XGBClassifier(**fixed_parameters), param_grid=params_grid, n_jobs=4,
    cv=skf_inner, scoring='roc_auc', iid=False, refit=False, verbose=1)
bst_grid.fit(X_train, y_train)
best_params_grid_search = bst_grid.best_params_
best_score_grid_search = bst_grid.best_score_
means_train = bst_grid.cv_results_['mean_train_score']
stds_train = bst_grid.cv_results_['std_train_score']
means_test = bst_grid.cv_results_['mean_test_score']
stds_test = bst_grid.cv_results_['std_test_score']
7) print results
print('\ntest-auc-mean test-auc-std train-auc-mean train-auc-std')
for idx in range(len(means_test)):
    print(means_test[idx], stds_test[idx], means_train[idx], stds_train[idx])
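To compare fold by fold rather than only mean/std, the per-split scores are also available in cv_results_ (keys of the form 'splitK_test_score'), e.g.:
# per-fold test AUC for every n_estimators setting (one array per split)
for k in range(n_fold_inner):
    print('split%d_test_score:' % k, bst_grid.cv_results_['split%d_test_score' % k])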
8) now I run xgb.cv with the same parameters as before for 20 rounds (i.e. the n_estimators values I was feeding into the grid search). The problem is that I get different results...
num_rounds = 20
best_params_grid_search['objective'] = 'binary:logistic'
best_params_grid_search['silent'] = 1
cv_xgb = xgb.cv(best_params_grid_search, dtrain, num_boost_round=num_rounds,
                folds=skf_inner, metrics={'auc'}, seed=seed, maximize=True)
print(cv_xgb)
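For reference, best_params_ only contains the keys from params_grid (just n_estimators here); the sketch below is what I mean by running xgb.cv with the full configuration, with learning_rate/reg_alpha/reg_lambda renamed to their native equivalents. This is only an illustration of the intended comparison, not the run whose output is pasted below:
# sketch (assumption about intent): pass the full fixed configuration to xgb.cv,
# using the native parameter names eta/alpha/lambda
native_params = dict(fixed_parameters)
native_params['eta'] = native_params.pop('learning_rate')
native_params['alpha'] = native_params.pop('reg_alpha')
native_params['lambda'] = native_params.pop('reg_lambda')
cv_xgb_full = xgb.cv(native_params, dtrain,
                     num_boost_round=int(best_params_grid_search['n_estimators']),
                     folds=skf_inner, metrics={'auc'}, seed=seed)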
RESULT GRID SEARCH (each row corresponds to n_estimators = 1, 2, 3, ..., 20):
test-auc-mean test-auc-std train-auc-mean train-auc-std
0.610051313783 0.0161039540435 0.644057288587 0.0113345992869
0.69201880047 0.0162563563448 0.736006666658 0.00692672815659
0.745466211655 0.0171675737271 0.796345885396 0.00696679302744
0.783959748994 0.00705320521545 0.841463145757 0.00948465661336
0.814666429161 0.0205663250121 0.876016226998 0.00594191823748
0.834757856446 0.0380407635359 0.89839145346 0.0119466187041
0.846589877247 0.0250769570711 0.918506450202 0.00400934458132
0.856519550489 0.02076405634 0.929968936282 0.00287173282935
0.874262106553 0.0270140215944 0.940190511945 0.00335749381638
0.884796282407 0.0242102758081 0.947369708661 0.00274634034559
0.890833683342 0.0240690598159 0.953708404754 0.00332080069217
0.898287157179 0.0212975975614 0.958794323829 0.00463360376002
0.905931348284 0.0240526927266 0.963055575138 0.00385161158711
0.911782932073 0.0169788764956 0.966542306102 0.00274612227499
0.912551138778 0.0175200936415 0.969060984867 0.00135518880398
0.915046588665 0.0169918459539 0.971904231381 0.00177694652262
0.917921423036 0.0131486037603 0.975162276052 0.0025983006922
0.921909172729 0.0113192686772 0.976056924526 0.0022670828819
0.928131653291 0.0117709832599 0.978585868159 0.00211167800105
0.931493562339 0.0119475329984 0.98098486872 0.00186032225868
RESULT XGB.CV (one row per boosting round, 0 to 19):
test-auc-mean test-auc-std train-auc-mean train-auc-std
0 0.669881 0.013938 0.772116 0.011315
1 0.759682 0.019225 0.883394 0.004381
2 0.798337 0.016992 0.939274 0.005196
3 0.827751 0.007224 0.962461 0.007382
4 0.850340 0.011451 0.978809 0.001102
5 0.864438 0.020012 0.986584 0.000858
6 0.879706 0.014168 0.991765 0.001926
7 0.889308 0.013851 0.994663 0.000970
8 0.897973 0.011383 0.996704 0.000481
9 0.903878 0.012139 0.997494 0.000432
10 0.909599 0.010234 0.998301 0.000602
11 0.912682 0.014475 0.998972 0.000306
12 0.914289 0.014122 0.999392 0.000207
13 0.916273 0.011744 0.999568 0.000185
14 0.918050 0.011219 0.999718 0.000140
15 0.922161 0.011968 0.999788 0.000146
16 0.922990 0.010124 0.999863 0.000085
17 0.924221 0.009026 0.999893 0.000082
18 0.925718 0.008859 0.999929 0.000060
19 0.926104 0.007586 0.999959 0.000030