My objective was to use Scikit-Optimize library in python to minimize the function value in order to find the optimized parameters for xgboost model. The process involve running the model with different random parameters for 5,000 times.
However, it seems that the loop stopped at some point and gave me a RuntimeError: can't start new thread. I am using ubuntu 20.04 and is running python 3.8.5, Scikit-Optimize version is 0.8.1. I ran the same code in windows 10 and it appears that I do not encounter this RuntimeError, however, the code runs much more slower.
I think I may need a threadpool to solve this issue but after searching through the web and I had no luck on finding a solution to implement the threadpool.
Below is a simplified version of the codes:
#This function will be passed to Scikit-Optimize to find the optimized parameters (Params)
def find_best_xgboost_para(params):`
#Defines the parameters that I want to optimize
learning_rate,gamma,max_depth,min_child_weight,reg_alpha,reg_lambda,subsample,max_bin,num_parallel_tree,colsamp_lev,colsamp_tree,StopSteps\
=float(params[0]),float(params[1]),int(params[2]),int(params[3]),\
int(params[4]),int(params[5]),float(params[6]),int(params[7]),int(params[8]),float(params[9]),float(params[10]),int(params[11])
xgbc=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=colsamp_lev,
colsample_bytree=colsamp_tree, gamma=gamma, learning_rate=learning_rate, max_delta_step=0,
max_depth=max_depth, min_child_weight=min_child_weight, missing=None, n_estimators=nTrees,
objective='binary:logistic',random_state=101, reg_alpha=reg_alpha,
reg_lambda=reg_lambda, scale_pos_weight=1,seed=101,
subsample=subsample,importance_type='gain',gpu_id=GPUID,max_bin=max_bin,
tree_method='gpu_hist',num_parallel_tree=num_parallel_tree,predictor='gpu_predictor',verbosity=0,\
refresh_leaf=0,grow_policy='depthwise',process_type=TreeUpdateStatus,single_precision_histogram=SinglePrecision)
tscv = TimeSeriesSplit(CV_nSplit)
error_data=xgboost.cv(xgbc.get_xgb_params(), CVTrain, num_boost_round=CVBoostRound, nfold=None, stratified=False, folds=tscv, metrics=(), \
obj=None, feval=f1_eval, maximize=False, early_stopping_rounds=StopSteps, fpreproc=None, as_pandas=True, \
verbose_eval=True, show_stdv=True, seed=101, shuffle=shuffle_trig)
eval_set = [(X_train, y_train), (X_test, y_test)]
xgbc.fit(X_train, y_train, eval_metric=f1_eval, early_stopping_rounds=StopSteps, eval_set=eval_set,verbose=True)
xgbc_predictions=xgbc.predict(X_test)
error=(1-metrics.f1_score(y_test, xgbc_predictions,average='macro'))
del xgbc
return error
#Define the range of values that Scikit-Optimize can choose from to find the optimized parameters
lr_low, lr_high=float(XgParamDict['lr_low']), float(XgParamDict['lr_high'])
gama_low, gama_high=float(XgParamDict['gama_low']), float(XgParamDict['gama_high'])
depth_low, depth_high=int(XgParamDict['depth_low']), int(XgParamDict['depth_high'])
child_weight_low, child_weight_high=int(XgParamDict['child_weight_low']), int(XgParamDict['child_weight_high'])
alpha_low,alpha_high=int(XgParamDict['alpha_low']),int(XgParamDict['alpha_high'])
lambda_low,lambda_high=int(XgParamDict['lambda_low']),int(XgParamDict['lambda_high'])
subsamp_low,subsamp_high=float(XgParamDict['subsamp_low']),float(XgParamDict['subsamp_high'])
max_bin_low,max_bin_high=int(XgParamDict['max_bin_low']),int(XgParamDict['max_bin_high'])
num_parallel_tree_low,num_parallel_tree_high=int(XgParamDict['num_parallel_tree_low']),int(XgParamDict['num_parallel_tree_high'])
colsamp_lev_low,colsamp_lev_high=float(XgParamDict['colsamp_lev_low']),float(XgParamDict['colsamp_lev_high'])
colsamp_tree_low,colsamp_tree_high=float(XgParamDict['colsamp_tree_low']),float(XgParamDict['colsamp_tree_high'])
StopSteps_low,StopSteps_high=float(XgParamDict['StopSteps_low']),float(XgParamDict['StopSteps_high'])
#Pass the target function (find_best_xgboost_para) as well as parameter ranges to Scikit-Optimize, 'res' will be an array of values that will need to be pass to another function
res=gbrt_minimize(find_best_xgboost_para,[(lr_low,lr_high),(gama_low, gama_high),(depth_low,depth_high),(child_weight_low,child_weight_high),\
(alpha_low,alpha_high),(lambda_low,lambda_high),(subsamp_low,subsamp_high),(max_bin_low,max_bin_high),\
(num_parallel_tree_low,num_parallel_tree_high),(colsamp_lev_low,colsamp_lev_high),(colsamp_tree_low,colsamp_tree_high),\
(StopSteps_low,StopSteps_high)],random_state=101,n_calls=5000,n_random_starts=1500,verbose=True,n_jobs=-1)
Below is the error message:
Traceback (most recent call last):
File "/home/FactorOpt.py", line 91, in <module>Opt(**FactorOptDict)
File "/home/anaconda3/lib/python3.8/site-packages/skopt/optimizer/gbrt.py", line 179, in gbrt_minimize return base_minimize(func, dimensions, base_estimator,
File "/home/anaconda3/lib/python3.8/site-packages/skopt/optimizer/base.py", line 301, in base_minimize
next_y = func(next_x)
File "/home/anaconda3/lib/python3.8/modelling/FactorOpt.py", line 456, in xgboost_opt
res=gbrt_minimize(find_best_xgboost_para,[(lr_low,lr_high),(gama_low, gama_high),(depth_low,depth_high),(child_weight_low,child_weight_high),\
File "/home/anaconda3/lib/python3.8/site-packages/skopt/optimizer/gbrt.py", line 179, in gbrt_minimize
return base_minimize(func, dimensions, base_estimator,
File "/home/anaconda3/lib/python3.8/site-packages/skopt/optimizer/base.py", line 302, in base_minimize
result = optimizer.tell(next_x, next_y)
File "/home/anaconda3/lib/python3.8/site-packages/skopt/optimizer/optimizer.py", line 493, in tell
return self._tell(x, y, fit=fit)
File "/home/anaconda3/lib/python3.8/site-packages/skopt/optimizer/optimizer.py", line 536, in _tell
est.fit(self.space.transform(self.Xi), self.yi)
File "/home/anaconda3/lib/python3.8/site-packages/skopt/learning/gbrt.py", line 85, in fit
self.regressors_ = Parallel(n_jobs=self.n_jobs, backend='threading')(
File "/home/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 1048, in __call__
if self.dispatch_one_batch(iterator):
File "/home/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 866, in dispatch_one_batch
self._dispatch(tasks)
File "/home/anaconda3/lib/python3.8/site-packages/joblib/parallel.py", line 784, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/home/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 252, in apply_async
return self._get_pool().apply_async(
File "/home/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py", line 407, in _get_pool
self._pool = ThreadPool(self._n_jobs)
File "/home/anaconda3/lib/python3.8/multiprocessing/pool.py", line 925, in __init__
Pool.__init__(self, processes, initializer, initargs)
File "/home/anaconda3/lib/python3.8/multiprocessing/pool.py", line 232, in __init__
self._worker_handler.start()
File "/home/anaconda3/lib/python3.8/threading.py", line 852, in start
_start_new_thread(self._bootstrap, ())
RuntimeError: can't start new thread