
I'm trying to run hyperparameter Bayesian optimization using `from skopt import BayesSearchCV`.

I'm getting the error below on JupyterLab running on an AWS instance, with a pip virtual environment.
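For context, my setup looks roughly like the following minimal sketch. The real data and search space are different, and `tree_method="gpu_hist"` is my assumption about what triggers the GPU check (I can't share the exact estimator config):

```python
import pandas as pd
from sklearn.datasets import make_classification
from skopt import BayesSearchCV
from skopt.space import Integer, Real
from xgboost import XGBClassifier

# Placeholder data standing in for the real training set.
X_train, y = make_classification(n_samples=500, n_features=10, random_state=0)
y_train = pd.DataFrame({"dr_with_pp_assumption_dpd30": y})

# tree_method="gpu_hist" is an assumption: the error below complains
# that no visible GPU was found, and this setting would trigger that
# check on a CPU-only machine.
estimator = XGBClassifier(tree_method="gpu_hist")

# Simplified search space for illustration only.
search_space = {
    "learning_rate": Real(0.01, 0.3, prior="log-uniform"),
    "max_depth": Integer(2, 8),
}

opt = BayesSearchCV(estimator, search_space, n_iter=16, cv=3, random_state=0)
opt.fit(X_train, y_train["dr_with_pp_assumption_dpd30"])
```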

The full error:

---------------------------------------------------------------------------
XGBoostError                              Traceback (most recent call last)
Input In [49], in <cell line: 1>()
----> 1 opt.fit(X_train, y_train['dr_with_pp_assumption_dpd30'])

File ~/wd/venv/lib/python3.8/site-packages/skopt/searchcv.py:466, in BayesSearchCV.fit(self, X, y, groups, callback, **fit_params)
    463 else:
    464     self.optimizer_kwargs_ = dict(self.optimizer_kwargs)
--> 466 super().fit(X=X, y=y, groups=groups, **fit_params)
    468 # BaseSearchCV never ranked train scores,
    469 # but apparently we used to ship this (back-compat)
    470 if self.return_train_score:

File ~/wd/venv/lib/python3.8/site-packages/sklearn/model_selection/_search.py:891, in BaseSearchCV.fit(self, X, y, groups, **fit_params)
    885     results = self._format_results(
    886         all_candidate_params, n_splits, all_out, all_more_results
    887     )
    889     return results
--> 891 self._run_search(evaluate_candidates)
    893 # multimetric is determined here because in the case of a callable
    894 # self.scoring the return type is only known after calling
    895 first_test_score = all_out[0]["test_scores"]

File ~/wd/venv/lib/python3.8/site-packages/skopt/searchcv.py:512, in BayesSearchCV._run_search(self, evaluate_candidates)
    508 while n_iter > 0:
    509     # when n_iter < n_points points left for evaluation
    510     n_points_adjusted = min(n_iter, n_points)
--> 512     optim_result = self._step(
    513         search_space, optimizer,
    514         evaluate_candidates, n_points=n_points_adjusted
    515     )
    516     n_iter -= n_points
    518     if eval_callbacks(callbacks, optim_result):

File ~/wd/venv/lib/python3.8/site-packages/skopt/searchcv.py:408, in BayesSearchCV._step(self, search_space, optimizer, evaluate_candidates, n_points)
    405 # make lists into dictionaries
    406 params_dict = [point_asdict(search_space, p) for p in params]
--> 408 all_results = evaluate_candidates(params_dict)
    409 # Feed the point and objective value back into optimizer
    410 # Optimizer minimizes objective, hence provide negative score
    411 local_results = all_results["mean_test_score"][-len(params):]

File ~/wd/venv/lib/python3.8/site-packages/sklearn/model_selection/_search.py:838, in BaseSearchCV.fit.<locals>.evaluate_candidates(candidate_params, cv, more_results)
    830 if self.verbose > 0:
    831     print(
    832         "Fitting {0} folds for each of {1} candidates,"
    833         " totalling {2} fits".format(
    834             n_splits, n_candidates, n_candidates * n_splits
    835         )
    836     )
--> 838 out = parallel(
    839     delayed(_fit_and_score)(
    840         clone(base_estimator),
    841         X,
    842         y,
    843         train=train,
    844         test=test,
    845         parameters=parameters,
    846         split_progress=(split_idx, n_splits),
    847         candidate_progress=(cand_idx, n_candidates),
    848         **fit_and_score_kwargs,
    849     )
    850     for (cand_idx, parameters), (split_idx, (train, test)) in product(
    851         enumerate(candidate_params), enumerate(cv.split(X, y, groups))
    852     )
    853 )
    855 if len(out) < 1:
    856     raise ValueError(
    857         "No fits were performed. "
    858         "Was the CV iterator empty? "
    859         "Were there no candidates?"
    860     )

File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:1085, in Parallel.__call__(self, iterable)
   1076 try:
   1077     # Only set self._iterating to True if at least a batch
   1078     # was dispatched. In particular this covers the edge
   (...)
   1082     # was very quick and its callback already dispatched all the
   1083     # remaining jobs.
   1084     self._iterating = False
-> 1085     if self.dispatch_one_batch(iterator):
   1086         self._iterating = self._original_iterator is not None
   1088     while self.dispatch_one_batch(iterator):

File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:901, in Parallel.dispatch_one_batch(self, iterator)
    899     return False
    900 else:
--> 901     self._dispatch(tasks)
    902     return True

File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:819, in Parallel._dispatch(self, batch)
    817 with self._lock:
    818     job_idx = len(self._jobs)
--> 819     job = self._backend.apply_async(batch, callback=cb)
    820     # A job can complete so quickly than its callback is
    821     # called before we get here, causing self._jobs to
    822     # grow. To ensure correct results ordering, .insert is
    823     # used (rather than .append) in the following line
    824     self._jobs.insert(job_idx, job)

File ~/wd/venv/lib/python3.8/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
    206 def apply_async(self, func, callback=None):
    207     """Schedule a func to be run"""
--> 208     result = ImmediateResult(func)
    209     if callback:
    210         callback(result)

File ~/wd/venv/lib/python3.8/site-packages/joblib/_parallel_backends.py:597, in ImmediateResult.__init__(self, batch)
    594 def __init__(self, batch):
    595     # Don't delay the application, to avoid keeping the input
    596     # arguments in memory
--> 597     self.results = batch()

File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:288, in BatchedCalls.__call__(self)
    284 def __call__(self):
    285     # Set the default nested backend to self._backend but do not set the
    286     # change the default number of processes to -1
    287     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288         return [func(*args, **kwargs)
    289                 for func, args, kwargs in self.items]

File ~/wd/venv/lib/python3.8/site-packages/joblib/parallel.py:288, in <listcomp>(.0)
    284 def __call__(self):
    285     # Set the default nested backend to self._backend but do not set the
    286     # change the default number of processes to -1
    287     with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 288         return [func(*args, **kwargs)
    289                 for func, args, kwargs in self.items]

File ~/wd/venv/lib/python3.8/site-packages/sklearn/utils/fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
    214 def __call__(self, *args, **kwargs):
    215     with config_context(**self.config):
--> 216         return self.function(*args, **kwargs)

File ~/wd/venv/lib/python3.8/site-packages/sklearn/model_selection/_validation.py:680, in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
    678         estimator.fit(X_train, **fit_params)
    679     else:
--> 680         estimator.fit(X_train, y_train, **fit_params)
    682 except Exception:
    683     # Note fit time as time until error
    684     fit_time = time.time() - start_time

File ~/wd/venv/lib/python3.8/site-packages/xgboost/core.py:506, in _deprecate_positional_args.<locals>.inner_f(*args, **kwargs)
    504 for k, arg in zip(sig.parameters, args):
    505     kwargs[k] = arg
--> 506 return f(**kwargs)

File ~/wd/venv/lib/python3.8/site-packages/xgboost/sklearn.py:789, in XGBModel.fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
    786     obj = None
    788 model, feval, params = self._configure_fit(xgb_model, eval_metric, params)
--> 789 self._Booster = train(
    790     params,
    791     train_dmatrix,
    792     self.get_num_boosting_rounds(),
    793     evals=evals,
    794     early_stopping_rounds=early_stopping_rounds,
    795     evals_result=evals_result,
    796     obj=obj,
    797     feval=feval,
    798     verbose_eval=verbose,
    799     xgb_model=model,
    800     callbacks=callbacks,
    801 )
    803 self._set_evaluation_result(evals_result)
    804 return self

File ~/wd/venv/lib/python3.8/site-packages/xgboost/training.py:188, in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, xgb_model, callbacks)
    115 def train(params, dtrain, num_boost_round=10, evals=(), obj=None, feval=None,
    116           maximize=None, early_stopping_rounds=None, evals_result=None,
    117           verbose_eval=True, xgb_model=None, callbacks=None):
    118     # pylint: disable=too-many-statements,too-many-branches, attribute-defined-outside-init
    119     """Train a booster with given parameters.
    120 
    121     Parameters
   (...)
    186     Booster : a trained booster model
    187     """
--> 188     bst = _train_internal(params, dtrain,
    189                           num_boost_round=num_boost_round,
    190                           evals=evals,
    191                           obj=obj, feval=feval,
    192                           xgb_model=xgb_model, callbacks=callbacks,
    193                           verbose_eval=verbose_eval,
    194                           evals_result=evals_result,
    195                           maximize=maximize,
    196                           early_stopping_rounds=early_stopping_rounds)
    197     return bst

File ~/wd/venv/lib/python3.8/site-packages/xgboost/training.py:81, in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks, evals_result, maximize, verbose_eval, early_stopping_rounds)
     79 if callbacks.before_iteration(bst, i, dtrain, evals):
     80     break
---> 81 bst.update(dtrain, i, obj)
     82 if callbacks.after_iteration(bst, i, dtrain, evals):
     83     break

File ~/wd/venv/lib/python3.8/site-packages/xgboost/core.py:1680, in Booster.update(self, dtrain, iteration, fobj)
   1677 self._validate_features(dtrain)
   1679 if fobj is None:
-> 1680     _check_call(_LIB.XGBoosterUpdateOneIter(self.handle,
   1681                                             ctypes.c_int(iteration),
   1682                                             dtrain.handle))
   1683 else:
   1684     pred = self.predict(dtrain, output_margin=True, training=True)

File ~/wd/venv/lib/python3.8/site-packages/xgboost/core.py:218, in _check_call(ret)
    207 """Check the return value of C API call
    208 
    209 This function will raise exception when error occurs.
   (...)
    215     return value from API calls
    216 """
    217 if ret != 0:
--> 218     raise XGBoostError(py_str(_LIB.XGBGetLastError()))

XGBoostError: [13:46:18] ../src/gbm/gbtree.cc:588: Check failed: common::AllVisibleGPUs() >= 1 (0 vs. 1) : No visible GPU is found for XGBoost.
Stack trace:
  [bt] (0) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x179459) [0x7f00362cb459]
  [bt] (1) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x179b82) [0x7f00362cbb82]
  [bt] (2) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x17a1da) [0x7f00362cc1da]
  [bt] (3) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(+0x1b46e5) [0x7f00363066e5]
  [bt] (4) /root/wd/venv/lib/python3.8/site-packages/xgboost/lib/libxgboost.so(XGBoosterUpdateOneIter+0x68) [0x7f00361eb4e8]
  [bt] (5) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call_unix64+0x4c) [0x7f01f861cdae]
  [bt] (6) /usr/lib/x86_64-linux-gnu/libffi.so.6(ffi_call+0x22f) [0x7f01f861c71f]
  [bt] (7) /root/wd/venv/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(_ctypes_callproc+0x8ce) [0x7f01f883234e]
  [bt] (8) /root/wd/venv/lib/python3.8/lib-dynload/_ctypes.cpython-38-x86_64-linux-gnu.so(+0xe4e4) [0x7f01f882d4e4]

Any suggestions on how to make this work?

I tried to solve this using the answers from this related question:

XGBoostError: [10:10:03] /workspace/src/tree/updater_gpu_hist.cu:1407: Exception in gpu_hist: NCCL failure

but they didn't work for me.
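In case it's relevant: from the error message I assume the estimator requests the GPU histogram algorithm, so the CPU fallback would look roughly like this (this sidesteps the GPU requirement rather than fixing GPU visibility):

```python
from xgboost import XGBClassifier

# "hist" is the CPU counterpart of "gpu_hist"; with it, XGBoost no
# longer checks for a visible GPU. (tree_method="gpu_hist" on the
# original estimator is my assumption, based on the error above.)
estimator = XGBClassifier(tree_method="hist")
```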
