I am using RAPIDS cuML's UMAP together with HDBSCAN inside a RAPIDS Docker container (rapidsai/rapidsai-core:0.18-cuda11.0-runtime-ubuntu18.04-py3.7).
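For reference, the library versions inside the container can be checked with a quick snippet like the one below (the printed values depend on the image; shown only to document the environment). The full pipeline follows after it.

import cudf, cuml, hdbscan

# print the versions shipped in this container image
print(cudf.__version__, cuml.__version__, hdbscan.__version__)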
import cudf
import cupy
from cuml.manifold import UMAP
import hdbscan
from sklearn.datasets import make_blobs
from cuml.experimental.preprocessing import StandardScaler

# generate synthetic data and move it to the GPU
blobs, labels = make_blobs(n_samples=100000, n_features=10)
df_gpu = cudf.DataFrame(blobs)

# scale the features on the GPU
scaler = StandardScaler()
cupy_scaled = scaler.fit_transform(df_gpu.values)

# project to 3 dimensions with GPU UMAP
projector = UMAP(n_components=3, n_neighbors=2000)
cupy_projected = projector.fit_transform(cupy_scaled)

# copy the embedding back to host memory for CPU HDBSCAN
numpy_projected = cupy.asnumpy(cupy_projected)

# cluster on the CPU
clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True, gen_min_span_tree=True)  # , core_dist_n_jobs=1
clusterer.fit(numpy_projected)
I get an error. It goes away if I pass core_dist_n_jobs=1 (the exact call is shown after the traceback below), but that makes the clustering slower:
---------------------------------------------------------------------------
TerminatedWorkerError                     Traceback (most recent call last)
<ipython-input> in <module>
      1 clusterer= hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True, gen_min_span_tree=True)
----> 2 clusterer.fit(numpy_projected)

/opt/conda/envs/rapids/lib/python3.7/site-packages/hdbscan/hdbscan_.py in fit(self, X, y)
    917              self._condensed_tree,
    918              self._single_linkage_tree,
--> 919              self._min_spanning_tree) = hdbscan(X, **kwargs)
    920
    921         if self.prediction_data:

/opt/conda/envs/rapids/lib/python3.7/site-packages/hdbscan/hdbscan_.py in hdbscan(X, min_cluster_size, min_samples, alpha, cluster_selection_epsilon, metric, p, leaf_size, algorithm, memory, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, cluster_selection_method, allow_single_cluster, match_reference_implementation, **kwargs)
    613                                  approx_min_span_tree,
    614                                  gen_min_span_tree,
--> 615                                  core_dist_n_jobs, **kwargs)
    616         else:  # Metric is a valid BallTree metric
    617             # TO DO: Need heuristic to decide when to go to boruvka;

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
    350
    351     def __call__(self, *args, **kwargs):
--> 352         return self.func(*args, **kwargs)
    353
    354     def call_and_shelve(self, *args, **kwargs):

/opt/conda/envs/rapids/lib/python3.7/site-packages/hdbscan/hdbscan_.py in _hdbscan_boruvka_kdtree(X, min_samples, alpha, metric, p, leaf_size, approx_min_span_tree, gen_min_span_tree, core_dist_n_jobs, **kwargs)
    276                                  leaf_size=leaf_size // 3,
    277                                  approx_min_span_tree=approx_min_span_tree,
--> 278                                  n_jobs=core_dist_n_jobs, **kwargs)
    279     min_spanning_tree = alg.spanning_tree()
    280     # Sort edges of the min_spanning_tree by weight

hdbscan/_hdbscan_boruvka.pyx in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm.__init__()

hdbscan/_hdbscan_boruvka.pyx in hdbscan._hdbscan_boruvka.KDTreeBoruvkaAlgorithm._compute_bounds()

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
   1052
   1053             with self._backend.retrieval_context():
-> 1054                 self.retrieve()
   1055             # Make sure that we get a last message telling us we are done
   1056             elapsed_time = time.time() - self._start_time

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
    931             try:
    932                 if getattr(self._backend, 'supports_timeout', False):
--> 933                     self._output.extend(job.get(timeout=self.timeout))
    934                 else:
    935                     self._output.extend(job.get())

/opt/conda/envs/rapids/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
    540         AsyncResults.get from multiprocessing."""
    541         try:
--> 542             return future.result(timeout=timeout)
    543         except CfTimeoutError as e:
    544             raise TimeoutError from e

/opt/conda/envs/rapids/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
    433                 raise CancelledError()
    434             elif self._state == FINISHED:
--> 435                 return self.__get_result()
    436             else:
    437                 raise TimeoutError()

/opt/conda/envs/rapids/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
    382     def __get_result(self):
    383         if self._exception:
--> 384             raise self._exception
    385         else:
    386             return self._result

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {EXIT(1)}
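For completeness, the slower variant that does not crash simply passes core_dist_n_jobs=1 to HDBSCAN:

# workaround: restrict the core-distance computation to a single joblib worker
clusterer = hdbscan.HDBSCAN(min_cluster_size=1000, prediction_data=True,
                            gen_min_span_tree=True, core_dist_n_jobs=1)
clusterer.fit(numpy_projected)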
Is there a way to solve this issue while still keeping HDBSCAN fast?