I want to train an XGBoost classifier with Coiled and Dask. The problem is that my training data is really big and is stored in an HDF5 file (written with h5py) on my local machine. Is there a way to upload this file directly to the workers?
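(To clarify what I mean by "upload directly to the workers": I imagined something along the lines of the sketch below. client.upload_file does exist in dask.distributed, but as far as I can tell from the docs it is intended for shipping Python code (.py/.egg/.zip) to the workers rather than multi-gigabyte data files, so I am not sure it applies here.)

# Hypothetical sketch of what I was hoping for, assuming a connected Client
# as in the example further down; probably not how upload_file is meant to be used:
client.upload_file("test.h5")  # would this place test.h5 in each worker's working directory?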
To show my problem I created an example: I generate some random data and store it in an HDF5 file so you can see what my data looks like. In my real use case, the data has 7245346 features and 2157 samples.
import coiled
import h5py
import numpy as np
import dask.array as da
from dask.distributed import Client
import xgboost as xgb

input_path = "test.h5"

# create some random data and store it in an HDF5 file
n_features = 500
n_samples = 200
X = np.random.randint(0, 3, size=[n_samples, n_features])
y = np.random.randint(0, 5, size=[n_samples])
with h5py.File(input_path, mode='w') as file:
    file.create_dataset('X', data=X)
    file.create_dataset('y', data=y)

rows_per_chunk = 100

coiled.create_software_environment(
    name="xgboost-on-coiled",
    pip=["coiled", "h5py", "dask", "xgboost"])

with coiled.Cluster(
        name="xgboost-cluster",
        n_workers=2,
        worker_cpu=8,
        worker_memory="16GiB",
        software="xgboost-on-coiled") as cluster:
    with Client(cluster) as client:
        # open the local HDF5 file and wrap the datasets as dask arrays
        file = h5py.File(input_path, mode='r')
        n_features = file["X"].shape[1]

        X = da.from_array(file["X"], chunks=(rows_per_chunk, n_features))
        X = X.astype("int8")
        X = X.persist()

        y = da.from_array(file["y"], chunks=rows_per_chunk)
        n_class = np.unique(y.compute()).size
        y = y.astype("int8")
        y = y.persist()

        dtrain = xgb.dask.DaskDMatrix(
            client,
            X,
            y,
            feature_names=['%i' % i for i in range(n_features)])

        model_params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'num_class': n_class}

        # train model
        output = xgb.dask.train(
            client,
            params=model_params,
            dtrain=dtrain)

        booster = output["booster"]
The error message:
FileNotFoundError: [Errno 2] Unable to open file (unable to open file: name = 'test.h5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)
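As far as I understand this error, da.from_array(file["X"], ...) only keeps a reference to the h5py dataset, the chunks are read lazily inside the worker tasks, and each worker then tries to open test.h5 on its own file system, where it does not exist. A quick check along these lines (just a sketch, assuming the client from the example above is still connected) should confirm that the file exists only on my local machine:

import os

# Run os.path.exists on every worker; I would expect False everywhere,
# since test.h5 only exists on my laptop.
print(client.run(os.path.exists, "test.h5"))
print(os.path.exists("test.h5"))  # True locally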
For smaller amounts of data I can load everything into RAM first and build the dask arrays from the in-memory NumPy arrays, but for the full data set this no longer works. Just so you know what I am talking about, here is that version:
import coiled
import h5py
import numpy as np
import dask.array as da
from dask.distributed import Client
import xgboost as xgb

input_path = "test.h5"

n_features = 500
n_samples = 200
X = np.random.randint(0, 3, size=[n_samples, n_features])
y = np.random.randint(0, 5, size=[n_samples])
with h5py.File(input_path, mode='w') as file:
    file.create_dataset('X', data=X)
    file.create_dataset('y', data=y)

rows_per_chunk = 100

coiled.create_software_environment(
    name="xgboost-on-coiled",
    pip=["coiled", "h5py", "dask", "xgboost"])

with coiled.Cluster(
        name="xgboost-cluster",
        n_workers=2,
        worker_cpu=8,
        worker_memory="16GiB",
        software="xgboost-on-coiled") as cluster:
    with Client(cluster) as client:
        file = h5py.File(input_path, mode='r')
        n_features = file["X"].shape[1]

        # read everything into local RAM first, then wrap as dask arrays
        X = file["X"][:]
        X = da.from_array(X, chunks=(rows_per_chunk, n_features))

        y = file["y"][:]
        n_class = np.unique(y).size
        y = da.from_array(y, chunks=rows_per_chunk)

        dtrain = xgb.dask.DaskDMatrix(
            client,
            X,
            y,
            feature_names=['%i' % i for i in range(n_features)])

        model_params = {
            'objective': 'multi:softprob',
            'eval_metric': 'mlogloss',
            'num_class': n_class}

        # train model
        output = xgb.dask.train(
            client,
            params=model_params,
            dtrain=dtrain)

        booster = output["booster"]
When this second version is run with the large data set, there is no error message; simply nothing happens, and I never see the data being uploaded to the cluster.
I have tried so many things and nothing has worked. I would be very grateful for any advice on how to do this.
(Just in case you are wondering why I am trying to train a model on 7 million features: I want to use the feature importances for feature selection.)
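For reference, once training works, this is roughly what I have in mind for the importance step (just a sketch; get_score is the Booster method I am thinking of, and "gain" and the cut-off of 1000 features are arbitrary choices for illustration):

# Get per-feature importance scores from the trained booster and keep the
# highest-ranked features (the number 1000 is arbitrary here).
importance = booster.get_score(importance_type="gain")  # dict: feature name -> score
top_features = sorted(importance, key=importance.get, reverse=True)[:1000]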