I am simply trying to understand how to format a config file so that multi-GPU/distributed training works via the "train" command.
The only clear tutorial I can find, "Tutorial: How to train with multiple GPUs in AllenNLP", appears to target much older versions of AllenNLP and no longer works: the trainer's "distributed" argument is now a bool and will not accept a list of CUDA device IDs.
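For reference, I'm launching training with the standard CLI, along these lines (the config path, output directory, and package name are placeholders for my setup):

allennlp train my_config.jsonnet --serialization-dir output --include-package declutr

My first attempt keeps the device list and distributed settings inside the trainer block: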
"trainer": {
// Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it)
"use_amp": true,
"cuda_devices": [7,8],
"optimizer": {
"type": "huggingface_adamw",
"lr": 5e-5,
"eps": 1e-06,
"correct_bias": false,
"weight_decay": 0.1,
"parameter_groups": [
// Apply weight decay to pre-trained params, excluding LayerNorm params and biases
[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}],
],
},
"callbacks":[{"type":'tensorboard'}],
"num_epochs": 10,
"checkpointer": {
// A value of null or -1 will save the weights of the model at the end of every epoch
"keep_most_recent_by_count": 2,
},
"grad_norm": 1.0,
"learning_rate_scheduler": {
"type": "slanted_triangular",
},
"distributed": {"cuda_devices": [7,8],},
"world_size": 2,
},
}
This leads to:
    kwargs = create_kwargs(constructor_to_inspect, cls, params, **extras)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 206, in create_kwargs
    constructed_arg = pop_and_construct_arg(
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 314, in pop_and_construct_arg
    return construct_arg(class_name, name, popped_params, annotation, default, **extras)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 363, in construct_arg
    raise TypeError(f"Expected {argument_name} to be a {annotation.__name__}.")
TypeError: Expected distributed to be a bool.
Shifting towards the AllenNLP v2.10 style by setting "distributed" to a bool and providing the CUDA device IDs as a list under "cuda_devices" then leads to the following:
"trainer": {
// Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it)
"use_amp": true,
"cuda_devices": [7,8],
"optimizer": {
"type": "huggingface_adamw",
"lr": 5e-5,
"eps": 1e-06,
"correct_bias": false,
"weight_decay": 0.1,
"parameter_groups": [
// Apply weight decay to pre-trained params, excluding LayerNorm params and biases
[["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}],
],
},
"callbacks":[{"type":'tensorboard'}],
"num_epochs": 10,
"checkpointer": {
// A value of null or -1 will save the weights of the model at the end of every epoch
"keep_most_recent_by_count": 2,
},
"grad_norm": 1.0,
"learning_rate_scheduler": {
"type": "slanted_triangular",
},
"distributed": true,
"world_size": 2
},
}
With the following error:
File "/home/niallt/DeCLUTR/allennlp/allennlp/commands/train.py", line 786, in from_partial_objects
trainer_ = trainer.construct(
File "/home/niallt/DeCLUTR/allennlp/allennlp/common/lazy.py", line 82, in construct
return self.constructor(**contructor_kwargs)
File "/home/niallt/DeCLUTR/allennlp/allennlp/common/lazy.py", line 66, in constructor_to_use
return self._constructor.from_params( # type: ignore[union-attr]
File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 604, in from_params
return retyped_subclass.from_params(
File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 638, in from_params
return constructor_to_call(**kwargs) # type: ignore
File "/home/niallt/DeCLUTR/allennlp/allennlp/training/gradient_descent_trainer.py", line 1154, in from_partial_objects
ddp_accelerator = TorchDdpAccelerator(cuda_device=cuda_device)
File "/home/niallt/DeCLUTR/allennlp/allennlp/nn/parallel/ddp_accelerator.py", line 138, in __init__
super().__init__(local_rank=local_rank, world_size=world_size, cuda_device=cuda_device)
File "/home/niallt/DeCLUTR/allennlp/allennlp/nn/parallel/ddp_accelerator.py", line 102, in __init__
self.local_rank: int = local_rank if local_rank is not None else dist.get_rank()
File "/home/niallt/venvs/39_declutr/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 844, in get_rank
default_pg = _get_default_group()
File "/home/niallt/venvs/39_declutr/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 429, in _get_default_group
raise RuntimeError(
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
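For context on that last error: init_process_group is PyTorch's own distributed setup, which a hand-rolled DDP script would normally call once per worker before building the model. A minimal plain-PyTorch sketch of what that call looks like (the backend, master address, and port below are placeholder assumptions, and none of this is AllenNLP code):

# Plain-PyTorch sketch of the call the error refers to; not AllenNLP code.
# Backend, master address, and port are placeholder assumptions.
import os
import torch.distributed as dist

def init_worker(rank: int, world_size: int) -> None:
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group(backend="nccl", rank=rank, world_size=world_size)

My understanding is that "allennlp train" is supposed to spawn the workers and make this call itself when the config asks for distributed training, rather than the trainer being constructed directly in the main process as the traceback shows.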
I'm guessing I may just be missing some key arguments here, but I'm struggling to determine which.
Any help would be much appreciated.