I am simply trying to understand how to format a config file so that multi-GPU/distributed training can be run via the "train" command.

The only clear tutorial I could find, "Tutorial: How to train with multiple GPUs in AllenNLP", is seemingly for much older versions of AllenNLP and does not work, as the "distributed" argument is now a bool and will not accept a list of CUDA device IDs.

"trainer": {
        // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it)
        "use_amp": true,
        "cuda_devices": [7,8],
        "optimizer": {
            "type": "huggingface_adamw",
            "lr": 5e-5,
            "eps": 1e-06,
            "correct_bias": false,
            "weight_decay": 0.1,
            "parameter_groups": [
                // Apply weight decay to pre-trained params, excluding LayerNorm params and biases
                [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}],
            ],
        },
        "callbacks":[{"type":'tensorboard'}],
        "num_epochs": 10,
        "checkpointer": {
            // A value of null or -1 will save the weights of the model at the end of every epoch
            "keep_most_recent_by_count": 2,
        },
        "grad_norm": 1.0,
        "learning_rate_scheduler": {
            "type": "slanted_triangular",
        },
        "distributed": {"cuda_devices": [7,8],},
        "world_size": 2,   
    },   
}

This leads to:

    kwargs = create_kwargs(constructor_to_inspect, cls, params, **extras)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 206, in create_kwargs
    constructed_arg = pop_and_construct_arg(
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 314, in pop_and_construct_arg
    return construct_arg(class_name, name, popped_params, annotation, default, **extras)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 363, in construct_arg
    raise TypeError(f"Expected {argument_name} to be a {annotation.__name__}.")
TypeError: Expected distributed to be a bool.

I then tried to shift towards allennlp v2.10 by setting "distributed" to a bool and providing "cuda_devices" as a list:

"trainer": {
        // Set use_amp to true to use automatic mixed-precision during training (if your GPU supports it)
        "use_amp": true,
        "cuda_devices": [7,8],
        "optimizer": {
            "type": "huggingface_adamw",
            "lr": 5e-5,
            "eps": 1e-06,
            "correct_bias": false,
            "weight_decay": 0.1,
            "parameter_groups": [
                // Apply weight decay to pre-trained params, excluding LayerNorm params and biases
                [["bias", "LayerNorm\\.weight", "layer_norm\\.weight"], {"weight_decay": 0}],
            ],
        },
        "callbacks":[{"type":'tensorboard'}],
        "num_epochs": 10,
        "checkpointer": {
            // A value of null or -1 will save the weights of the model at the end of every epoch
            "keep_most_recent_by_count": 2,
        },
        "grad_norm": 1.0,
        "learning_rate_scheduler": {
            "type": "slanted_triangular",
        },
        "distributed": true,
        "world_size": 2    
    },   
}

This fails with the following error:

  File "/home/niallt/DeCLUTR/allennlp/allennlp/commands/train.py", line 786, in from_partial_objects
    trainer_ = trainer.construct(
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/lazy.py", line 82, in construct
    return self.constructor(**contructor_kwargs)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/lazy.py", line 66, in constructor_to_use
    return self._constructor.from_params(  # type: ignore[union-attr]
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 604, in from_params
    return retyped_subclass.from_params(
  File "/home/niallt/DeCLUTR/allennlp/allennlp/common/from_params.py", line 638, in from_params
    return constructor_to_call(**kwargs)  # type: ignore
  File "/home/niallt/DeCLUTR/allennlp/allennlp/training/gradient_descent_trainer.py", line 1154, in from_partial_objects
    ddp_accelerator = TorchDdpAccelerator(cuda_device=cuda_device)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/nn/parallel/ddp_accelerator.py", line 138, in __init__
    super().__init__(local_rank=local_rank, world_size=world_size, cuda_device=cuda_device)
  File "/home/niallt/DeCLUTR/allennlp/allennlp/nn/parallel/ddp_accelerator.py", line 102, in __init__
    self.local_rank: int = local_rank if local_rank is not None else dist.get_rank()
  File "/home/niallt/venvs/39_declutr/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 844, in get_rank
    default_pg = _get_default_group()
  File "/home/niallt/venvs/39_declutr/lib/python3.9/site-packages/torch/distributed/distributed_c10d.py", line 429, in _get_default_group
    raise RuntimeError(
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

I'm guessing I may just be missing some key arguments here, but I'm struggling to determine what.
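For what it's worth, from skimming allennlp/commands/train.py it looks to me as though the distributed settings might be meant to sit at the top level of the config, as a sibling of "trainer" rather than inside it, with the train command then spawning one worker per listed device and filling in the trainer's "distributed"/"world_size" arguments itself. The sketch below is what I would try next, but I haven't been able to confirm that this is the intended format (the device IDs are just from my setup, and the trimmed-down trainer is only for illustration):

{
    // dataset_reader, model, data_loader, etc. stay the same as in my config above
    "trainer": {
        "use_amp": true,
        "optimizer": {
            "type": "huggingface_adamw",
            "lr": 5e-5
        },
        "num_epochs": 10
        // no "cuda_devices", "distributed" or "world_size" in here -- my assumption
        // is that the train command sets these per worker process
    },
    // distributed settings as a top-level sibling of "trainer" (my guess)
    "distributed": {
        "cuda_devices": [7, 8]
    }
}

If that is the right shape, I would presumably just launch it the same way as before with allennlp train my_config.jsonnet -s <output_dir> and no extra GPU flags, but again I haven't confirmed this.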

Any help would be much appreciated.
