
When I try to use Ray Tune for hyper-parameter optimization, the error below occurs.

RuntimeError: No CUDA GPUs are available
(main pid=4099) *** SIGSEGV received at time=1664685800 on cpu 0 ***
(main pid=4099) PC: @     0x7f7999651050  (unknown)  (unknown)
2022-10-02 04:43:20,455 WARNING worker.py:1829 -- A worker died or was killed while executing a task by an unexpected system error. To troubleshoot the problem, check the logs for the dead worker. RayTask ID: ffffffffffffffff7e397495e9840bc1819f011601000000 Worker ID: e9371df84e6c8ca09a2cf2da974ba9e78e9e125beb9488b22dc5a74f Node ID: a898df022b143e3de733f832dfee96aef8385bc6402e8a94da61e9ea Worker IP address: 172.28.0.2 Worker port: 41737 Worker PID: 4099 Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.
2022-10-02 04:43:20,456 ERROR trial_runner.py:980 -- Trial main_b7e58_00000: Error processing event.
ray.tune.error._TuneNoNextExecutorEventError: Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/ray/tune/execution/ray_trial_executor.py", line 989, in get_next_executor_event
    future_result = ray.get(ready_future)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/client_mode_hook.py", line 105, in wrapper
    return func(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ray/_private/worker.py", line 2277, in get
    raise value
ray.exceptions.RayActorError: The actor died unexpectedly before finishing this task.
    class_name: wrap_function.<locals>.ImplicitFunc
    actor_id: 7e397495e9840bc1819f011601000000
    pid: 4099
    namespace: 8c989dd0-b724-425a-96f7-f4bb2992fe5a
    ip: 172.28.0.2
The actor is dead because its worker process has died. Worker exit type: SYSTEM_ERROR Worker exit detail: Worker unexpectedly exits with a connection error code 2. End of file. There are some potential root causes. (1) The process is killed by SIGKILL by OOM killer due to high memory usage. (2) ray stop --force is called. (3) The worker is crashed unexpectedly due to SIGSEGV or other unexpected errors.

In my tune.run() call, I set the GPU resources per trial (resources_per_trial) like this:

def run_search():

  for i in range(len(subj_list)):
    output_dir = '/content/drive/MyDrive/_Results___es_patience20__train_counts1_batch32_lr5e-06_w-decay0.00025'
    subj_dir = output_dir + '/' + subj_list[i]

    config = {
      'lr_init': tune.quniform(1e-8, 1e-3, 5e-9),
      'weight_decay': tune.qloguniform(1e-4, 1e-2, 5e-5)
    }

    scheduler = ASHAScheduler(
      metric = 'clip_corr',
      mode = 'max',
      max_t = 5000,
      grace_period = 1, # run at least 1 epoch even if training is not going well
      reduction_factor = 2
      )
  
    reporter = CLIReporter(
      metric_columns = ['train_loss','train_clip_corr','val_loss','val_clip_corr']
    )

    result = tune.run(main,
                      config = config,
                      num_samples = 1,
                      resources_per_trial = {'cpu':8,'gpu':1},
                      verbose = 3,
                      scheduler = scheduler,
                      local_dir = subj_dir,
                      keep_checkpoints_num = 1, # the goal is to maximize val_clip_corr
                      checkpoint_score_attr = 'val_clip_corr',
                      progress_reporter = reporter
                      )
    
    ### extract the best trial run from the search ###
    best_trial = result.get_best_trial(
        'val_clip_corr','max','last'
    )
    
    print('Best trial config :{}'.format(best_trial.config))
    print('Best trial final val_loss : {}'.format(best_trial.last_result['val_loss']))
    print('Best trial final val_clip_corr : {}'.format(best_trial.last_result['val_clip_corr']))

if __name__ == '__main__':
  run_search()

Then, in the main() function, I set the device to cuda:0, but I still cannot use the GPU properly.

I also move the model to cuda:0 in the cell where the model is defined, using model.cuda(gpu_id).
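Roughly, the device setup I mean is like this (a simplified sketch; the placeholder model and the CPU fallback are only for illustration):

import torch
import torch.nn as nn

gpu_id = 0
# Placeholder model just for this sketch; in my code it is the CRNN returned by transfer_model()
model = nn.Linear(4, 2)

# Use cuda:0 when a GPU is visible, otherwise fall back to CPU
device = torch.device('cuda:{}'.format(gpu_id) if torch.cuda.is_available() else 'cpu')
model = model.to(device)  # same effect as model.cuda(gpu_id) when CUDA is available
print(device)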

For more detail, I'll attach the transfer_model() code below main(). In transfer_model(), I want to load parameters saved from previous training of the same model.

def main(config,
         gpu_id = 0,
         num_epochs = 5000,
         pretrained_type = 'IO',
         pretrained_model = '',
         train_counts = 1,
         freeze_layer = ['cnn'],
         overfitting = False,
         early_stopping = False
         ):
  

  seeder(seed)

  output_dir = '/content/drive/MyDrive/_Results___es_patience20__train_counts1_batch32_lr5e-06_w-decay0.00025' 
  print(output_dir)

  for i in range(len(subj_list)):

    pretrained_model = output_dir 
    subj_dir = output_dir + '/' + subj_list[i]

    # get data loader
    train_loader = load_data(scaling = False,
                           downscale_median = True,
                           augmentation = True,
                           train_loader = True)
  
    val_loader = load_data(scaling = False,
                         downscale_median = True,
                         augmentation = True,
                         val_loader = True)
  
    test_loader = load_data(scaling = False,
                         downscale_median = True,
                         augmentation = True,
                         test_loader = True)

    
    

    if pretrained_type == "FT":
      sub_pretrained_model = pretrained_model

    if pretrained_type == "IO":
      sub_pretrained_model = pretrained_model + '/' + subj_list[i]

    
    model = transfer_model(train_counts = 1,
                           pre_model_path = sub_pretrained_model,
                           verbose = True,
                           gpu_id = 0)
    
    #device = os.environ['CUDA_VISIBLE_DEVICES'] = '0'
    model.cuda(gpu_id) 

    for param in model.parameters():
      param.requires_grad = True

    if 'cnn' in freeze_layer:
      for param in model.cnn.parameters():
        param.requires_grad = False

    if 'tdm' in freeze_layer:
      for param in model.tdm.parameters():
        param.requires_grad = False
    
    if 'u_cnn_5' in freeze_layer:
      for idx, param in enumerate(model.cnn.parameters()):
        if idx < 34:
          param.requires_grad = False

    if 'rnn' in freeze_layer:
      for param in model.rnn.parameters():
        param.requires_grad = False

    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(filter(lambda p:p.requires_grad, model.parameters()),
                           lr = config['lr_init'],
                           weight_decay = config['weight_decay'])

    
    if not overfitting:
      es = EarlyStopping(patience=20)


    for epoch in range(num_epochs):

      if epoch == 0:
        pass

      else:
        train_loss, train_clip_corr = train1(train_loader,
                                             model,
                                             criterion,
                                             optimizer,
                                             gpu_id = 0
                                             )
        
        val_loss, val_clip_corr = validate1(val_loader,
                                            model,
                                            gpu_id,
                                            criterion,
                                            corr_w = 1.0,
                                            loss_type = 'MSE&Cosine',
                                            score_metric = 'spearmanr')
    
    print('train_loss : {}'.format(train_loss))
    print('train_clip_corr:{}'.format(train_clip_corr))
    print('val_loss :{}'.format(val_loss))
    print('val_clip_corr :{}'.format(val_clip_corr))

    with tune.checkpoint_dir(epoch) as checkpoint_dir:
      path = os.path.join(checkpoint_dir, 'checkpoint')
      torch.save((model.state_dict(), optimizer.state_dict()), path)

    tune.report(
        train_loss = train_loss,
        train_clip_corr = np.mean(train_clip_corr),
        val_loss = val_loss,
        val_clip_corr = np.mean(val_clip_corr)
    )


def transfer_model(train_counts,
                   pre_model_path,
                   verbose=False,
                   gpu_id = 0):
    """"""

    if train_counts != 0:
        model = CRNN_VGG_BN_3FC_MaxPool(verbose=verbose, 
                                        gpu_id = 1, 
                                        train_counts=train_counts-1)
        model = add_tdm_layer(model, train_counts)

        if train_counts != 1:
            model = add_t_out(model, train_counts-1)
            model.cnn = model.cnn[:-1]
            
    if torch.cuda.is_available():
        model.load_state_dict(torch.load(pre_model_path + '/best_weight.pkl'),
                              strict = False)
    else:
        model.load_state_dict(torch.load(pre_model_path + '/best_weight.pkl',
                                         map_location = 'cpu'),
                              strict = False)

    if train_counts == 0:
        model = CRNN_VGG_BN_3FC_MaxPool(verbose=verbose,
                        gpu_id = 1,
                        train_counts=train_counts)
        
        model.load_state_dict(torch.load(pre_model_path + '/best_weight.pkl'), 
                                              strict = False)
    if verbose:
        print(model)
    
    model.cuda(gpu_id)
    
    return model
    

1 Answer


You can try to initialize Ray explicitly with ray.init() before using it:

if __name__ == '__main__':
  ray.init(num_gpus=1, num_cpus=16)
  run_search()

Ray should auto-detect GPUs, but detection sometimes fails, so it can help to initialize it explicitly.
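As a quick sanity check (just a sketch, using the same resource numbers as above), you can confirm that Ray registered the GPU and that a task requesting a GPU can actually see it:

import ray
import torch

ray.init(num_gpus=1, num_cpus=16)

# The cluster resources should include 'GPU': 1.0
print(ray.cluster_resources())

@ray.remote(num_gpus=1)
def check_gpu():
    # Ray sets CUDA_VISIBLE_DEVICES for tasks that request GPUs,
    # so torch should see the device inside this task
    return ray.get_gpu_ids(), torch.cuda.is_available()

print(ray.get(check_gpu.remote()))

If the task reports no GPU while the driver does see one, the problem is in how Ray hands the GPU to its workers rather than in the training code itself.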

The other ray.init options are documented here:

https://docs.ray.io/en/latest/ray-core/package-ref.html#ray-init
