I have this code (a complete reproducible example):

## Standard libraries
CHECKPOINT_PATH = "/home/ad1/test_predictor/new_dev_v1"
DATASET_PATH = "/home/ad1/test_predictor"
import torch
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
from importlib import reload
from itertools import *
import matplotlib
from itertools import groupby
from matplotlib import pyplot as plt
from matplotlib.colors import to_rgb
from openbabel import pybel
from openbabel.pybel import readstring,descs
from operator import itemgetter
from pathlib import Path
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from ray import tune
from ray.tune import CLIReporter
from ray.tune.integration.pytorch_lightning import TuneReportCallback, TuneReportCheckpointCallback
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
from sklearn import preprocessing
from sklearn.metrics import f1_score, precision_score, recall_score,roc_auc_score
from torch.nn import Linear
from torch.utils.data import TensorDataset
from torch_geometric.data import Data, Dataset,DataLoader,DenseDataLoader,InMemoryDataset
from torch_geometric.datasets import TUDataset
from torch_geometric.nn import GCNConv
from torch_geometric.nn import global_mean_pool
from torchmetrics.functional import precision_recall
from torchvision import transforms
from torchvision.datasets import CIFAR10
from tqdm.notebook import tqdm
import getpass, argparse
import joblib
import json
import logging
import math
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np 
import openbabel
import os
import pandas as pd
import pytorch_lightning as pl
import random
import re
import requests
import seaborn as sns
import sklearn
import sys
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
import torch_geometric
import torch_geometric.data as geom_data
import torch_geometric.nn as geom_nn
import torchmetrics
import torchvision
import warnings
matplotlib.rcParams['lines.linewidth'] = 2.0
pl.seed_everything(42)
print(device)
sns.reset_orig()
sns.set()
sys.path.append('/home/ad1/git/')
torch.backends.cudnn.deterministic = True
warnings.filterwarnings('ignore')

import warnings
warnings.filterwarnings("ignore", ".*does not have many workers.*")

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print(device)



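# MUTAG: small binary graph-classification benchmark of 188 molecular graphs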
dataset = TUDataset(root='/tmp/MUTAG', name='MUTAG', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

train_dataset = dataset #just for testing
val_dataset = dataset
test_dataset = dataset

graph_train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True) #changed because of error: The number of training samples (3) is smaller than the logging interval Trainer(log_every_n_steps=50)
graph_val_loader = DataLoader(val_dataset, batch_size=8) # Additional loader if you want to change to a larger dataset
graph_test_loader = DataLoader(test_dataset, batch_size=8)


#will change this when it makes sense
#config = {
#    "dropout": tune.uniform(0.4,0.5)
#    } 

config = {'dropout':0.4}

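# map layer-name strings to the corresponding PyTorch Geometric layer classes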
gnn_layer_by_name = {
    "GCN": geom_nn.GCNConv,
    "GAT": geom_nn.GATConv,
    "GraphConv": geom_nn.GraphConv
}

class GCNLayer(nn.Module):
    def __init__(self, c_in, c_out):
        super().__init__()
        self.projection = nn.Linear(c_in, c_out)
        

    def forward(self, node_feats, adj_matrix):
        """
        Inputs:
            node_feats - Tensor with node features of shape [batch_size, num_nodes, c_in]
            adj_matrix - Batch of adjacency matrices of the graph. If there is an edge from i to j, adj_matrix[b,i,j]=1 else 0.
                         Supports directed edges by non-symmetric matrices. Assumes to already have added the identity connections. 
                         Shape: [batch_size, num_nodes, num_nodes]
        """
        # Num neighbours = number of incoming edges
        num_neighbours = adj_matrix.sum(dim=-1, keepdims=True)
        node_feats = self.projection(node_feats)
        node_feats = torch.bmm(adj_matrix, node_feats)
        node_feats = node_feats / num_neighbours
        return node_feats

class GNNModel(nn.Module):
    
    def __init__(self, c_in, c_hidden, c_out, num_layers=2, layer_name="GCN", dp_rate=config['dropout'], **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of the output features. Usually number of classes in classification
            num_layers - Number of "hidden" graph layers
            layer_name - String of the graph layer to use
            dp_rate - Dropout rate to apply throughout the network
            kwargs - Additional arguments for the graph layer (e.g. number of heads for GAT)
        """
        super().__init__()
        gnn_layer = gnn_layer_by_name[layer_name]
        
        layers = []
        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers-1):
            layers += [
                gnn_layer(in_channels=in_channels, 
                          out_channels=out_channels,
                          **kwargs),
                nn.ReLU(inplace=True),
                nn.Dropout(dp_rate)
            ]
            in_channels = c_hidden
        layers += [gnn_layer(in_channels=in_channels, 
                             out_channels=c_out,
                             **kwargs)]
        self.layers = nn.ModuleList(layers)
    
    def forward(self, x, edge_index):
        """
        Inputs:
            x - Input features per node
            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
        """
        for l in self.layers:
            # For graph layers, we need to add the "edge_index" tensor as additional input
            # All PyTorch Geometric graph layer inherit the class "MessagePassing", hence
            # we can simply check the class type.
            if isinstance(l, geom_nn.MessagePassing):
                x = l(x, edge_index)
            else:
                x = l(x)
        return x



class GraphGNNModel(nn.Module):
    
    def __init__(self, c_in, c_hidden, c_out, dp_rate_linear=0.5, **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of output features (usually number of classes)
            dp_rate_linear - Dropout rate before the linear layer (usually much higher than inside the GNN)
            kwargs - Additional arguments for the GNNModel object
        """
        super().__init__()
        self.GNN = GNNModel(c_in=c_in, 
                            c_hidden=c_hidden, 
                            c_out=c_hidden, # Not our prediction output yet!
                            **kwargs)
        self.head = nn.Sequential(
            nn.Dropout(dp_rate_linear),
            nn.Linear(c_hidden, c_out)
        )

    def forward(self, x, edge_index, batch_idx):
        """
        Inputs:
            x - Input features per node
            edge_index - List of vertex index pairs representing the edges in the graph (PyTorch geometric notation)
            batch_idx - Index of batch element for each node
        """
        x = self.GNN(x, edge_index)
        x = geom_nn.global_mean_pool(x, batch_idx) # Average pooling
        x = self.head(x)
        return x


#see https://pytorch-lightning.readthedocs.io/en/stable/common/lightning_module.html
class GraphLevelGNN(pl.LightningModule):
    
    def __init__(self, **model_kwargs):
        super().__init__()
        # Saving hyperparameters
        self.save_hyperparameters()
        
        self.model = GraphGNNModel(**model_kwargs)
        self.loss_module = nn.BCEWithLogitsLoss() #if self.hparams.c_out == 1 else nn.CrossEntropyLoss()

    def forward(self, data, mode="train"):
        x, edge_index, batch_idx = data.x, data.edge_index, data.batch
        x = self.model(x, edge_index, batch_idx)
        x = x.squeeze(dim=-1)
        
        if self.hparams.c_out == 1:
            preds = (x > 0).float()
            data.y = data.y.float()
        else:
            preds = x.argmax(dim=-1)

        loss = self.loss_module(x, data.y)
        acc = (preds == data.y).sum().float() / preds.shape[0]
        f1 = f1_score(data.y.cpu(), preds.cpu())  ## sklearn expects (y_true, y_pred); change f1/precision/recall later, was just testing
        precision = precision_score(data.y.cpu(), preds.cpu())
        recall = recall_score(data.y.cpu(), preds.cpu())
        #roc_auc = roc_auc_score(preds,data.y)  ##ADD THIS BACK IN
        return loss, acc, f1,precision, recall

    def configure_optimizers(self):
        optimizer = optim.SGD(self.parameters(),lr=0.1) # High lr because of small dataset and small model
        return optimizer

    def training_step(self, batch, batch_idx):
        loss, acc, _,_, _ = self.forward(batch, mode="train")
        self.log('train_loss', loss,on_epoch=True,logger=True)
        self.log('train_acc', acc,on_epoch=True,logger=True)
        #self.log('train_precision',precision_and_recall)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, acc, _,_, _ = self.forward(batch, mode="val")
        self.log('val_acc', acc,on_epoch=True,logger=True)
        self.log('val_loss', loss,on_epoch=True,logger=True)

    def test_step(self, batch, batch_idx):
        loss, acc, f1,precision, recall = self.forward(batch, mode="test")
        self.log('test_acc', acc,on_epoch=True,logger=True)
        self.log('test_f1', f1,on_epoch=True,logger=True)
        self.log('test_precision', precision,on_epoch=True,logger=True)       
        self.log('test_recall', recall,on_epoch=True,logger=True) 
        #self.log('roc_auc', roc_auc,on_epoch=True,logger=True) 


from pytorch_lightning import loggers as pl_loggers
def train_graph_classifier(model_name, **model_kwargs):
    pl.seed_everything(42)
    
    # Create a PyTorch Lightning trainer with the generation callback
    root_dir = os.path.join(CHECKPOINT_PATH, "GraphLevel" + model_name)
    os.makedirs(root_dir, exist_ok=True)
    csv_logger = pl_loggers.CSVLogger(save_dir="logs/")

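    # report val_loss / val_acc back to Ray Tune and save a Lightning checkpoint
    # file named "ray_ckpt" at the end of every validation epoch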
    tune_report_callback = TuneReportCheckpointCallback(
        metrics={
            "val_loss": "val_loss",
            "val_acc": "val_acc",
        },
        filename="ray_ckpt",
        on="validation_end",
    )

    trainer = pl.Trainer(default_root_dir=root_dir,
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode="max", monitor="val_acc"),
                                    tune_report_callback],
                         gpus=1 if str(device).startswith("cuda") else 0,
                         max_epochs=3,
                         progress_bar_refresh_rate=1,
                         logger=csv_logger,
                         )

    trainer.logger._default_hp_metric = None # Optional logging argument that we don't need

    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, f"GraphLevel{model_name}.ckpt")

    if os.path.isfile(pretrained_filename):
        print("Found pretrained model, loading...")
        model = GraphLevelGNN.load_from_checkpoint(pretrained_filename)
    else:
        pl.seed_everything(42)
        model = GraphLevelGNN(c_in = dataset.num_node_features, 
                              c_out=1, #if tu_dataset.num_classes==2 else tu_dataset.num_classes, 
                              **model_kwargs)
        trainer.fit(model, graph_train_loader, graph_val_loader)
        model = GraphLevelGNN.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
        
    # Test best model on validation and test set
    #train_result = trainer.test(model, graph_train_loader, verbose=False)
    #test_result = trainer.test(model, graph_test_loader, verbose=False)
    #result = {"test": test_result[0]['test_acc'], "train": train_result[0]['test_acc']} 
    #return model, result
    return model

# Example of ASHA Scheduler
scheduler_asha = ASHAScheduler(
    max_t=100,
    grace_period=1,
    reduction_factor=2,
)

reporter = CLIReporter(
    parameter_columns=['dropout'],
    metric_columns=["val_loss", "val_acc", "training_iteration"]
)

def train_fn(config):
    train_graph_classifier(
        model_name="GraphConv", 
        layer_name="GraphConv",
        **config)

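# run 16 trials over the search space with the ASHA scheduler,
# selecting the best trial by minimum val_loss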
analysis = tune.run(
    train_fn,
    config={
        # provide your hyperparameter search space here
        "c_hidden": tune.choice([64, 128]),
        "dp_rate_linear": tune.quniform(0.0, 1.0, 0.1),
        "num_layers":tune.choice([3,4]),
        "dp_rate":tune.choice([0.0,0.1])
        # ...
    },
    local_dir='/home/ad1/test_predictor/ray_ckpt2',  # path for saving checkpoints
    metric="val_loss",
    mode="min",
    num_samples=16,
    scheduler=scheduler_asha,
    progress_reporter=reporter,
    name="test")

print(analysis.best_checkpoint)

with analysis.best_checkpoint.as_directory() as tmpdir:
    trainer = GraphLevelGNN.load_from_checkpoint(tmpdir)

The output is (note that I have cut some of the repetition in the output, e.g. the epoch progress bars, due to the character limit):

| train_fn_90084_00009 | RUNNING    | 172.17.0.2:28893 |         64 |            |           |                      |                     |
| train_fn_90084_00000 | TERMINATED | 172.17.0.2:28828 |         64 |   0.636989 |   0.65625 |                    3 |
+----------------------+------------+------------------+------------+------------+-----------+----------------------+


Result for train_fn_90084_00007:
(train_fn pid=28880) 
  date: 2022-08-18_08-27-21
  done: false
  experiment_id: d0744ba26840481db2524413cd971bf9
  hostname: 0e26c6a24ffa
  iterations_since_restore: 1
  node_ip: 172.17.0.2
  pid: 28884
  should_checkpoint: true
  time_since_restore: 0.5186853408813477
  time_this_iter_s: 0.5186853408813477
  time_total_s: 0.5186853408813477
  timestamp: 1660811241
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '90084_00007'
  val_acc: 0.65625
  val_loss: 0.788177490234375
  warmup_time: 0.0034301280975341797
(train_fn pid=28902) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:276: LightningDeprecationWarning: The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7. Please use the `on_exception` callback hook instead.
(train_fn pid=28902)   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
(train_fn pid=28902)   | Name        | Type              | Params
(train_fn pid=28902) 0 | model       | GraphGNNModel     | 67.8 K
(train_fn pid=28902) 1 | loss_module | BCEWithLogitsLoss | 0     
(train_fn pid=28902) 67.8 K    Trainable params
(train_fn pid=28902) 0         Non-trainable params
(train_fn pid=28902) 67.8 K    Total params
(train_fn pid=28902) 0.271     Total estimated model params size (MB)
(train_fn pid=28902) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:133: UserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 64 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
(train_fn pid=28902)   f"The dataloader, {name}, does not have many workers which may be a bottleneck."
(train_fn pid=28902) /root/miniconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior.
(train_fn pid=28902)   _warn_prf(average, modifier, msg_start, len(result))
(train_fn pid=28902) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/utilities/data.py:60: UserWarning: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 10. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
(train_fn pid=28902)   "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
(train_fn pid=28902) Global seed set to 42                              
(train_fn pid=28877) Global seed set to 42
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/connectors/callback_connector.py:91: LightningDeprecationWarning: Setting `Trainer(progress_bar_refresh_rate=1)` is deprecated in v1.5 and will be removed in v1.7. Please pass `pytorch_lightning.callbacks.progress.TQDMProgressBar` with `refresh_rate` directly to the Trainer's `callbacks` argument instead. Or, to disable the progress bar pass `enable_progress_bar = False` to the Trainer.
(train_fn pid=28877)   f"Setting `Trainer(progress_bar_refresh_rate={progress_bar_refresh_rate})` is deprecated in v1.5 and"
(train_fn pid=28877) GPU available: False, used: False
(train_fn pid=28877) TPU available: False, using: 0 TPU cores
(train_fn pid=28877) IPU available: False, using: 0 IPUs
(train_fn pid=28877) Global seed set to 42
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/configuration_validator.py:276: LightningDeprecationWarning: The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7. Please use the `on_exception` callback hook instead.
(train_fn pid=28877)   "The `on_keyboard_interrupt` callback hook was deprecated in v1.5 and will be removed in v1.7."
(train_fn pid=28877) 
(train_fn pid=28877)   | Name        | Type              | Params
(train_fn pid=28877) --------------------------------------------------
(train_fn pid=28877) 0 | model       | GraphGNNModel     | 67.8 K
(train_fn pid=28877) 1 | loss_module | BCEWithLogitsLoss | 0     
(train_fn pid=28877) 67.8 K    Trainable params
(train_fn pid=28877) 0         Non-trainable params
(train_fn pid=28877) 67.8 K    Total params
(train_fn pid=28877) 0.271     Total estimated model params size (MB)
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:133: UserWarning: The dataloader, val_dataloader 0, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 64 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
(train_fn pid=28877)   f"The dataloader, {name}, does not have many workers which may be a bottleneck."
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/sklearn/metrics/_classification.py:1221: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 due to no true samples. Use `zero_division` parameter to control this behavior.
(train_fn pid=28877)   _warn_prf(average, modifier, msg_start, len(result))
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/utilities/data.py:60: UserWarning: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 10. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
(train_fn pid=28877)   "Trying to infer the `batch_size` from an ambiguous collection. The batch size we"
(train_fn pid=28877) Global seed set to 42
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:133: UserWarning: The dataloader, train_dataloader, does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` (try 64 which is the number of cpus on this machine) in the `DataLoader` init to improve performance.
(train_fn pid=28877)   f"The dataloader, {name}, does not have many workers which may be a bottleneck."
(train_fn pid=28877) /root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/trainer/data_loading.py:433: UserWarning: The number of training samples (24) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.
(train_fn pid=28877)   f"The number of training samples ({self.num_training_batches}) is smaller than the logging interval"

(train_fn pid=28889) 
Result for train_fn_90084_00012:
  date: 2022-08-18_08-27-22
  done: false
  experiment_id: c4c4dfbf3f3248f8a682441a1bbc29ec
  hostname: 0e26c6a24ffa
  iterations_since_restore: 1
  node_ip: 172.17.0.2
  pid: 28905
  should_checkpoint: true
  time_since_restore: 0.47632789611816406
  time_this_iter_s: 0.47632789611816406
  time_total_s: 0.47632789611816406
  timestamp: 1660811242
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '90084_00012'
  val_acc: 0.65625
  val_loss: 0.7415358424186707
  warmup_time: 0.0031974315643310547



Result for train_fn_90084_00008:
  date: 2022-08-18_08-27-22
  done: true
  experiment_id: 3102c377e0114cc6b5cf79030dea327c
  experiment_tag: 8_c_hidden=128,dp_rate=0.0000,dp_rate_linear=0.5000,num_layers=3
  hostname: 0e26c6a24ffa
  iterations_since_restore: 3
  node_ip: 172.17.0.2
  pid: 28889
  should_checkpoint: true
  time_since_restore: 1.370271921157837
  time_this_iter_s: 0.41094374656677246
  time_total_s: 1.370271921157837
  timestamp: 1660811242
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '90084_00008'
  val_acc: 0.65625
  val_loss: 0.7269810438156128
  warmup_time: 0.003197193145751953
   
== Status ==                                                
Current time: 2022-08-18 08:27:23 (running for 00:00:09.05)
Memory usage on this node: 17.5/86.4 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: -0.5794828534126282 | Iter 1.000: -0.8018137514591217
Resources requested: 0/64 CPUs, 0/0 GPUs, 0.0/61.59 GiB heap, 0.0/9.31 GiB objects
Current best trial: 90084_00000 with val_loss=0.6369893550872803 and parameters={'c_hidden': 64}
Result logdir: /home/test_predictor/ray_ckpt2/test
Number of trials: 16/16 (16 TERMINATED)
+----------------------+------------+------------------+------------+------------+-----------+----------------------+
| Trial name           | status     | loc              |   c_hidden |   val_loss |   val_acc |   training_iteration |
|----------------------+------------+------------------+------------+------------+-----------+----------------------|
| train_fn_90084_00000 | TERMINATED | 172.17.0.2:28828 |         64 |   0.636989 |  0.65625  |                    3 |
+----------------------+------------+------------------+------------+------------+-----------+----------------------+


2022-08-18 08:27:23,358 INFO tune.py:748 -- Total run time: 9.21 seconds (9.01 seconds for the tuning loop).
/home/ad1/test_predictor/ray_ckpt2/test/train_fn_90084_00000_0_c_hidden=64,dp_rate=0.0000,dp_rate_linear=0.8000,num_layers=3_2022-08-18_08-27-14/checkpoint_epoch=1-step=47/
Traceback (most recent call last):
  File "test_pytorch.py", line 384, in <module>
    trainer = GraphLevelGNN.load_from_checkpoint(tmpdir)
  File "/root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/core/saving.py", line 134, in load_from_checkpoint
    checkpoint = pl_load(checkpoint_path, map_location=lambda storage, loc: storage)
  File "/root/miniconda3/lib/python3.7/site-packages/pytorch_lightning/utilities/cloud_io.py", line 37, in load
    with fs.open(path_or_url, "rb") as f:
  File "/root/miniconda3/lib/python3.7/site-packages/fsspec/spec.py", line 1043, in open
    **kwargs,
  File "/root/miniconda3/lib/python3.7/site-packages/fsspec/implementations/local.py", line 159, in _open
    return LocalFileOpener(path, mode, fs=self, **kwargs)
  File "/root/miniconda3/lib/python3.7/site-packages/fsspec/implementations/local.py", line 254, in __init__
    self._open()
  File "/root/miniconda3/lib/python3.7/site-packages/fsspec/implementations/local.py", line 259, in _open
    self.f = open(self.path, mode=self.mode)
IsADirectoryError: [Errno 21] Is a directory: '/home/ad1/test_predictor/ray_ckpt2/test/train_fn_90084_00000_0_c_hidden=64,dp_rate=0.0000,dp_rate_linear=0.8000,num_layers=3_2022-08-18_08-27-14/checkpoint_epoch=1-step=47'

What I was expecting as output:

  1. A model that has been through HPO, saved so that I can read it back in.

  2. A list of the parameters that were used in the best model selected.

  3. A logger file with a list of the test/validation metrics using the best model.

I have all the elements of this in my script, but I get an error about a directory. I had deleted /home/ad1/test_predictor/ray_ckpt2/ completely beforehand, so there is no way that directory should already be there.

If someone could explain how to fix this and get it working, I'd appreciate it.
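
My current guess is that load_from_checkpoint wants the checkpoint file inside that directory rather than the directory itself, i.e. something like the sketch below (untested; it assumes the file inside the Tune checkpoint directory keeps the ray_ckpt name I passed to TuneReportCheckpointCallback):

with analysis.best_checkpoint.as_directory() as tmpdir:
    # point load_from_checkpoint at the checkpoint *file* written by
    # TuneReportCheckpointCallback, not at the checkpoint directory itself
    model = GraphLevelGNN.load_from_checkpoint(os.path.join(tmpdir, "ray_ckpt"))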

Update: I partially understand my problem. If I comment out:

#with analysis.best_checkpoint.as_directory() as tmpdir:
#    trainer = GraphLevelGNN.load_from_checkpoint(tmpdir)

The output is:

Epoch 2: 100%|██████████| 48/48 [00:00<00:00, 109.54it/s, loss=0.538, v_num=0]
2022-08-18 14:51:32,391 INFO tune.py:748 -- Total run time: 9.10 seconds (8.90 seconds for the tuning loop).
/home/ad1/test_predictor/ray_ckpt2/test/train_fn_3a64c_00001_1_c_hidden=64,dp_rate=0.0000,dp_rate_linear=0.2000,num_layers=3_2022-08-18_14-51-26/checkpoint_epoch=1-step=47/

I guess that is the checkpoint of the best model from the HPO run, but I still don't understand how to load it back in and use it to evaluate the data and return the different metrics.
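
What I am ultimately aiming for is something along these lines (a rough sketch only; it assumes the checkpoint file is named ray_ckpt as above and that the hyperparameters stored by save_hyperparameters() are enough to rebuild the model):

# restore the best model found by Ray Tune
with analysis.best_checkpoint.as_directory() as tmpdir:
    best_model = GraphLevelGNN.load_from_checkpoint(os.path.join(tmpdir, "ray_ckpt"))

# hyperparameters of the best trial
print(analysis.best_config)

# evaluate the restored model and log the test metrics
eval_trainer = pl.Trainer(gpus=1 if str(device).startswith("cuda") else 0,
                          logger=pl_loggers.CSVLogger(save_dir="logs/"))
test_result = eval_trainer.test(best_model, graph_test_loader, verbose=False)
print(test_result)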
