
I am using a U-Net model for semantic segmentation. I have a custom dataset of images and their masks, both in .png format. I have looked through online forums and tried several suggestions, but nothing has worked so far. Any suggestions on how to resolve the error or improve the code would be helpful.

import copy
import os

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import ReduceLROnPlateau

# model, train_dl, val_dl and device are defined earlier in the notebook

model.eval()
with torch.no_grad():
    for xb, yb in val_dl:
        yb_pred = model(xb.to(device))
        # yb_pred = yb_pred["out"].cpu()
        print(yb_pred.shape)
        yb_pred = torch.argmax(yb_pred,axis = 1)     
        break
    
    print(yb_pred.shape)
    

criteron = nn.CrossEntropyLoss(reduction = 'sum')
opt = optim.Adam(model.parameters(), lr = 3e-4)

def loss_batch(loss_func, output, target, opt = None):
    loss = loss_func(output, target)
    
    if opt is not None:
        opt.zero_grad()
        loss.backward()
        opt.step()        
    return loss.item(), None

lr_scheduler = ReduceLROnPlateau(opt, mode = 'min', factor = 0.5, patience= 20, verbose = 1)

def get_lr(opt):
    for param_group in opt.param_groups:
        return param_group['lr']
    
current_lr = get_lr(opt)
print('current_lr = {}'.format(current_lr))


def loss_epoch(model, loss_func, dataset_dl, sanity_check = False, opt = None):
    running_loss = 0.0
    len_data = len(dataset_dl.dataset)
    
    for xb, yb in dataset_dl:
        xb = xb.to(device)
        yb = yb.to(device)
        
        # xb = torch.tensor(xbh, requires_grad=True)
                
        output = model(xb)
        
        loss_b, metric_b = loss_batch(loss_func, output, yb, opt)
        running_loss += loss_b
        if sanity_check is True:
            break
    loss = running_loss/float(len_data) 
    return loss, None

def train_val(model, params):
    num_epochs = params["num_epochs"]
    loss_func = params["loss_func"]
    opt = params["optimizer"]
    train_dl = params["train_dl"]
    val_dl = params["val_dl"]
    sanity_check = params["sanity_check"]
    lr_scheduler = params["lr_scheduler"]
    path2weights = params["path2weights"]
    
    loss_history = {"train": [],
                    "val": []}
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    
    for epoch in range(num_epochs):
        current_lr = get_lr(opt)
        print('Epoch {}/{}, current_lr = {}'.format(epoch, num_epochs - 1, current_lr))
        
        with torch.enable_grad():
            model.train()
            train_loss, _ = loss_epoch(model, loss_func, train_dl, sanity_check, opt)
        loss_history["train"].append(train_loss)
        model.eval()
        
        with torch.no_grad():
            val_loss, _ = loss_epoch(model, loss_func, val_dl, sanity_check, opt)
        loss_history["val"].append(val_loss)
        
        if val_loss < best_loss:
            best_loss = val_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            torch.save(model.state_dict(), path2weights)
            print("copied best model weights!!")
    
        lr_scheduler.step(val_loss)
        if current_lr != get_lr(opt):
            print("Loading best model weights!!")
            model.load_state_dict(best_model_wts)
            print("train Loss: %.6f" %(train_loss))
            print("val_loss: %.6f" %(val_loss))
            print("-"*20)
            
        model.load_state_dict(best_model_wts)
        return model, loss_history, metric_history
    

path2models = "./models/"
if not os.path.exists(path2models):
    os.mkdir(path2models)
    
param_train = {
    "num_epochs": 10,
    "loss_func": criteron,
    "optimizer": opt,
    "train_dl": train_dl,
    "val_dl": val_dl,
    "sanity_check": False,
    "lr_scheduler": lr_scheduler,
    "path2weights": path2models + "weights.pt"
model, loss_hist, _ = train_val(model, param_train)

The error message looks like:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

And here is the traceback:

File "<ipython-input-108-1ef24c0b1593>", line 10, in <module>
    model, loss_hist, _ = train_val(model, param_train)

  File "<ipython-input-106-53830bafab8b>", line 27, in train_val
    val_loss, _ = loss_epoch(model, loss_func, val_dl, sanity_check, opt)

  File "<ipython-input-104-5fc229145602>", line 13, in loss_epoch
    loss_b, metric_b = loss_batch(loss_func, output, yb, opt)

  File "<ipython-input-100-68322a002c04>", line 6, in loss_batch
    loss.backward()

  File "C:\Users\W540\anaconda3\lib\site-packages\torch\tensor.py", line 198, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)

  File "C:\Users\W540\anaconda3\lib\site-packages\torch\autograd\__init__.py", line 100, in backward
    allow_unreachable=True)  # allow_unreachable flag

I am not sure which variable I should set requires_grad = True on, or where I should enable grad...

Namwa

3 Answers


You can try this before loss.backward():

loss = Variable(loss, requires_grad = True)

Or, since Variable has been deprecated in PyTorch (it still exists but is no longer needed), you can do the same thing simply with the following code:

loss.requires_grad = True
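
A minimal sketch of where that assignment would sit inside a loss_batch helper like the one in the question (the requires_grad check is my addition, not part of the original answer):

def loss_batch(loss_func, output, target, opt=None):
    loss = loss_func(output, target)

    if opt is not None:
        # If the loss is detached from the graph (no grad_fn), re-enable
        # gradient tracking on it before calling backward, as suggested above.
        if not loss.requires_grad:
            loss.requires_grad = True
        opt.zero_grad()
        loss.backward()
        opt.step()
    return loss.item(), None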
parvaneh shayegh

I got this error from passing the input instead of the output to the loss function.

output = model(input)
loss = loss_fn(input, target)

The correct code is

loss = loss_fn(output, target)
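
A minimal, self-contained sketch of the difference, using a toy linear model and made-up shapes purely for illustration:

import torch
import torch.nn as nn

model = nn.Linear(4, 3)                # toy stand-in for the real network
loss_fn = nn.CrossEntropyLoss()

input = torch.randn(2, 4)              # plain data tensor, requires_grad=False
target = torch.tensor([0, 2])

output = model(input)                  # connected to the model's parameters

# loss_fn(input, target).backward()    # raises: element 0 of tensors does not require grad
loss = loss_fn(output, target)         # correct: the loss depends on trainable parameters
loss.backward()                        # gradients now flow into model.parameters()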
Tom Huntington

For me, calling .retain_grad() before .backward() solved the issue, as stated here.
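
A minimal sketch of that pattern, just to show where the .retain_grad() call goes relative to .backward() (the tensors here are made up for illustration):

import torch

x = torch.randn(3, requires_grad=True)
y = (x * 2).sum()      # non-leaf tensor: its grad is normally discarded
y.retain_grad()        # ask autograd to keep y's gradient after backward
y.backward()
print(y.grad)          # tensor(1.) -- dy/dy
print(x.grad)          # tensor([2., 2., 2.])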

OuttaSpaceTime