
I ran into this error when I tried to remove some filters from the network and then retrain it, even though I had moved both the model and the data to CUDA. The error is raised at the backward stage, when I call loss.backward(). Here is my code:

import torch
import torch.nn as nn
import torch.optim as optim

criterion = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9,
                      weight_decay=0.0005)

def train(model, train_loader, criterion, optimizer, device, epoch):
    model.train()
    for idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = criterion(output,target)
        loss.backward() #The problem is here
        optimizer.step()
        if idx % 5 == 0:
            print("Epoch {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}"
                  .format(epoch,idx*len(data),len(train_loader.dataset),
                   100.0*idx/len(train_loader),loss.item()))

def test(model,test_loader,criterion,device):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data,target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = criterion(output,target)
            test_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
        test_loss /= len(test_loader.dataset)
        print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'
              .format(test_loss, correct, len(test_loader.dataset),
                      100. * correct / len(test_loader.dataset)))

#This function returns the index of the filter to be pruned
def index_pruned(model, layer):
    l = []
    for i in range(model.features[layer].weight.size(0)):
        l.append(abs(model.features[layer].weight[i,:,:,:]).sum())
    return l.index(min(l))
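As an aside, the same selection can be written without the Python loop; a minimal sketch using tensor reductions, which should behave identically for a 4-D conv weight:

def index_pruned(model, layer):
    #L1-norm of each filter, reduced over (in_channels, kH, kW)
    scores = model.features[layer].weight.abs().sum(dim=(1, 2, 3))
    return scores.argmin().item()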

def prune_filter(model, layer, next_layer, indexes):
    #Making a new layer, with one less filter
    new_conv = torch.nn.Conv2d(
        in_channels=model.features[layer].in_channels,
        out_channels=model.features[layer].out_channels - 1,
        kernel_size=model.features[layer].kernel_size,
        stride=model.features[layer].stride,
        padding=model.features[layer].padding)

    #Pass the old weights to the new layer, except the pruned filter
    new_conv.weight[0:indexes,:,:,:] = model.features[layer].weight[0:indexes,:,:,:]
    new_conv.weight[indexes:,:,:,:] = model.features[layer].weight[indexes+1:,:,:,:]
    new_conv.bias[0:indexes] = model.features[layer].bias[0:indexes]
    new_conv.bias[indexes:] = model.features[layer].bias[indexes+1:]
    #Replace the layer
    model.features[layer] = new_conv
    
    #When not pruning the last conv layer
    if layer != 10:
        #Reduce the 'thickness' of the next layer,
        #as the previous one was pruned
        next_new_conv = torch.nn.Conv2d(
            in_channels=model.features[next_layer].in_channels - 1,
            out_channels=model.features[next_layer].out_channels,
            kernel_size=model.features[next_layer].kernel_size,
            stride=model.features[next_layer].stride,
            padding=model.features[next_layer].padding)
        next_new_conv.weight[:,0:indexes,:,:] = model.features[next_layer].weight[:,0:indexes,:,:]
        next_new_conv.weight[:,indexes:,:,:] = model.features[next_layer].weight[:,indexes+1:,:,:]
        model.features[next_layer] = next_new_conv

    #Pruning the last conv layer affects the first linear layer
    elif layer == 10:
        #features[10] was already replaced above, so out_channels+1
        #is the original filter count
        params = int(model.classifier[0].in_features /
                     (model.features[10].out_channels + 1))
        new_fc1 = torch.nn.Linear(
            in_features=int(model.classifier[0].in_features - params),
            out_features=int(model.classifier[0].out_features))
        new_fc1.weight[:,0:indexes*params] = model.classifier[0].weight[:,0:indexes*params]
        new_fc1.weight[:,indexes*params:] = model.classifier[0].weight[:,(indexes+1)*params:]
        new_fc1.bias = model.classifier[0].bias
        model.classifier[0] = new_fc1
    return model
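Side note: the slice assignments above are recorded by autograd and run on whatever device new_conv was created on (CPU by default), which is a plausible way for CPU and CUDA tensors to end up mixed in the backward graph. Below is a minimal sketch of a device-safe version of the same copy, assuming layer indexes a Conv2d in model.features; prune_filter_safe is a hypothetical name, not part of the original code.

def prune_filter_safe(model, layer, index):
    old = model.features[layer]
    new_conv = torch.nn.Conv2d(in_channels=old.in_channels,
                               out_channels=old.out_channels - 1,
                               kernel_size=old.kernel_size,
                               stride=old.stride,
                               padding=old.padding).to(old.weight.device)
    with torch.no_grad():  #keep the weight copies out of the autograd graph
        new_conv.weight[:index] = old.weight[:index]
        new_conv.weight[index:] = old.weight[index + 1:]
        new_conv.bias[:index] = old.bias[:index]
        new_conv.bias[index:] = old.bias[index + 1:]
    model.features[layer] = new_conv
    return model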

def main(model, train_loader, test_loader, criterion, optimizer,
         pretrained=False, prune=False, save=False, pruneFilter=False):
    device = 'cuda'
    if pretrained:
        model.load_state_dict(torch.load('AlexNet_pruned.pt'))
        for params in model.parameters():
            params.requires_grad = True
    model.to(device)
    if pruneFilter:
        #conv0:
        for num_filters_pruned in range(16):
            model = prune_filter(model=model, layer=0, next_layer=3,
                                 indexes=index_pruned(model, 0))
            if num_filters_pruned % 4 == 0:
                model = model.cuda()
                train(model, train_loader, criterion, optimizer, device, 1)
                test(model, test_loader, criterion, device)
        torch.save(model.state_dict(), 'AlexNet_filters_pruned.pt')
    if save:
        torch.save(model.state_dict(), 'AlexNet.pt')
    if prune:
        torch.save(model.state_dict(), 'AlexNet_pruned.pt')
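Note: optimizer was constructed once, over the original model.parameters(), so after prune_filter swaps a layer out it still references the old tensors and optimizer.step() will not touch the new ones. A minimal sketch of rebuilding it inside the pruning loop (same hyperparameters as above):

for num_filters_pruned in range(16):
    model = prune_filter(model=model, layer=0, next_layer=3,
                         indexes=index_pruned(model, 0))
    model = model.cuda()
    #Recreate the optimizer so it tracks the freshly created parameters
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9,
                          weight_decay=0.0005)
    if num_filters_pruned % 4 == 0:
        train(model, train_loader, criterion, optimizer, device, 1)
        test(model, test_loader, criterion, device)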

The error occurs at loss.backward(). To check that every parameter is on CUDA, I printed the device of each parameter right before training:

#conv0:
for num_filters_pruned in range(16):
    model = prune_filter(model=model, layer=0, next_layer=3,
                         indexes=index_pruned(model, 0))
    if num_filters_pruned % 4 == 0:
        model = model.cuda()
        for name, param in model.named_parameters():
            print(name, ':', param.device)
        train(model, train_loader, criterion, optimizer, device, 1)
        test(model, test_loader, criterion, device)

And this is the result:

features.0.weight : cuda:0 
features.0.bias : cuda:0 
features.3.weight : cuda:0 
features.3.bias : cuda:0 
features.6.weight : cuda:0 
features.6.bias : cuda:0 
features.8.weight : cuda:0 
features.8.bias : cuda:0 
features.10.weight : cuda:0 
features.10.bias : cuda:0 
classifier.fc1.weight : cuda:0 
classifier.fc1.bias : cuda:0 
classifier.fc2.weight : cuda:0 
classifier.fc2.bias : cuda:0
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-12-46a65917686e> in <module>
----> 1 main(model,train_loader,test_loader,criterion,optimizer,pretrained=True,prune=False,save=False,pruneFilter=True)

<ipython-input-11-8c11e06a650b> in main(model, train_loader, test_loader, criterion, optimizer, pretrained, prune, save, pruneFilter)
     33                     print(name,':',param.device)
     34 
---> 35                 train(model, train_loader, criterion, optimizer, device, 1)
     36                 test(model,test_loader,criterion,device)
     37         #conv2:

<ipython-input-8-f4fd4c83eff2> in train(model, train_loader, criterion, optimizer, device, epoch)
      9         loss = criterion(output,target)
     10         loss = loss.cuda()
---> 11         loss.backward()
     12         optimizer.step()
     13         if idx % 5 == 0:

/usr/local/lib/python3.5/dist-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    100                 products. Defaults to ``False``.
    101         """
--> 102         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    103 
    104     def register_hook(self, hook):

/usr/local/lib/python3.5/dist-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     88     Variable._execution_engine.run_backward(
     89         tensors, grad_tensors, retain_graph, create_graph,
---> 90         allow_unreachable=True)  # allow_unreachable flag
     91 
     92 

RuntimeError: Function CudnnConvolutionBackward returned an invalid gradient at index 1 - expected type torch.FloatTensor but got torch.cuda.FloatTensor
    Can you simplify your code (see [this](https://stackoverflow.com/help/minimal-reproducible-example)) so that we can better pinpoint the problem? BTW it is likely that you have missed a .to(device) somewhere. – hkchengrex Jul 28 '19 at 17:00
  • I fully agree with @hkchengrex please reduce the amount of code as much as possible (but still enough to reproduce the problem). Your error message indicates that you are mixing up cpu and gpu tensors. Try changing `model.to(device)` to `model = model.to(device)`. The `to` function doesn't work in-place as far as I know. But as you use a lot of `model.cuda()` which should work as well, your problem might be somewhere else. – MBT Jul 28 '19 at 19:48
  • I added a few comments and reduced the code a bit; can you guys have a look at it? Thank you! – Hoang Minh Q Jul 29 '19 at 05:17
  • Did you try replacing `model.to(device)` by `model = model.to(device)`? – MBT Jul 31 '19 at 12:55
  • You can clearly see that I have model.cuda() before training, and I also printed out the device of each and every layer; all of them are "cuda". Actually, I fixed this error by creating a new variable that copies all the features of the filter being pruned, converting all the weights to numpy arrays, passing them to the new layer, and after that moving the parameters of the new layer to cuda(). – Hoang Minh Q Aug 01 '19 at 15:40
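For anyone who hits the same error: the fix described in the last comment amounts to detaching the old weights before copying them into the new layer. A rough sketch of that numpy route, with old, new_conv, and index playing the same roles as in prune_filter:

import numpy as np

#Detach before .numpy(), then drop the pruned filter along dim 0
kept = np.delete(old.weight.detach().cpu().numpy(), index, axis=0)
with torch.no_grad():
    new_conv.weight.copy_(torch.from_numpy(kept))
new_conv = new_conv.cuda()  #move the fresh parameters to the GPU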

0 Answers