
I am trying to implement the following algorithm, L2C (not the meta-L2C). In step 18, the gradient of the loss with respect to alpha should be computed, but when I try to access the `.grad` attribute of alpha, I get `None`, which means that no gradient was computed for alpha.
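
To make the symptom concrete, here is a stripped-down sketch (a toy linear model, not the actual CNN below): a parameter that never participates in the forward pass ends up with `.grad` equal to `None` after `backward()`.

import torch
import torch.nn as nn

# Toy illustration: 'unused' is registered as a parameter but never used
# in the forward computation, so backward() gives it no gradient.
lin = nn.Linear(4, 2)
unused = nn.Parameter(torch.randn(3, 3))

x = torch.randn(5, 4)
loss = lin(x).sum()
loss.backward()

print(lin.weight.grad is None)   # False - the weight is in the graph
print(unused.grad is None)       # True  - 'unused' never entered the graph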

My model is as follows:

import torch
import torch.nn as nn
import torch.nn.functional as F

class CNNCifar(nn.Module):
    def __init__(self):
        super(CNNCifar, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)
        self.alpha = nn.Parameter(torch.randn(100, 100), requires_grad=True)
        self.w = torch.randn((100, 100), requires_grad=True)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.log_softmax(x, dim=1)

with a training loop that goes like this:

    k = len(neighbour_sets)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = CNNCifar().to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.1)
    l2c_optimizer = optim.Adam([model.alpha], lr=beta, weight_decay=0.01)

    test_accuracies = [[] for _ in range(k)]

    theta = [model.state_dict().copy() for _ in range(k)]
    theta_half = [model.state_dict().copy() for _ in range(k)]

    # w = torch.randn(k, k, requires_grad=True)
    delta_theta = [model.state_dict().copy() for _ in range(k)]

    with tqdm_output(tqdm(range(T))) as trange:
        for t in trange:
            for i in range(k):
                # Local SGD step
                log.info(f'Started training a Local SGD at node {i + 1}')

                model.load_state_dict(theta[i])
                for m in range(S):
                    for _, data in enumerate(train_loaders[i]):
                        inputs, labels = data
                        inputs, labels = inputs.to(device), labels.to(device)
                        optimizer.zero_grad()
                        outputs = model(inputs)
                        loss = criterion(outputs, labels)
                        loss.backward()
                        optimizer.step()

                log.info(f'Finished training a Local SGD at node {i + 1}')



                # Change capturing
                log.info(f'Computing change capturing at node {i + 1}')
                for name, param in model.named_parameters():
                    delta_theta[i][name] = theta[i][name] - theta_half[i][name]

                log.info(f'Computing mixing weights at node {i + 1}')
                # Mixing weights calculation
                model.w = model.w.clone()
                model.w[i] = compute_mixing_weights(model.alpha[i], neighbour_sets[i])

                # Aggregation
                log.info(f'Aggregating at node {i + 1}')
                theta_next = {}
                for name, param in model.named_parameters():
                    theta_next[name] = theta[i][name].clone()

                for j in neighbour_sets[i]:
                    for name, param in model.named_parameters():
                        theta_next[name] -= model.w[i][j].item() * delta_theta[i][name][j].clone()


                # Update L2C
                log.info(f'Updating L2C at node {i + 1}')
                model.load_state_dict(theta_next)
                model.train()
                # a training loop to find alpha that minimizes the validation loss
                for _, data in enumerate(val_loaders[i]):
                    inputs, labels = data
                    inputs, labels = inputs.to(device), labels.to(device)
                    
                    l2c_optimizer.zero_grad()
                    model.alpha.requires_grad_(True)
                    
                    log.info(f'Forward pass check')
                    outputs = model(inputs)
                    loss = criterion(outputs, labels)
                    model.alpha.retain_grad()
                    loss.backward()
                    print(f'gradient of alpha is {model.alpha.grad}')
                    import pdb; pdb.set_trace()
                    l2c_optimizer.step()

                    # Update α[i]
                    # import pdb; pdb.set_trace()
                    # alpha_grad = model.alpha.grad  # Access the computed gradients
                    # model.alpha.data[i] -= beta * alpha_grad[i]
                

                # Remove edges for sparse topology
                if t == T_0:
                    for _ in range(K_0):
                        j = min(neighbour_sets[i], key=lambda x: model.w[i][x])
                        neighbour_sets[i].remove(j)

                theta[i] = model.state_dict().copy()
                theta_half[i] = model.state_dict().copy()

                # Compute test accuracy for each local model
                test_accuracies = compute_test_acc(model, test_loaders[i], device, test_accuracies, i)
            
        log.info(f'Test accuracies at Comm_round {t} = {sum(test_accuracies) / k}')
    
    return theta, test_accuracies

What is the problem with this implementation?

I tried calling `retain_grad()` at different places after the loss computation, and the gradient is still `None`.
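
For reference, a minimal sketch of why `retain_grad()` makes no difference here: it only affects non-leaf tensors, while alpha is already a leaf `nn.Parameter`, so its `.grad` would be filled in by `backward()` automatically if it appeared in the graph at all.

import torch

a = torch.randn(3, requires_grad=True)   # leaf tensor: .grad filled by default
b = a * 2                                # non-leaf intermediate result
b.retain_grad()                          # needed only to keep b.grad around
loss = b.sum()
loss.backward()

print(a.grad)   # populated without any retain_grad()
print(b.grad)   # populated only because of retain_grad()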

  • The problem here is that alpha is not directly used to compute the loss and so doesn't appear in the gradient graph. Try removing the `.clone()` calls on the weights, especially in the aggregation, maybe. Btw `model.alpha.retain_grad()` is useless here – Valentin Goldité Jul 26 '23 at 15:17
  • Yes, I know that alpha does not directly affect the loss. The hypothesis is something like this: **alpha -> softmax(alpha) -> theta (model parameters)**. So alpha affects the loss indirectly. Is there a way to do this in PyTorch, to the best of your knowledge? By the way, I removed `.clone()` as you suggested and nothing changed. – aks Jul 26 '23 at 17:50
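
A minimal sketch of what the comment is pointing at, not the full L2C update (it uses a stand-in linear model, toy per-neighbour parameter snapshots, and `torch.func.functional_call`, none of which are in the original code): for alpha to receive a gradient, the aggregated parameters have to be built as differentiable functions of alpha and then used functionally in the forward pass. `load_state_dict()`, `.item()` and `.data` all cut the graph, which is why `alpha.grad` stays `None` in the loop above.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.func import functional_call   # requires PyTorch >= 2.0

model = nn.Linear(4, 2)               # stand-in for CNNCifar
alpha = nn.Parameter(torch.randn(3))  # one row of alpha (3 neighbours), toy size

# toy per-neighbour parameter snapshots (delta_theta would play this role)
neighbour_params = [
    {name: torch.randn_like(p) for name, p in model.named_parameters()}
    for _ in range(3)
]

w = F.softmax(alpha, dim=0)           # mixing weights, differentiable in alpha

# aggregated parameters built as tensors that still depend on alpha
mixed = {
    name: sum(w[j] * neighbour_params[j][name] for j in range(3))
    for name, _ in model.named_parameters()
}

x = torch.randn(8, 4)
y = torch.randint(0, 2, (8,))

out = functional_call(model, mixed, (x,))   # forward pass with the mixed params
loss = F.cross_entropy(out, y)
loss.backward()

print(alpha.grad)                     # now populated

In the posted loop, the same idea would mean building theta_next as tensors that depend on model.alpha[i] (no `.item()`, no `load_state_dict` before the validation pass) and evaluating the validation loss through functional_call.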

0 Answers