
On every iteration of a loop I select a new set of indices and use them to take a Subset of a PyTorch dataset, then build a DataLoader from that subset for model training. This keeps raising an IndexError; the full output and stack trace are at the end of the post. Here is the outer loop:

import random
import time

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

def final_loop(model, dataset, val_data, test_data, budget, gamma=0.5, rounds=45, model_name="engrad.pt", init_samples=1000, keep_old=True):
    device = torch.device("cuda")
    # Seed the training pool with a random sample of indices into `dataset`
    init_idx = random.sample(range(len(dataset)), init_samples)
    train_idx = init_idx
    train_step_data = Subset(dataset, train_idx)
    train_loader = DataLoader(train_step_data, batch_size=20, shuffle=True)
    remaining_idx = list(set(range(len(dataset))) - set(train_idx))
    valid_loader = DataLoader(val_data, batch_size=20)
    test_loader = DataLoader(test_data, batch_size=20)
    test_acc = 0
    test_acc_list = [test_cifar(model, test_loader, device="cuda")]
    validation = []
    samples = [len(train_idx)]
    print("test acc", test_acc_list[0])
    for i in range(rounds):
        print("rounds = ",i+1,"------", "Datapoints = ", len(train_step_data))
        train_loss, val_loss = train_cifar(train_loader, valid_loader, model, epochs=1, criterion=criterion, device="cuda", model_name=model_name, save=True)  # `criterion` is a notebook-level global
        validation.extend(val_loss)
        test_acc = test_cifar(model, test_loader, device="cuda")
        test_acc_list.append(test_acc)

        # Sampling method 1 (returns a list of indices)
        new_idx1 = ALGO1(inputs)
        remaining_idx = list(set(remaining_idx) - set(new_idx1))

        # Sampling method 2 (returns a list of indices)
        print("running loss_dep")
        new_idx2 = ALGO2(inputs)
        remaining_idx = list(set(remaining_idx) - set(new_idx2))

        # Merge the old pool with the newly selected indices and rebuild the Subset
        train_idx = list(set(list(train_idx) + list(new_idx1) + list(new_idx2)))
        train_step_data = Subset(dataset, train_idx)

        print("New data points selected")
        samples.append(len(train_idx))
        train_loader = DataLoader(train_step_data, batch_size=20, shuffle=True)
        model.load_state_dict(torch.load("/content/" + model_name))
        print("Best model loaded")
        print("test acc",test_acc)
    return test_acc_list, validation, samples
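
To help debug this, I am planning to add a range check each round, right after the two sampling calls and before rebuilding the Subset. A minimal sketch (it assumes ALGO1/ALGO2 are supposed to return indices into `dataset`, which is what I intend):

# Debug sketch: every index handed to Subset(dataset, train_idx) must be valid for `dataset`
bad = [j for j in train_idx if not (0 <= j < len(dataset))]
assert not bad, f"{len(bad)} out-of-range indices, e.g. {bad[:5]}; len(dataset)={len(dataset)}"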

Below is the training loop; according to the stack trace, this is where the error is raised while loading data:

def train_cifar(trainloader, valloader, model, epochs, criterion, device=None, model_name="model_name.pt", save=True):
    # Resolve the device argument ("cuda" / "mps" / None) to a torch.device
    if device is None:
        print("Using CPU")
        device = torch.device("cpu")
    elif device == "cuda":
        if torch.cuda.is_available():
            print("Using CUDA")
            device = torch.device("cuda")
        else:
            print("Cuda not found. Using CPU.")
            device = torch.device("cpu")
    elif device == "mps":
        if torch.has_mps:
            print("Using MPS")
            device = torch.device("mps")
        else:
            print("MPS not found. Using CPU")
            device = torch.device("cpu")
    model.to(device)
    train_loss_list = []
    val_loss_list = []
    learning_rate = 0.001
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.01, momentum=0.75)
    
    for epoch in range(epochs):
        t1 = time.time()
        print("Epoch: ", epoch+1)
        model.train()
        train_loss = 0
        for inputs, labels in trainloader:
            # inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss_list.append(train_loss/len(trainloader))
        print("Train loss: ",train_loss/len(trainloader))
        model.eval()
        loss_val = 0
        with torch.no_grad():  # no gradients needed during validation
            for ip, lbl in valloader:
                ip, lbl = ip.to(device), lbl.to(device)
                op = model(ip)
                val_loss = criterion(op, lbl)
                loss_val += val_loss.item()
        val_loss_list.append(loss_val / len(valloader))
        print("Validation loss: ", val_loss_list[-1])
        print("Epoch time ----- ",time.time() - t1, " sec")
        if save:
            # True exactly when the newest loss is the minimum so far
            # (the list already contains it, so <= catches new minima and ties)
            if val_loss_list[-1] <= min(val_loss_list):
                print("validation loss minimum, saving model")
                torch.save(model.state_dict(), "/content/" + model_name)
    return train_loss_list, val_loss_list

Here is the full output, including the error:

Using CUDA
test acc 10.23
rounds =  1 ------ Datapoints =  1000
Using CUDA
Epoch:  1
Train loss:  2.2999311542510985
Validation loss:  2.2954694013034596
Epoch time -----  4.80617094039917  sec
validation loss minimum, saving model
Using CUDA
running k-means on cuda..
[running kmeans]: 1it [00:02,  2.35s/it, center_shift=0.000008, iteration=1, tol=0.000100]
running loss_dep
New data points selected
Best model loaded
test acc 13.74
rounds =  2 ------ Datapoints =  1176
Using CUDA
Epoch:  1
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-33-fba5782ce9c0> in <module>
----> 1 abc1, val1, samp1 = final_loop(net2, cifar_train_victim_train, cifar_train_victim_valid, cifar10_test, budget = 300, gamma = 0.6, rounds = 20)

7 frames
<ipython-input-31-6693767cc031> in final_loop(model, dataset, val_data, test_data, budget, gamma, rounds, model_name, init_samples, keep_old)
     16     for i in range(rounds):
     17         print("rounds = ",i+1,"------", "Datapoints = ", len(train_step_data))
---> 18         train_loss, val_loss = train_cifar(train_loader, valid_loader, model, epochs = 1, criterion= criterion, device = "cuda", model_name = model_name, save = True)
     19         validation.extend(val_loss)
     20         test_acc = test_cifar(model, test_loader, device = "cuda")

<ipython-input-18-92479e16fe74> in train_cifar(trainloader, valloader, model, epochs, criterion, device, model_name, save)
     28         model.train()
     29         train_loss = 0
---> 30         for inputs, labels in trainloader:
     31             # inputs, labels = data
     32             inputs, labels = inputs.to(device), labels.to(device)

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    679                 # TODO(https://github.com/pytorch/pytorch/issues/76750)
    680                 self._reset()  # type: ignore[call-arg]
--> 681             data = self._next_data()
    682             self._num_yielded += 1
    683             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    719     def _next_data(self):
    720         index = self._next_index()  # may raise StopIteration
--> 721         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    722         if self._pin_memory:
    723             data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     47     def fetch(self, possibly_batched_index):
     48         if self.auto_collation:
---> 49             data = [self.dataset[idx] for idx in possibly_batched_index]
     50         else:
     51             data = self.dataset[possibly_batched_index]

/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
     47     def fetch(self, possibly_batched_index):
     48         if self.auto_collation:
---> 49             data = [self.dataset[idx] for idx in possibly_batched_index]
     50         else:
     51             data = self.dataset[possibly_batched_index]

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataset.py in __getitem__(self, idx)
    288         if isinstance(idx, list):
    289             return self.dataset[[self.indices[i] for i in idx]]
--> 290         return self.dataset[self.indices[idx]]
    291 
    292     def __len__(self):

/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataset.py in __getitem__(self, idx)
    288         if isinstance(idx, list):
    289             return self.dataset[[self.indices[i] for i in idx]]
--> 290         return self.dataset[self.indices[idx]]
    291 
    292     def __len__(self):

IndexError: list index out of range

What I don't understand is whether using torch.utils.data.Subset is itself the main issue. If not, how can I resolve this?
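
For reference, the stack trace shows two stacked Subset.__getitem__ frames, which I believe means the dataset I pass into final_loop is itself a Subset, so each index is resolved through two layers of index lists. Below is a minimal sketch (toy tensors, not my real data) that reproduces the same IndexError whenever a selected index is valid for the full dataset but too large for the inner Subset:

import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

full = TensorDataset(torch.randn(100, 3), torch.randint(0, 10, (100,)))
inner = Subset(full, list(range(50)))  # len(inner) == 50
outer = Subset(inner, [0, 1, 75])      # 75 is valid for `full` but not for `inner`

loader = DataLoader(outer, batch_size=3)
for x, y in loader:  # raises IndexError: list index out of range
    pass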
