In each round of a loop I sample new indices, use them to build a Subset of a PyTorch dataset, and wrap that subset in a DataLoader for model training. From the second round onward the loader raises an IndexError. The relevant code, followed by the output and full stack trace, is shown below:
import random
import time

import torch
import torch.optim as optim
from torch.utils.data import DataLoader, Subset

# criterion, test_cifar, ALGO1, ALGO2, and inputs are defined elsewhere in my notebook.

def final_loop(model, dataset, val_data, test_data, budget, gamma=0.5, rounds=45,
               model_name="engrad.pt", init_samples=1000, keep_old=True):
    device = torch.device("cuda")
    init_idx = random.sample(range(len(dataset)), init_samples)
    train_idx = init_idx
    train_step_data = Subset(dataset, train_idx)
    train_loader = DataLoader(train_step_data, batch_size=20, shuffle=True)
    remaining_idx = list(set(range(len(dataset))) - set(train_idx))
    valid_loader = DataLoader(val_data, batch_size=20)
    test_loader = DataLoader(test_data, batch_size=20)
    test_acc = 0
    test_acc_list = [test_cifar(model, test_loader, device="cuda")]
    validation = []
    samples = [len(train_idx)]
    print("test acc", test_acc_list[0])
    for i in range(rounds):
        print("rounds =", i + 1, "------", "Datapoints =", len(train_step_data))
        train_loss, val_loss = train_cifar(train_loader, valid_loader, model, epochs=1,
                                           criterion=criterion, device="cuda",
                                           model_name=model_name, save=True)
        validation.extend(val_loss)
        test_acc = test_cifar(model, test_loader, device="cuda")
        test_acc_list.append(test_acc)
        # Sampling method 1
        new_idx1 = ALGO1(inputs)  # gives a list of indices
        remaining_idx = list(set(remaining_idx) - set(new_idx1))
        # Sampling method 2
        print("running loss_dep")
        new_idx2 = ALGO2(inputs)  # gives a list of indices
        remaining_idx = list(set(remaining_idx) - set(new_idx2))
        train_idx = list(set(list(train_idx) + list(new_idx1) + list(new_idx2)))
        train_step_data = Subset(dataset, train_idx)
        print("New data points selected")
        samples.append(len(train_idx))
        train_loader = DataLoader(train_step_data, batch_size=20, shuffle=True)
        model.load_state_dict(torch.load("/content/" + model_name))
        print("Best model loaded")
        print("test acc", test_acc)
    return test_acc_list, validation, samples
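For debugging, I'm planning to add a range check right after train_idx is rebuilt inside the loop. This is just a sketch; it assumes ALGO1 and ALGO2 are supposed to return absolute indices into dataset:

# Sanity check to insert after "train_idx = list(set(...))" in final_loop.
# Assumption: every sampled index must satisfy 0 <= idx < len(dataset).
bad = [idx for idx in train_idx if not (0 <= idx < len(dataset))]
assert not bad, f"{len(bad)} out-of-range indices, e.g. {bad[:5]}; len(dataset)={len(dataset)}"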
Below is the training loop; according to the stack trace, the failure happens while iterating over the train loader here:
def train_cifar(trainloader, valloader, model, epochs, criterion, device=None,
                model_name="model_name.pt", save=True):
    if device is None:
        print("Using CPU")
        device = torch.device("cpu")
    elif device == "cuda":
        if torch.cuda.is_available():
            print("Using CUDA")
            device = torch.device("cuda")
        else:
            print("Cuda not found. Using CPU.")
            device = torch.device("cpu")
    elif device == "mps":
        if torch.has_mps:
            print("Using MPS")
            device = torch.device("mps")
        else:
            print("MPS not found. Using CPU")
            device = torch.device("cpu")
    model.to(device)
    train_loss_list = []
    val_loss_list = []
    learning_rate = 0.001
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=0.01, momentum=0.75)
    for epoch in range(epochs):
        t1 = time.time()
        print("Epoch:", epoch + 1)
        model.train()
        train_loss = 0
        for inputs, labels in trainloader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss_list.append(train_loss / len(trainloader))
        print("Train loss:", train_loss / len(trainloader))
        model.eval()
        loss_val = 0
        with torch.no_grad():  # no gradients needed during validation
            for ip, lbl in valloader:
                ip, lbl = ip.to(device), lbl.to(device)
                op = model(ip)
                val_loss = criterion(op, lbl)
                loss_val += val_loss.item()
        val_loss_list.append(loss_val / len(valloader))
        print("Validation loss:", val_loss_list[-1])
        print("Epoch time -----", time.time() - t1, "sec")
        if save:
            if val_loss_list[-1] <= min(val_loss_list):
                print("validation loss minimum, saving model")
                torch.save(model.state_dict(), "/content/" + model_name)
    return train_loss_list, val_loss_list
The console output and the error are shown below:
Using CUDA
test acc 10.23
rounds = 1 ------ Datapoints = 1000
Using CUDA
Epoch: 1
Train loss: 2.2999311542510985
Validation loss: 2.2954694013034596
Epoch time ----- 4.80617094039917 sec
validation loss minimum, saving model
Using CUDA
running k-means on cuda..
[running kmeans]: 1it [00:02, 2.35s/it, center_shift=0.000008, iteration=1, tol=0.000100]
running loss_dep
New data points selected
Best model loaded
test acc 13.74
rounds = 2 ------ Datapoints = 1176
Using CUDA
Epoch: 1
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
<ipython-input-33-fba5782ce9c0> in <module>
----> 1 abc1, val1, samp1 = final_loop(net2, cifar_train_victim_train, cifar_train_victim_valid, cifar10_test, budget = 300, gamma = 0.6, rounds = 20)
<ipython-input-31-6693767cc031> in final_loop(model, dataset, val_data, test_data, budget, gamma, rounds, model_name, init_samples, keep_old)
16 for i in range(rounds):
17 print("rounds = ",i+1,"------", "Datapoints = ", len(train_step_data))
---> 18 train_loss, val_loss = train_cifar(train_loader, valid_loader, model, epochs = 1, criterion= criterion, device = "cuda", model_name = model_name, save = True)
19 validation.extend(val_loss)
20 test_acc = test_cifar(model, test_loader, device = "cuda")
<ipython-input-18-92479e16fe74> in train_cifar(trainloader, valloader, model, epochs, criterion, device, model_name, save)
28 model.train()
29 train_loss = 0
---> 30 for inputs, labels in trainloader:
31 # inputs, labels = data
32 inputs, labels = inputs.to(device), labels.to(device)
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in __next__(self)
679 # TODO(https://github.com/pytorch/pytorch/issues/76750)
680 self._reset() # type: ignore[call-arg]
--> 681 data = self._next_data()
682 self._num_yielded += 1
683 if self._dataset_kind == _DatasetKind.Iterable and \
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
719 def _next_data(self):
720 index = self._next_index() # may raise StopIteration
--> 721 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
722 if self._pin_memory:
723 data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)
/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.7/dist-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
47 def fetch(self, possibly_batched_index):
48 if self.auto_collation:
---> 49 data = [self.dataset[idx] for idx in possibly_batched_index]
50 else:
51 data = self.dataset[possibly_batched_index]
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataset.py in __getitem__(self, idx)
288 if isinstance(idx, list):
289 return self.dataset[[self.indices[i] for i in idx]]
--> 290 return self.dataset[self.indices[idx]]
291
292 def __len__(self):
/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataset.py in __getitem__(self, idx)
288 if isinstance(idx, list):
289 return self.dataset[[self.indices[i] for i in idx]]
--> 290 return self.dataset[self.indices[idx]]
291
292 def __len__(self):
IndexError: list index out of range
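As far as I can tell, the traceback shows two nested Subset frames, so the dataset I pass in is itself a Subset of a larger dataset. The same IndexError seems reproducible with a toy nested Subset whose outer index list holds a value >= len(inner subset). A minimal sketch, not my actual data:

import torch
from torch.utils.data import DataLoader, Subset, TensorDataset

base = TensorDataset(torch.arange(100.0), torch.zeros(100, dtype=torch.long))
inner = Subset(base, list(range(50)))   # plays the role of my dataset; len(inner) == 50
outer = Subset(inner, [0, 10, 75])      # 75 is a valid index for base, but not for inner

loader = DataLoader(outer, batch_size=3)
for x, y in loader:                     # raises IndexError: list index out of range
    pass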
What I don't understand is whether using torch.utils.data.Subset is itself the problem. If not, how can I resolve this?
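My current guess (unconfirmed) is that ALGO1 and ALGO2 return positions into remaining_idx rather than absolute dataset indices, which would only surface once a sampled position exceeds the dataset's length. If that's the cause, would mapping the positions back like this be the right fix?

# Hypothetical fix, assuming ALGO1 returns *positions* into the remaining
# pool rather than absolute dataset indices (and likewise for ALGO2):
pos1 = ALGO1(inputs)                          # positions, per the assumption above
new_idx1 = [remaining_idx[p] for p in pos1]   # map back to dataset indices
remaining_idx = list(set(remaining_idx) - set(new_idx1))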