I have a large number of numpy files whose combined size exceeds my RAM. I created a DataLoader that reads the files using memmaps (the solution from Load multiple .npy files (size > 10GB) in pytorch).
Surprisingly, I still run into memory issues while building the list of memmaps, so I am looking for other solutions. Below is my code, thank you.
import glob
from bisect import bisect
from pathlib import Path

import numpy as np
import torch
from torch.utils.data import Dataset


class NpyMemmapDataset(Dataset):  # class name is a placeholder, not in my original snippet
    def __init__(self, root: str, dataset: str):
        # Collect the ground-truth .npy files and derive the matching Sentinel-2 paths
        self.list_y_numpy_dir = glob.glob(str(Path(root) / "Ground_truth" / dataset / f"Ground_truth_{dataset}_*.npy"))
        dic_path = {}
        dic_path["S2"] = [xx.replace("Ground_truth", "Sentinel-2") for xx in self.list_y_numpy_dir]

        # Open every file as a read-only memmap (the data itself is not loaded into RAM here)
        self.data_memmaps_S2 = [np.load(path, mmap_mode='r') for path in dic_path["S2"]]
        self.target_memmaps = [np.load(path, mmap_mode='r') for path in self.list_y_numpy_dir]

        # Cumulative start index of each file, so a global index can be mapped to (file, row)
        self.start_indices = [0] * len(self.list_y_numpy_dir)
        self.data_count = 0
        for index, memmap in enumerate(self.target_memmaps):
            self.start_indices[index] = self.data_count
            self.data_count += memmap.shape[0]

    def __getitem__(self, index):
        # Locate which memmap holds this global index, then the row inside that file
        memmap_index = bisect(self.start_indices, index) - 1
        index_in_memmap = index - self.start_indices[memmap_index]
        target = self.target_memmaps[memmap_index][index_in_memmap]

        dicts = {}
        dicts["S2"] = torch.as_tensor(np.array(self.data_memmaps_S2[memmap_index][index_in_memmap, :, :, :].astype('float32')))
        dicts["Target"] = torch.as_tensor(np.array(target[1]))
        return dicts

    def __len__(self):
        return self.data_count
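For completeness, here is a minimal sketch of how I consume the dataset; the class name above, the paths and the DataLoader settings below are placeholders, not my actual configuration:

from torch.utils.data import DataLoader

# Placeholder root/dataset arguments; batch_size and num_workers are arbitrary
train_set = NpyMemmapDataset(root="/path/to/data", dataset="train")
train_loader = DataLoader(train_set, batch_size=8, shuffle=True, num_workers=4)

for batch in train_loader:
    x = batch["S2"]       # float32 Sentinel-2 samples
    y = batch["Target"]   # corresponding ground-truth values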