I'm trying to train a model for sign-language detection, and I'm using MediaPipe to detect and crop the hands. When I train the model, memory usage keeps growing until it overflows; the problem goes away when I remove the MediaPipe cropping step.
Here is my cropping module:
import math

import cv2
import mediapipe as mp

def hand_crop(img, zoomout_ratio=2e-2):
    # a new Hands detector is instantiated on every call
    mpHands = mp.solutions.hands
    hands = mpHands.Hands(max_num_hands=10)
    x = img.shape[1]
    y = img.shape[0]
    zoomout_x = x * zoomout_ratio
    zoomout_y = y * zoomout_ratio
    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    img_padding = cv2.copyMakeBorder(
        imgRGB,
        math.ceil(zoomout_y), math.ceil(zoomout_y),
        math.ceil(zoomout_x), math.ceil(zoomout_x),
        cv2.BORDER_CONSTANT, value=(0, 0, 0))
    results = hands.process(img_padding)
    crop_result = []
    if results.multi_hand_landmarks:
        for hlm in results.multi_hand_landmarks:
            # landmark coordinates are normalized to [0, 1]; scale to the padded image size
            x_max = int(max([i.x for i in hlm.landmark]) * (x + 2 * zoomout_x))
            y_max = int(max([i.y for i in hlm.landmark]) * (y + 2 * zoomout_y))
            x_min = int(min([i.x for i in hlm.landmark]) * (x + 2 * zoomout_x))
            y_min = int(min([i.y for i in hlm.landmark]) * (y + 2 * zoomout_y))
            width, height = x_max - x_min, y_max - y_min
            # expand the shorter side so the crop is square, plus a small margin
            if width > height:
                y_min -= (((width - height) / 2) + zoomout_y)
                y_min = math.floor(y_min)
                y_max += (((width - height) / 2) + zoomout_y)
                y_max = math.floor(y_max)
                x_min -= zoomout_x
                x_min = math.floor(x_min)
                x_max += zoomout_x
                x_max = math.floor(x_max)
            else:
                x_min -= (((height - width) / 2) + zoomout_x)
                x_min = math.floor(x_min)
                x_max += (((height - width) / 2) + zoomout_x)
                x_max = math.floor(x_max)
                y_min -= zoomout_y
                y_min = math.floor(y_min)
                y_max += zoomout_y
                y_max = math.floor(y_max)
            crop = img_padding[y_min:y_max, x_min:x_max, :]
            cropBGR = cv2.cvtColor(crop, cv2.COLOR_RGB2BGR)
            crop_result.append(cropBGR)
    return crop_result
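For reference, my understanding of the MediaPipe API is that mp.solutions.hands.Hands holds native resources and supports close() as well as use as a context manager, so the detector does not have to be re-created inside every call. Below is a minimal sketch of what I mean by creating it once at module level; it is not my current code, and HANDS / hand_crop_once are names I made up for illustration:

import mediapipe as mp

# Sketch only: create the detector a single time, at import, and reuse it.
# "HANDS" and "hand_crop_once" are hypothetical names for illustration.
HANDS = mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=10)

def hand_crop_once(img_rgb_padded):
    # reuse the same detector for every image
    return HANDS.process(img_rgb_padded)

# or, with deterministic cleanup via the context-manager protocol:
# with mp.solutions.hands.Hands(static_image_mode=True, max_num_hands=10) as hands:
#     results = hands.process(img_rgb_padded)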
Here is my dataset and model:
import os

import cv2
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset

class DatasetPlus(Dataset):
    def __init__(self, root_img, root_data, width, height, transform=None):
        self.root_img = root_img
        self.root_data = root_data
        self.width = width
        self.height = height
        self.transform = transform
        # labels are stored in a csv file
        self.labels = pd.read_csv(self.root_data)
        self.imgs = [image for image in sorted(
            os.listdir(self.root_img)) if image[-4:] == '.jpg']
        self.len = len(self.imgs)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        img_name = self.imgs[idx]
        img_path = os.path.join(self.root_img, img_name)
        img = cv2.imread(img_path)
        print(img_name)  # debug: log which image is being loaded
        # keep only the first detected hand (assumes at least one hand is found)
        img = hand_crop(img)[0]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
        img = cv2.resize(img, (self.width, self.height), interpolation=cv2.INTER_AREA)
        img = np.array(img) / 255.0
        if self.transform is not None:
            img = self.transform(img)
        # the image id is encoded in the filename after a 6-character prefix
        img_id = int(img_name[6:-4])
        label = self.labels.where(self.labels['ID'] == img_id)['Label'].dropna().to_numpy()[0]
        if label == 3:
            label = 0
        label = torch.tensor(label, dtype=torch.long)
        return img, label
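For completeness, a single sample can be sanity-checked like this (illustrative only; the paths mirror the training code below):

from torchvision import transforms

# Illustrative check on one sample, using the same paths as the training code
ds_check = DatasetPlus('data/', 'data.csv', 224, 224,
                       transform=transforms.Compose([transforms.ToTensor()]))
sample_img, sample_label = ds_check[0]
print(sample_img.shape, sample_img.dtype, sample_label)
# expected: torch.Size([3, 224, 224]) torch.float32 tensor(<label>)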
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self, h, w):
        super().__init__()
        # spatial size after two (conv 5x5 -> 2x2 max-pool) stages
        nw = (((w - 4) // 2) - 4) // 2
        nh = (((h - 4) // 2) - 4) // 2
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * nh * nw, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
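With 224x224 inputs, each conv(5x5) + 2x2 max-pool stage maps 224 -> (224 - 4) // 2 = 110 and then 110 -> (110 - 4) // 2 = 53, so fc1 sees 16 * 53 * 53 features. A quick dummy forward pass confirms the shapes (a sanity-check sketch, not part of the training script):

import torch

# shape sanity check with a random batch of 2 images
net_check = Net(224, 224)
dummy = torch.randn(2, 3, 224, 224)
print(net_check(dummy).shape)  # torch.Size([2, 3])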
Here is my training code:
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import transforms

transform = transforms.Compose([transforms.ToTensor()])

root_img = 'data/'
root_label = 'data.csv'
ds = DatasetPlus(root_img, root_label, 224, 224, transform=transform)

model = Net(224, 224)
trainloader = DataLoader(ds, batch_size=4, shuffle=True)

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=1e-3)

def train_model(epochs):
    for epoch in range(epochs):
        losses = 0.0
        for i, data in enumerate(trainloader, 0):
            optimizer.zero_grad()
            img, label = data
            yhat = model(img)
            loss = criterion(yhat, label)
            loss.backward()
            optimizer.step()
            losses += loss.item()
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {losses:.3f}')
            losses = 0.0

train_model(10)
Why isn't the unused data created inside the hand-crop module being garbage-collected automatically after each call?