1

I'm trying to train a model for sign language detection and I'm using mediapipe for hand detection and cropping hands.

When I try to train my model my memory overflows and this is fixed when I remove the mediapipe module.

Here is my cropping module:

def hand_crop(img, zoomout_ratio=2e-2):
    mpHands = mp.solutions.hands
    hands = mpHands.Hands(max_num_hands=10)
    
    x = img.shape[1]
    y = img.shape[0]

    zoomout_x = x * zoomout_ratio
    zoomout_y = y * zoomout_ratio

    imgRGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    
    img_padding = cv2.copyMakeBorder(imgRGB, math.ceil(zoomout_y), math.ceil(zoomout_y), math.ceil(zoomout_x), math.ceil(zoomout_x), cv2.BORDER_CONSTANT, (0, 0, 0))

    results = hands.process(img_padding)

    crop_result = []

    if results.multi_hand_landmarks:
        for hlm in results.multi_hand_landmarks:
            x_max = int(max([i.x for i in hlm.landmark]) * (x + 2 * zoomout_x))   #0 ta 1 mide * 480
            y_max = int(max([i.y for i in hlm.landmark]) * (y + 2 * zoomout_y))
            x_min = int(min([i.x for i in hlm.landmark]) * (x + 2 * zoomout_x))
            y_min = int(min([i.y for i in hlm.landmark]) * (y + 2 * zoomout_y))

            width, height = x_max - x_min, y_max - y_min

            if width > height:
                y_min -= (((width - height) / 2) + zoomout_y)
                y_min = math.floor(y_min)
                y_max += (((width - height) / 2) + zoomout_y)
                y_max = math.floor(y_max)
                x_min -= zoomout_x
                x_min = math.floor(x_min)
                x_max += zoomout_x
                x_max = math.floor(x_max)
            else:
                x_min -= (((height - width) / 2) + zoomout_x)
                x_min = math.floor(x_min)
                x_max += (((height - width) / 2) + zoomout_x)
                x_max = math.floor(x_max)
                y_min -= zoomout_y
                y_min = math.floor(y_min)
                y_max += zoomout_y
                y_max = math.floor(y_max)

            crop = img_padding[y_min:y_max, x_min:x_max, :]
            cropBGR = cv2.cvtColor(crop, cv2.COLOR_RGB2BGR)
            crop_result.append(cropBGR)
            
    return crop_result

Here is my dataset and model:


class DatasetPlus(Dataset):
    def __init__(self, root_img, root_data, width, height, transform=None):
        self.root_img = root_img
        self.root_data = root_data
        self.width = width
        self.height = height
        self.transform = transform
        # labels are stored in a csv file
        self.labels = pd.read_csv(self.root_data)
        self.imgs = [image for image in sorted(
            os.listdir(self.root_img)) if image[-4:] == '.jpg']
        self.len = len(self.imgs)

    def __len__(self):
        return self.len

    def __getitem__(self, idx):
        img_name = self.imgs[idx]
        img_path = os.path.join(self.root_img, img_name)
        img = cv2.imread(img_path)
        print(img_name)
        img = hand_crop(img)[0]
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB).astype(np.float32)
        img = cv2.resize(img, (self.width, self.height), cv2.INTER_AREA)
        img = np.array(img) / 255.0
        if self.transform is not None:
            img = self.transform(img)
        img_id = int(img_name[6:-4])
        label = self.labels.where(self.labels['ID'] == img_id)['Label'].dropna().to_numpy()[0]
        if label == 3: label = 0
        label = torch.tensor(label, dtype=torch.long)
        return img, label

class Net(nn.Module):
    def __init__(self, h, w):
        super().__init__()
        nw = (((w - 4) // 2) -4) // 2
        nh = (((h - 4) // 2) -4) // 2
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * nh * nw, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = (self.fc3(x))
        return x

Here is my training code:

transform = transforms.Compose(
    [transforms.ToTensor(),])
root_img = 'data/'
root_label = 'data.csv'
ds = DatasetPlus(root_img, root_label, 224, 224, transform=transform)


model = Net(224, 224)

trainloader = DataLoader(ds, batch_size=4, shuffle=True)

criterion = nn.CrossEntropyLoss()

optimizer = optim.SGD(model.parameters(), lr=1e-3)



def train_model(epochs):
    for epoch in range(epochs): 
        losses = 0.0 
        for i, data in enumerate(trainloader, 0):
            optimizer.zero_grad()
            img, label = data
            yhat = model(img)
            loss = criterion(yhat, label)
            loss.backward()
            optimizer.step()
            losses += loss.item()
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {losses:.3f}')
            losses = 0.0
        

train_model(10)

Why isn't the unused data in the hand crop module being automatically collected?

0 Answers0