
I have trained a model in PyTorch with the float data type. I want to improve my inference time by converting this model to a quantized model. I used the torch.quantization.convert API to convert my model's weights to the uint8 data type. However, when I use this model for inference, I do not get any performance improvement. Am I doing something wrong here?

The UNet model code:

def gen_initialization(m):
    if type(m) == nn.Conv2d:
        sh = m.weight.shape
        nn.init.normal_(m.weight, std=math.sqrt(2.0 / (sh[0]*sh[2]*sh[3])))
        nn.init.constant_(m.bias, 0)
    elif type(m) == nn.BatchNorm2d:
        nn.init.constant_(m.weight, 1)
        nn.init.constant_(m.bias, 0)

class TripleConv(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(TripleConv, self).__init__()
        mid_ch = (in_ch + out_ch) // 2
        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, mid_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=mid_ch),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Conv2d(mid_ch, mid_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=mid_ch),
            nn.LeakyReLU(negative_slope=0.1),
            nn.Conv2d(mid_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=out_ch),
            nn.LeakyReLU(negative_slope=0.1)
        )
        self.conv.apply(gen_initialization)

    def forward(self, x):
        return self.conv(x)


class Down(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Down, self).__init__()
        self.triple_conv = TripleConv(in_ch, out_ch)
        self.avg_pool_conv = nn.AvgPool2d(2, 2)
        self.in_ch = in_ch
        self.out_ch = out_ch

    def forward(self, x):
        self.cache = self.triple_conv(x)
        pad = torch.zeros(x.shape[0], self.out_ch - self.in_ch, x.shape[2], x.shape[3], device=x.device)
        x = torch.cat((x, pad), dim=1)
        self.cache += x
        return self.avg_pool_conv(self.cache)


class Center(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Center, self).__init__()

        self.conv = nn.Sequential(
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1, bias=True),
            nn.BatchNorm2d(num_features=out_ch),
            nn.LeakyReLU(negative_slope=0.1, inplace=True)
        )
        self.conv.apply(gen_initialization)

    def forward(self, x):
        return self.conv(x)


class Up(nn.Module):
    def __init__(self, in_ch, out_ch):
        super(Up, self).__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode='bilinear',
                                    align_corners=True)
        self.triple_conv = TripleConv(in_ch, out_ch)

    def forward(self, x, cache):
        x = self.upsample(x)
        x = torch.cat((x, cache), dim=1)
        x = self.triple_conv(x)
        return x


class UNet(nn.Module):
    def __init__(self, in_ch, first_ch=None):
        super(UNet, self).__init__()

        if not first_ch:
            first_ch = 32

        self.down1 = Down(in_ch, first_ch)
        self.down2 = Down(first_ch, first_ch*2)
        self.down3 = Down(first_ch*2, first_ch*4)
        self.down4 = Down(first_ch*4, first_ch*8)
        self.center = Center(first_ch*8, first_ch*8)
        self.up4 = Up(first_ch*8*2, first_ch*4)
        self.up3 = Up(first_ch*4*2, first_ch*2)
        self.up2 = Up(first_ch*2*2, first_ch)
        self.up1 = Up(first_ch*2, first_ch)
        self.output = nn.Conv2d(first_ch, in_ch, kernel_size=3, stride=1,
                                padding=1, bias=True)
        self.output.apply(gen_initialization)

    def forward(self, x):
        x = self.down1(x)
        x = self.down2(x)
        x = self.down3(x)
        x = self.down4(x)
        x = self.center(x)
        x = self.up4(x, self.down4.cache)
        x = self.up3(x, self.down3.cache)
        x = self.up2(x, self.down2.cache)
        x = self.up1(x, self.down1.cache)
        return self.output(x)

The inference code:

from tqdm import tqdm
import os
import numpy as np
import torch
import gan_network
import torch.nn.parallel
from torch.utils.data import DataLoader
import torch.utils.data as data
import random
import glob
import scipy.io
import time
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   # see issue #152
os.environ["CUDA_VISIBLE_DEVICES"]="0"


class DataFolder(data.Dataset):
    def __init__(self, file):
        super(DataFolder, self).__init__()
        self.image_names = []
        fid = file
        for line in fid:
            # line = line[:-1]
            if line == '':
                continue
            # print(line)
            self.image_names.append(line)
        random.shuffle(self.image_names)
        self.image_names = self.image_names[0:]

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, index):
        path = self.image_names[index]
        img = np.load(path)
        img = np.rollaxis(img, 2, 0)
        img = torch.from_numpy(img[:, :, :])
        return img, path


if __name__ == '__main__':
    batch_size = 1
    image_size = 2048
    channels = 6
    model_path = 'D:/WorkProjects/Network_Training_Aqusens/FullFovReconst/network/network_epoch9.pth'
    test_data = glob.glob('D:/save/temp/*.npy')
    dest_dir = 'D:/save/temp/results/'

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    net = gan_network.UNet(6, 32)
    if torch.cuda.device_count() > 1:
        net = torch.nn.DataParallel(net)
    net.to(device)
    net.load_state_dict(torch.load(model_path))
    quantized_model = torch.quantization.convert(net, {torch.nn.Conv2d, torch.nn.BatchNorm2d}, inplace=False)

    dataset = DataFolder(file=test_data)
    print(f'{len(dataset)}')
    data_loader = DataLoader(dataset=dataset, num_workers=4,
                             batch_size=batch_size, shuffle=False,
                             drop_last=False, pin_memory=True)
    input = torch.Tensor(batch_size, channels, image_size, image_size).to(device)

    t0 = time.time()
    with torch.no_grad():
        for i, batch in enumerate(tqdm(data_loader)):
            input.copy_(batch[0])
            output = net(input).cpu().clone().numpy()
            np.array(output)
            output = np.rollaxis(output, 1, 4)
            for num in range(batch_size):
                arr = output[num, :, :, :]
                file_name = os.path.basename(batch[1][num])
                save_name = os.path.join(dest_dir, file_name)
                save_name = save_name.replace(".npy", "")
                scipy.io.savemat(save_name+'.mat', {'output': arr})
    t1 = time.time()
    print(f'Elapsed time = {t1-t0}')

For both net and quantized_model, I get an elapsed time of around 30 seconds for 12 images passed through them.

Dhruvin Naik

3 Answers


The PyTorch documentation suggests three ways to perform quantization. You are doing post-training dynamic quantization (the simplest quantization method available), which only supports torch.nn.Linear and torch.nn.LSTM layers, as listed here. To quantize CNN layers, you need one of the other two techniques, which do support convolutional layers: post-training static quantization and quantization-aware training. This tutorial shows both techniques applied to CNNs.
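For concreteness, here is a minimal eager-mode sketch of post-training static quantization, assuming a calibration_loader that yields representative batches (the wrapper class and qconfig choice are illustrative, not the only options):

import torch
import torch.nn as nn
import gan_network

class QuantWrapper(nn.Module):
    """Wraps a float model with quant/dequant stubs at its boundaries."""
    def __init__(self, model):
        super().__init__()
        self.quant = torch.quantization.QuantStub()      # float32 -> int8 at the input
        self.model = model
        self.dequant = torch.quantization.DeQuantStub()  # int8 -> float32 at the output

    def forward(self, x):
        return self.dequant(self.model(self.quant(x)))

model_fp32 = QuantWrapper(gan_network.UNet(6, 32)).eval()

# Static quantization targets CPU backends: 'fbgemm' on x86, 'qnnpack' on ARM.
model_fp32.qconfig = torch.quantization.get_default_qconfig('fbgemm')

# Insert observers that record activation ranges during calibration.
prepared = torch.quantization.prepare(model_fp32)
with torch.no_grad():
    for img, _ in calibration_loader:
        prepared(img)

# Swap float modules for quantized ones using the observed ranges.
model_int8 = torch.quantization.convert(prepared)

Two caveats: the converted model runs on CPU only (a quantized model cannot be moved to CUDA), and this particular UNet would additionally need its Conv2d+BatchNorm2d pairs fused with torch.quantization.fuse_modules and its torch.cat / += ops routed through torch.nn.quantized.FloatFunctional before eager-mode conversion works end to end.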


If there is no improvement in performance and you are using Intel hardware, check out the Intel Extension for PyTorch (https://github.com/intel/intel-extension-for-pytorch). The extension provides built-in quantization that delivers good statistical accuracy for the most popular DL workloads, including CNN, NLP, and recommendation models.
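As a rough sketch of the extension's static-quantization flow (the quantization entry points have moved between IPEX releases, so treat the names below as an assumption to verify against the docs for your installed version):

import torch
import intel_extension_for_pytorch as ipex
from intel_extension_for_pytorch.quantization import prepare, convert
import gan_network

model = gan_network.UNet(6, 32).eval()
example_input = torch.randn(1, 6, 2048, 2048)  # batch/channels/size taken from the question

# Prepare the model with IPEX's default static qconfig and insert observers.
qconfig = ipex.quantization.default_static_qconfig
prepared = prepare(model, qconfig, example_inputs=example_input, inplace=False)

# Calibrate on a few representative batches (calibration_loader is assumed).
with torch.no_grad():
    for img, _ in calibration_loader:
        prepared(img)

quantized = convert(prepared)

# Trace and freeze for the lowest CPU latency.
with torch.no_grad():
    traced = torch.jit.trace(quantized, example_input)
    traced = torch.jit.freeze(traced)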

Ramya R

I have tried the static quantization approach on YOLOv5: it cuts model size by 73% and decreases inference time by roughly 13-15%. It seems to me that conv layers gain less in latency than linear layers, because the same static approach on a toy fully-dense MNIST net decreased inference time by a factor of 4.
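The linear-layer gain is easy to reproduce with one-line dynamic quantization on a small fully-connected net; this is an illustrative sketch with made-up layer sizes, not the exact MNIST model from that experiment:

import time
import torch
import torch.nn as nn

# Toy MNIST-style MLP (hypothetical sizes, for illustration only).
mlp = nn.Sequential(
    nn.Linear(784, 512), nn.ReLU(),
    nn.Linear(512, 256), nn.ReLU(),
    nn.Linear(256, 10),
).eval()

# Dynamic quantization: weights stored as int8, activations quantized on the fly.
mlp_int8 = torch.quantization.quantize_dynamic(mlp, {nn.Linear}, dtype=torch.qint8)

x = torch.randn(256, 784)
for model, name in [(mlp, 'float32'), (mlp_int8, 'int8 dynamic')]:
    with torch.no_grad():
        t0 = time.time()
        for _ in range(100):
            model(x)
    print(f'{name}: {time.time() - t0:.3f}s')

On most CPUs the int8 version is noticeably faster, which is consistent with fully-connected layers benefiting more from quantization than convolutions.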

Poe Dator