
Python version: 3.8

PyTorch version: 1.9.0+cpu

Platform: Anaconda, Spyder 5.0

To reproduce this problem, just copy all of the code below into a single file.

The ILSVRC2012_val_00000293.jpg file used in this code is shown below; you will also need to download it and change its path in the code.

[Image: ILSVRC2012_val_00000293.jpg]

  • Some background on this problem:

I am working on a project that aims to develop a hardware accelerator to perform the inference of the MobileNet V2 network. I used the pretrained quantized PyTorch model to simulate the outcome, and the results come out very well.

[Image: classification outcome]

In order to complete this task in hardware, I wish to know every input and output as well as the intermediate variables produced while running this piece of PyTorch code. I used a package named torchextractor to fetch the output of the first layer, which in this case is a 3x3 convolution layer.

import numpy as np
import torchvision
import torch
from torchvision import transforms, datasets
from PIL import Image
from torchvision import transforms
import torchextractor as tx
import math
#########################################################################################
##### Processing of input image
#########################################################################################

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,])

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


#image file destination 
filename = "D:\Project_UM\MobileNet_VC709\MobileNet_pytorch\ILSVRC2012_val_00000293.jpg"
input_image = Image.open(filename)
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)
#########################################################################################
#########################################################################################
#########################################################################################

#---- First verify that the torchextractor wrapper does not influence the inference outcome

# ofmp of layer1 before putting into torchextractor
a,b,c = quantize_tensor(input_batch)# quantize the input tensor and return a uint8 tensor, its scale and zero point (quantize_tensor is defined in a later cell)
input_qa = torch.quantize_per_tensor(input_batch.clone().detach(), b, c, torch.quint8)# quantize the same input using torch.quantize_per_tensor

# Load a quantized mobilenet_v2 model
model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True)

model_quantized.eval()
with torch.no_grad():
    output = model_quantized.features[0][0](input_qa)# Ofmp of layer1, datatype : quantized_tensor

# print("FM of layer1 before tx_extractor:\n",output.int_repr())# Ofmp of layer1, datatype : int8 tensor
output1_clone = output.int_repr().detach().numpy()# Clone ofmp of layer1, datatype : ndarray



#########################################################################################
#########################################################################################
#########################################################################################

# ofmp of layer1 after adding torchextractor
model_quantized_ex = tx.Extractor(model_quantized, ["features.0.0"])#Capture of the module inside first layer
model_output, features = model_quantized_ex(input_batch)# Forward propagation
# feature_shapes = {name: f.shape for name, f in features.items()}
# print(features['features.0.0']) # Ofmp of layer1, datatype : quantized_tensor
out1_clone = features['features.0.0'].int_repr().numpy() # Clone ofmp of layer1, datatype : ndarray


if np.array_equal(out1_clone, output1_clone):
    print('Model with torchextractor attached outputs the same values as the original model')
else:
    print('The torchextractor wrapper influences the outcome')

Here I define a NumPy quantization scheme based on the scheme proposed in Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference. (Note that quantize_tensor is called in the cell above, so this cell has to be run before that one.)

# Convert a regular float tensor to a quantized uint8 tensor plus its scale and zero point
def quantize_tensor(x, num_bits=8):

    qmin = 0.
    qmax = 2.**num_bits - 1.
    min_val, max_val = x.min(), x.max()

    scale = (max_val - min_val) / (qmax - qmin)

    initial_zero_point = qmin - min_val / scale

    zero_point = 0
    if initial_zero_point < qmin:
        zero_point = qmin
    elif initial_zero_point > qmax:
        zero_point = qmax
    else:
        zero_point = initial_zero_point

    # print(zero_point)
    zero_point = int(zero_point)
    q_x = zero_point + x / scale
    q_x.clamp_(qmin, qmax).round_()
    q_x = q_x.round().byte()
    return q_x, scale, zero_point
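
As a quick sanity check (a minimal sketch, reusing a, b, c and input_qa from the earlier cell once both cells have run), this scheme can be compared against torch.quantize_per_tensor with the same scale and zero point; the two should agree except possibly for values sitting right on the clamping boundaries:

# Compare the NumPy-style quantization against torch.quantize_per_tensor
# (a and input_qa were produced in the earlier cell with the same scale b and zero point c)
mismatch = (a.numpy() != input_qa.int_repr().numpy()).sum()
print("quantize_tensor vs torch.quantize_per_tensor, differing entries:", mismatch)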

#%%
# #############################################################################################
# ---------  Simulate the inference process of layer0 (3x3 convolution) using numpy
# #############################################################################################


# get the input_batch quantized buffer data
input_scale = b.item()
input_zero  = c
input_quantized = a[0].detach().numpy()

# get the layer0 output scale and zero_point
output_scale = model_quantized.features[0][0].state_dict()['scale'].item()
output_zero  = model_quantized.features[0][0].state_dict()['zero_point'].item()

# get the quantized weight with scale and zero_point  
weight_scale = model_quantized.features[0][0].state_dict()["weight"].q_scale()
weight_zero  = model_quantized.features[0][0].state_dict()["weight"].q_zero_point()
weight_quantized = model_quantized.features[0][0].state_dict()["weight"].int_repr().numpy()
# print(weight_quantized)
# print(weight_quantized.shape)


# bias_quantized,bias_scale,bias_zero= quantize_tensor(model_quantized.features[0][0].state_dict()["bias"])# to quantize the input tensor and return an int8 tensor, scale and zero point
# print(bias_quantized.shape)
bias = model_quantized.features[0][0].state_dict()["bias"].detach().numpy()
# print(input_quantized)
print(type(input_scale))
print(type(output_scale))
print(type(weight_scale))
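
The output scale and zero point can also be read directly as attributes of the quantized module, which is what the answer below does; a minimal equivalent sketch:

# Same values as output_scale / output_zero above, read as module attributes
layer0 = model_quantized.features[0][0]
print(layer0.scale, layer0.zero_point)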

Then I write a quantized 2D convolution using NumPy, hoping to figure out every detail of the PyTorch data flow during inference.

#%% numpy simulated layer0 convolution function define

def conv_cal(input_quantized, weight_quantized, kernel_size, stride, out_i, out_j, out_k):
    weight = weight_quantized[out_i]
    input = np.zeros((input_quantized.shape[0], kernel_size, kernel_size))
    for i in range(weight.shape[0]):
        for j in range(weight.shape[1]):
            for k in range(weight.shape[2]):
                input[i][j][k] = input_quantized[i][stride*out_j+j][stride*out_k+k]
    # print(np.dot(weight,input))
    # print(input,"\n")
    # print(weight)

    return np.multiply(weight,input).sum()

def QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, kernel_size, stride, padding, ofm_size):
    output = np.zeros((weight_quantized.shape[0],ofm_size,ofm_size))

    input_quantized_padding = np.full((input_quantized.shape[0],input_quantized.shape[1]+2*padding,input_quantized.shape[2]+2*padding),0)
    zero_temp = np.full(input_quantized.shape,input_zero)
    input_quantized = input_quantized - zero_temp
    for i in range(input_quantized.shape[0]):
        for j in range(padding,padding + input_quantized.shape[1]):
            for k in range(padding,padding + input_quantized.shape[2]):
                input_quantized_padding[i][j][k] = input_quantized[i][j-padding][k-padding]

    zero_temp = np.full(weight_quantized.shape, weight_zero)
    weight_quantized = weight_quantized - zero_temp

    for i in range(output.shape[0]):
        for j in range(output.shape[1]):
            for k in range(output.shape[2]):
                # output[i][j][k] = (weight_scale*input_scale)*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i] #floating_output
                output[i][j][k] = weight_scale*input_scale/output_scale*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i]/output_scale + output_zero
                output[i][j][k] = round(output[i][j][k])
                # int_output
    return output  

Here I feed in the same image, weights, and bias together with their zero points and scales, then compare this "numpy simulated" result to the one calculated by PyTorch.

quantized_model_out1_int8 = np.squeeze(features['features.0.0'].int_repr().numpy())


print(quantized_model_out1_int8.shape)
print(quantized_model_out1_int8)
out1_np = QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, 3, 2, 1, 112)
np.save("out1_np.npy",out1_np)

for i in range(quantized_model_out1_int8.shape[0]):
    for j in range(quantized_model_out1_int8.shape[1]):
        for k in range(quantized_model_out1_int8.shape[2]):
            if(out1_np[i][j][k] < 0):
                out1_np[i][j][k] = 0

print(out1_np)

flag = np.zeros(quantized_model_out1_int8.shape)
for i in range(quantized_model_out1_int8.shape[0]):
    for j in range(quantized_model_out1_int8.shape[1]):
        for k in range(quantized_model_out1_int8.shape[2]):

            if(quantized_model_out1_int8[i][j][k] == out1_np[i][j][k]):
                flag[i][j][k] = 1
                out1_np[i][j][k] = 0
                quantized_model_out1_int8[i][j][k] = 0

# Compare the simulated result to the extractor-fetched result and get the total hit rate
print(flag.sum()/(112*112*32)*100,'%')

If the "numpy simulated" results are the same as the extracted one, call it a hit. Print the total hit rate, it shows that numpy gets 92% of the values right. Now the problem is, I have no idea why the rest 8% of values come out wrong.

[Image: hit rate output]
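
To see how far off the mismatching values are, the absolute differences can be tabulated (a small sketch reusing the arrays from above; since the loop above already zeroed the matching positions in both arrays, every nonzero difference corresponds to a mismatch):

# Tabulate the magnitude of the remaining differences between the NumPy and PyTorch results
diff = np.abs(out1_np - quantized_model_out1_int8.astype(np.int32))
values, counts = np.unique(diff[diff > 0], return_counts=True)
for v, c in zip(values, counts):
    print("off by", int(v), ":", int(c), "values")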

  • Comparison of the two outcomes: the picture below shows the values that differ between the NumPy result and the PyTorch result; the sample channel is index [1]. The upper left corner is the NumPy result and the upper right corner is the PyTorch result. I have set all values that are identical between the two to 0. As you can see, most of the differing values are off by just 1 (this can be viewed as error introduced by the precision loss of fixed-point arithmetic), but some have large differences, e.g. value[1][4]: 121 vs. 76 (I don't know why). [Image: comparison of the NumPy and PyTorch outputs for channel 1]

  • Focus on one strange value: the code below replays the calculation of value[1][4]. Originally I expected that a trial-and-error process would lead me to the solution, i.e. reproduce the expected value of 76, but no matter what I tried, it never output 76. I paste the code here for your convenience if you want to try it.

    #%% A test code to check the calculation process
    weight_quantized_sample = weight_quantized[2]
    M_t = input_scale * weight_scale / output_scale
    ifmap_t = np.int32(input_quantized[:,1:4,7:10])
    weight_t = np.int32(weight_quantized_sample)
    bias_t = bias[2]
    bias_q = bias_t/output_scale
    res_t = 0
    for ch in range(3):
        ifmap_offset = ifmap_t[ch]-np.int32(input_zero)
        weight_offset = weight_t[ch]-np.int32(weight_zero)
        res_ch = np.multiply(ifmap_offset, weight_offset)
        res_ch = res_ch.sum()
        res_t = res_t + res_ch
    res_mul = M_t*res_t
    # for n in range(1, 30):
    #     res_mul = multiply(n, M_t, res_t)
    res_t = round(res_mul + output_zero + bias_q)
    print(res_t)

Could you help me out with this? I have been stuck here for a long time.

  • Hello. Were you able to resolve your issue? I've added an answer to the question. Your link to the paper helped a lot. In my experiments I didn't encounter anything strange when implementing convolution, fully connected layers, average pooling and max pooling, THOUGH I did face them when implementing the nn.quantized.FloatFunctional().add() function needed for ResNet blocks. I don't know yet how to simulate this function. One of the issues is that when the result exceeds 256 it is magically reduced to some other value. – Roman Malashin Jan 13 '22 at 07:40

1 Answer


I implemented my own version of quantized convolution and got from 99.999% to 100% hit rate (the single mismatching value is off by 1, which I consider to be a rounding issue). The link to the paper in the question helped a lot.

But I found that your formulas are the same as mine, so I don't know what your issue was. As I understand it, quantization in PyTorch is hardware dependent.
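
Since the behavior can depend on the backend, it may be worth checking which quantized engine PyTorch dispatches to on your machine (a small aside, not part of the original code):

import torch

# List the quantized backends available on this machine and the one currently in use
# (typically 'fbgemm' on x86 and 'qnnpack' on ARM)
print(torch.backends.quantized.supported_engines)
print(torch.backends.quantized.engine)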

Here is my code:

def my_Conv2dRelu_b2(input_q, conv_layer, output_shape):
    '''
    Args:
        input_q: quantized input tensor
        conv_layer: quantized convolution layer (its weights, scales and zero points are read below)
        output_shape: the pre-computed shape of the result

    Returns:
        the simulated quantized output feature map as a numpy array
    '''
    output = np.zeros(output_shape)

    # extract needed float numbers from quantized operations
    weights_scale = conv_layer.weight().q_per_channel_scales()
    input_scale = input_q.q_scale()
    weights_zp = conv_layer.weight().q_per_channel_zero_points()
    input_zp = input_q.q_zero_point()

    # extract needed convolution parameters
    padding = conv_layer.padding
    stride = conv_layer.stride

    # extract float numbers for results
    output_zp = conv_layer.zero_point
    output_scale = conv_layer.scale
    conv_weights_int = conv_layer.weight().int_repr()
    input_int = input_q.int_repr()

    biases = conv_layer.bias().numpy()
    for k in range(input_q.shape[0]):
        for i in range(conv_weights_int.shape[0]):
            output[k][i] = manual_convolution_quant(
                input_int[k].numpy(),
                conv_weights_int[i].numpy(),
                biases[i],
                padding=padding,
                stride=stride,
                image_zp=input_zp, image_scale=input_scale,
                kernel_zp=weights_zp[i].item(), kernel_scale=weights_scale[i].item(),
                result_zp=output_zp, result_scale=output_scale
            )
    return output


def manual_convolution_quant(image, kernel, b, padding, stride, image_zp, image_scale, kernel_zp, kernel_scale,
                             result_zp, result_scale):
    H = image.shape[1]
    W = image.shape[2]
    new_H = H // stride[0]
    new_W = W // stride[1]
    results = np.zeros([new_H, new_W])

    M = image_scale * kernel_scale / result_scale
    bias = b / result_scale
    paddedIm = np.pad(
        image,
        [(0, 0), (padding[0], padding[0]), (padding[1], padding[1])],
        mode="constant",
        constant_values=image_zp,
    )
    s = kernel.shape[1]
    for i in range(new_H):
        for j in range(new_W):
            patch = paddedIm[
                    :, i * stride[0]: i * stride[0] + s, j * stride[1]: j * stride[1] + s
                    ]
            res = M * ((kernel - kernel_zp) * (patch - image_zp)).sum() + result_zp + bias
            if res < 0:
                res = 0
            results[i, j] = round(res)

    return results

Code to compare PyTorch's result with my own version:

def calc_hit_rate(array1, array2):
    # count of matching entries divided by the total number of entries
    good = (array1 == array2).astype(np.int32).sum()
    total = array1.size
    return good / total


# during inference
y2 = model.conv1(y1)
y2_int = torch.int_repr(y2)

y2_int_manual = my_Conv2dRelu_b2(y1, model.conv1, y2.shape)
print(f'y2 hit rate= {calc_hit_rate(y2.int_repr().numpy(), y2_int_manual)}') #hit_rate=1.0