
Python version: 3.8

PyTorch version: 1.9.0+cpu

Platform: Anaconda, Spyder 5.0

To reproduce this problem, just copy all of the code below into a single file.

The ILSVRC2012_val_00000293.jpg file used in this code is shown below; you will also need to download it and change its path in the code.

[Image: ILSVRC2012_val_00000293.jpg]

  • Some background on this problem:

I am working on a project that aims to develop a hardware accelerator to perform the inference of the MobileNet V2 network. I used the pretrained quantized PyTorch model to simulate the outcome, and the results come out very well.

[Image: classification outcome]

In order to complete this task in hardware, I wish to know every input and output as well as the intermediate variables produced while running this piece of PyTorch code. I used a package named torchextractor to fetch the output of the first layer, which in this case is a 3x3 convolution layer.

import numpy as np
import torchvision
import torch
from torchvision import transforms, datasets
from PIL import Image
from torchvision import transforms
import torchextractor as tx
import math
#########################################################################################
##### Processing of input image
#########################################################################################

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        normalize,])

preprocess = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])


#image file destination 
filename = "D:\Project_UM\MobileNet_VC709\MobileNet_pytorch\ILSVRC2012_val_00000293.jpg"
input_image = Image.open(filename)
input_tensor = preprocess(input_image)
input_batch = input_tensor.unsqueeze(0)
#########################################################################################
#########################################################################################
#########################################################################################

#---- First verify that the torchextractor wrapper does not influence the inference outcome

# ofmp of layer1 before putting into torchextractor
a,b,c = quantize_tensor(input_batch)# quantize the input tensor and return a uint8 tensor, its scale and zero point (quantize_tensor is defined in a later cell)
input_qa = torch.quantize_per_tensor(input_batch.clone().detach(), b, c, torch.quint8)# quantize the same input using torch.quantize_per_tensor

# Load a quantized mobilenet_v2 model
model_quantized = torchvision.models.quantization.mobilenet_v2(pretrained=True, quantize=True)

model_quantized.eval()
with torch.no_grad():
    output = model_quantized.features[0][0](input_qa)# Ofmp of layer1, datatype : quantized_tensor

# print("FM of layer1 before tx_extractor:\n",output.int_repr())# Ofmp of layer1, datatype : int8 tensor
output1_clone = output.int_repr().detach().numpy()# Clone ofmp of layer1, datatype : ndarray



#########################################################################################
#########################################################################################
#########################################################################################

# ofmp of layer1 after adding torchextractor
model_quantized_ex = tx.Extractor(model_quantized, ["features.0.0"])#Capture of the module inside first layer
model_output, features = model_quantized_ex(input_batch)# Forward propagation
# feature_shapes = {name: f.shape for name, f in features.items()}
# print(features['features.0.0']) # Ofmp of layer1, datatype : quantized_tensor
out1_clone = features['features.0.0'].int_repr().numpy() # Clone ofmp of layer1, datatype : ndarray


if np.array_equal(out1_clone, output1_clone):
    print('Model with torchextractor attached outputs the same values as the original model')
else:
    print('The torchextractor wrapper influences the outcome')

Here I define a NumPy quantization scheme based on the scheme proposed in Quantization and Training of Neural Networks for Efficient Integer-Arithmetic-Only Inference. (Note that quantize_tensor is called in the cell above, so this cell has to be run before that one.)

# Convert a regular float tensor to a quantized uint8 tensor plus its scale and zero point
def quantize_tensor(x, num_bits=8):

    qmin = 0.
    qmax = 2.**num_bits - 1.
    min_val, max_val = x.min(), x.max()

    scale = (max_val - min_val) / (qmax - qmin)

    initial_zero_point = qmin - min_val / scale

    zero_point = 0
    if initial_zero_point < qmin:
        zero_point = qmin
    elif initial_zero_point > qmax:
        zero_point = qmax
    else:
        zero_point = initial_zero_point

    # print(zero_point)
    zero_point = int(zero_point)
    q_x = zero_point + x / scale
    q_x.clamp_(qmin, qmax).round_()
    q_x = q_x.round().byte()
    return q_x, scale, zero_point
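
As a quick sanity check (a minimal sketch, reusing a, b, c and input_qa from the earlier cell once both cells have run), this scheme can be compared against torch.quantize_per_tensor with the same scale and zero point; the two should agree except possibly for values sitting right on the clamping boundaries:

# Compare the NumPy-style quantization against torch.quantize_per_tensor
# (a and input_qa were produced in the earlier cell with the same scale b and zero point c)
mismatch = (a.numpy() != input_qa.int_repr().numpy()).sum()
print("quantize_tensor vs torch.quantize_per_tensor, differing entries:", mismatch)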

#%%
# #############################################################################################
# ---------  Simulate the inference process of layer0 (3x3 convolution) using numpy
# #############################################################################################


# get the input_batch quantized buffer data
input_scale = b.item()
input_zero  = c
input_quantized = a[0].detach().numpy()

# get the layer0 output scale and zero_point
output_scale = model_quantized.features[0][0].state_dict()['scale'].item()
output_zero  = model_quantized.features[0][0].state_dict()['zero_point'].item()

# get the quantized weight with scale and zero_point  
weight_scale = model_quantized.features[0][0].state_dict()["weight"].q_scale()
weight_zero  = model_quantized.features[0][0].state_dict()["weight"].q_zero_point()
weight_quantized = model_quantized.features[0][0].state_dict()["weight"].int_repr().numpy()
# print(weight_quantized)
# print(weight_quantized.shape)


# bias_quantized,bias_scale,bias_zero= quantize_tensor(model_quantized.features[0][0].state_dict()["bias"])# to quantize the input tensor and return an int8 tensor, scale and zero point
# print(bias_quantized.shape)
bias = model_quantized.features[0][0].state_dict()["bias"].detach().numpy()
# print(input_quantized)
print(type(input_scale))
print(type(output_scale))
print(type(weight_scale))
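
The output scale and zero point can also be read directly as attributes of the quantized module, which is what the answer below does; a minimal equivalent sketch:

# Same values as output_scale / output_zero above, read as module attributes
layer0 = model_quantized.features[0][0]
print(layer0.scale, layer0.zero_point)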

Then I write a quantized 2D convolution using NumPy, hoping to figure out every detail of the PyTorch data flow during inference.

#%% numpy simulated layer0 convolution function define

def conv_cal(input_quantized, weight_quantized, kernel_size, stride, out_i, out_j, out_k):
    weight = weight_quantized[out_i]
    input = np.zeros((input_quantized.shape[0], kernel_size, kernel_size))
    for i in range(weight.shape[0]):
        for j in range(weight.shape[1]):
            for k in range(weight.shape[2]):
                input[i][j][k] = input_quantized[i][stride*out_j+j][stride*out_k+k]
    # print(np.dot(weight,input))
    # print(input,"\n")
    # print(weight)

    return np.multiply(weight,input).sum()

def QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, kernel_size, stride, padding, ofm_size):
    output = np.zeros((weight_quantized.shape[0],ofm_size,ofm_size))

    input_quantized_padding = np.full((input_quantized.shape[0],input_quantized.shape[1]+2*padding,input_quantized.shape[2]+2*padding),0)
    zero_temp = np.full(input_quantized.shape,input_zero)
    input_quantized = input_quantized - zero_temp
    for i in range(input_quantized.shape[0]):
        for j in range(padding,padding + input_quantized.shape[1]):
            for k in range(padding,padding + input_quantized.shape[2]):
                input_quantized_padding[i][j][k] = input_quantized[i][j-padding][k-padding]

    zero_temp = np.full(weight_quantized.shape, weight_zero)
    weight_quantized = weight_quantized - zero_temp

    for i in range(output.shape[0]):
        for j in range(output.shape[1]):
            for k in range(output.shape[2]):
                # output[i][j][k] = (weight_scale*input_scale)*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i] #floating_output
                output[i][j][k] = weight_scale*input_scale/output_scale*conv_cal(input_quantized_padding, weight_quantized, kernel_size, stride, i, j, k) + bias[i]/output_scale + output_zero
                output[i][j][k] = round(output[i][j][k])
                # int_output
    return output  

Here I feed in the same image, weights, and bias together with their zero points and scales, then compare this "numpy simulated" result to the one calculated by PyTorch.

quantized_model_out1_int8 = np.squeeze(features['features.0.0'].int_repr().numpy())


print(quantized_model_out1_int8.shape)
print(quantized_model_out1_int8)
out1_np = QuantizedConv2D(input_scale, input_zero, input_quantized, output_scale, output_zero, weight_scale, weight_zero, weight_quantized, bias, 3, 2, 1, 112)
np.save("out1_np.npy",out1_np)

for i in range(quantized_model_out1_int8.shape[0]):
    for j in range(quantized_model_out1_int8.shape[1]):
        for k in range(quantized_model_out1_int8.shape[2]):
            if(out1_np[i][j][k] < 0):
                out1_np[i][j][k] = 0

print(out1_np)

flag = np.zeros(quantized_model_out1_int8.shape)
for i in range(quantized_model_out1_int8.shape[0]):
    for j in range(quantized_model_out1_int8.shape[1]):
        for k in range(quantized_model_out1_int8.shape[2]):

            if(quantized_model_out1_int8[i][j][k] == out1_np[i][j][k]):
                flag[i][j][k] = 1
                out1_np[i][j][k] = 0
                quantized_model_out1_int8[i][j][k] = 0

# Compare the simulated result to the extractor-fetched result and get the total hit rate
print(flag.sum()/(112*112*32)*100,'%')

If the "numpy simulated" results are the same as the extracted one, call it a hit. Print the total hit rate, it shows that numpy gets 92% of the values right. Now the problem is, I have no idea why the rest 8% of values come out wrong.

[Image: hit rate output]
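
To see how far off the mismatching values are, the absolute differences can be tabulated (a small sketch reusing the arrays from above; since the loop above already zeroed the matching positions in both arrays, every nonzero difference corresponds to a mismatch):

# Tabulate the magnitude of the remaining differences between the NumPy and PyTorch results
diff = np.abs(out1_np - quantized_model_out1_int8.astype(np.int32))
values, counts = np.unique(diff[diff > 0], return_counts=True)
for v, c in zip(values, counts):
    print("off by", int(v), ":", int(c), "values")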

  • Comparison of the two outcomes: the picture below shows the values that differ between the NumPy result and the PyTorch result; the sample channel is index [1]. The upper left corner is the NumPy result and the upper right corner is the PyTorch result. I have set all values that are identical between the two to 0. As you can see, most of the differing values are off by just 1 (this can be viewed as error introduced by the precision loss of fixed-point arithmetic), but some have large differences, e.g. value[1][4]: 121 vs. 76 (I don't know why). [Image: comparison of the NumPy and PyTorch outputs for channel 1]

  • Focus on one strange value: the code below replays the calculation of value[1][4]. Originally I expected that a trial-and-error process would lead me to the solution, i.e. reproduce the expected value of 76, but no matter what I tried, it never output 76. I paste the code here for your convenience if you want to try it.

    #%% A test code to check the calculation process
    weight_quantized_sample = weight_quantized[2]
    M_t = input_scale * weight_scale / output_scale
    ifmap_t = np.int32(input_quantized[:,1:4,7:10])
    weight_t = np.int32(weight_quantized_sample)
    bias_t = bias[2]
    bias_q = bias_t/output_scale
    res_t = 0
    for ch in range(3):
        ifmap_offset = ifmap_t[ch]-np.int32(input_zero)
        weight_offset = weight_t[ch]-np.int32(weight_zero)
        res_ch = np.multiply(ifmap_offset, weight_offset)
        res_ch = res_ch.sum()
        res_t = res_t + res_ch
    res_mul = M_t*res_t
    # for n in range(1, 30):
    #     res_mul = multiply(n, M_t, res_t)
    res_t = round(res_mul + output_zero + bias_q)
    print(res_t)

Could you help me out with this? I have been stuck here for a long time.

  • Hello. Were you able to resolve your issue? I've added an answer to the question. Your link to the paper helped a lot. In my experiments I didn't encounter anything strange when implementing convolution, fully connected layers, average pooling and max pooling, THOUGH I did face them when implementing the nn.quantized.FloatFunctional().add() function needed for ResNet blocks. I don't know yet how to simulate this function. One of the issues is that when the result exceeds 256 it is magically reduced to some other value. – Roman Malashin Jan 13 '22 at 07:40

1 Answer


I implemented my own version of quantized convolution and got from 99.999% to 100% hit rate (the single mismatching value is off by 1, which I consider to be a rounding issue). The link to the paper in the question helped a lot.

But I found that your formulas are the same as mine, so I don't know what your issue was. As I understand it, quantization in PyTorch is hardware dependent.
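
Since the behavior can depend on the backend, it may be worth checking which quantized engine PyTorch dispatches to on your machine (a small aside, not part of the original code):

import torch

# List the quantized backends available on this machine and the one currently in use
# (typically 'fbgemm' on x86 and 'qnnpack' on ARM)
print(torch.backends.quantized.supported_engines)
print(torch.backends.quantized.engine)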

Here is my code:

def my_Conv2dRelu_b2(input_q, conv_layer, output_shape):
    '''
    Args:
        input_q: quantized input tensor
        conv_layer: quantized convolution layer (its weights, scales and zero points are read below)
        output_shape: the pre-computed shape of the result

    Returns:
        the simulated quantized output feature map as a numpy array
    '''
    output = np.zeros(output_shape)

    # extract needed float numbers from quantized operations
    weights_scale = conv_layer.weight().q_per_channel_scales()
    input_scale = input_q.q_scale()
    weights_zp = conv_layer.weight().q_per_channel_zero_points()
    input_zp = input_q.q_zero_point()

    # extract needed convolution parameters
    padding = conv_layer.padding
    stride = conv_layer.stride

    # extract float numbers for results
    output_zp = conv_layer.zero_point
    output_scale = conv_layer.scale
    conv_weights_int = conv_layer.weight().int_repr()
    input_int = input_q.int_repr()

    biases = conv_layer.bias().numpy()
    for k in range(input_q.shape[0]):
        for i in range(conv_weights_int.shape[0]):
            output[k][i] = manual_convolution_quant(
                input_int[k].numpy(),
                conv_weights_int[i].numpy(),
                biases[i],
                padding=padding,
                stride=stride,
                image_zp=input_zp, image_scale=input_scale,
                kernel_zp=weights_zp[i].item(), kernel_scale=weights_scale[i].item(),
                result_zp=output_zp, result_scale=output_scale
            )
    return output


def manual_convolution_quant(image, kernel, b, padding, stride, image_zp, image_scale, kernel_zp, kernel_scale,
                             result_zp, result_scale):
    H = image.shape[1]
    W = image.shape[2]
    new_H = H // stride[0]
    new_W = W // stride[1]
    results = np.zeros([new_H, new_W])

    M = image_scale * kernel_scale / result_scale
    bias = b / result_scale
    paddedIm = np.pad(
        image,
        [(0, 0), (padding[0], padding[0]), (padding[1], padding[1])],
        mode="constant",
        constant_values=image_zp,
    )
    s = kernel.shape[1]
    for i in range(new_H):
        for j in range(new_W):
            patch = paddedIm[
                    :, i * stride[0]: i * stride[0] + s, j * stride[1]: j * stride[1] + s
                    ]
            res = M * ((kernel - kernel_zp) * (patch - image_zp)).sum() + result_zp + bias
            if res < 0:
                res = 0
            results[i, j] = round(res)

    return results

Code to compare PyTorch's result with my own version:

def calc_hit_rate(array1, array2):
    # count of matching entries divided by the total number of entries
    good = (array1 == array2).astype(np.int32).sum()
    total = array1.size
    return good / total


# during inference
y2 = model.conv1(y1)
y2_int = torch.int_repr(y2)

y2_int_manual = my_Conv2dRelu_b2(y1, model.conv1, y2.shape)
print(f'y2 hit rate= {calc_hit_rate(y2.int_repr().numpy(), y2_int_manual)}') #hit_rate=1.0