This should really be a question for the NVIDIA team, but they are notoriously bad at providing support, so I am hoping instead that someone intimately familiar with the TensorRT C++ API can help me out.
I am unable to put a full minimal reproducible example in this post, as there is a lot of boilerplate code involved, especially since I want to demonstrate both the FP16 and INT8 calibration cases. But wait, before you close the question, I do have a minimal reproducible example on GitHub here. It's a project (150 stars and counting) whose intention is to teach and help others use the TensorRT API (so by helping me solve this, you will actually help countless others too - win win!). The project has a very thorough readme for how to get started, and the code is well documented (and it's only 3 code files in the /src dir).
Anyway, I am successfully able to run FP32 and FP16 inference (in the repo I use the arcface face recognition model). However, when I try INT8 quantization, things fall apart.
I define a class called Int8EntropyCalibrator2 which extends nvinfer1::IInt8EntropyCalibrator2. The class is used for reading calibration data into GPU memory and providing it to TensorRT via the getBatch method:
// Class used for int8 calibration
class Int8EntropyCalibrator2 : public nvinfer1::IInt8EntropyCalibrator2 {
public:
    Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH, const std::string& calibDataDirPath,
                           const std::string& calibTableName, const std::string& inputBlobName,
                           const std::array<float, 3>& subVals = {0.f, 0.f, 0.f},
                           const std::array<float, 3>& divVals = {1.f, 1.f, 1.f},
                           bool normalize = true, bool readCache = true);
    virtual ~Int8EntropyCalibrator2();

    // Abstract base class methods which must be implemented
    int32_t getBatchSize() const noexcept override;
    bool getBatch(void* bindings[], const char* names[], int32_t nbBindings) noexcept override;
    const void* readCalibrationCache(std::size_t& length) noexcept override;
    void writeCalibrationCache(const void* ptr, std::size_t length) noexcept override;

private:
    const int32_t m_batchSize;
    const int32_t m_inputW;
    const int32_t m_inputH;
    int32_t m_imgIdx;
    std::vector<std::string> m_imgPaths;
    size_t m_inputCount;
    const std::string m_calibTableName;
    const std::string m_inputBlobName;
    const std::array<float, 3> m_subVals;
    const std::array<float, 3> m_divVals;
    const bool m_normalize;
    const bool m_readCache;
    void* m_deviceInput;
    std::vector<char> m_calibCache;
};
The implementation for said class is as follows:
Int8EntropyCalibrator2::Int8EntropyCalibrator2(int32_t batchSize, int32_t inputW, int32_t inputH,
                                               const std::string& calibDataDirPath,
                                               const std::string& calibTableName,
                                               const std::string& inputBlobName,
                                               const std::array<float, 3>& subVals,
                                               const std::array<float, 3>& divVals,
                                               bool normalize,
                                               bool readCache)
    : m_batchSize(batchSize)
    , m_inputW(inputW)
    , m_inputH(inputH)
    , m_imgIdx(0)
    , m_calibTableName(calibTableName)
    , m_inputBlobName(inputBlobName)
    , m_subVals(subVals)
    , m_divVals(divVals)
    , m_normalize(normalize)
    , m_readCache(readCache) {
    // Allocate GPU memory to hold the entire batch
    m_inputCount = 3 * inputW * inputH * batchSize;
    checkCudaErrorCode(cudaMalloc(&m_deviceInput, m_inputCount * sizeof(float)));

    // Read the names of all the files in the specified directory
    if (!doesFileExist(calibDataDirPath)) {
        throw std::runtime_error("Error, directory at provided path does not exist: " + calibDataDirPath);
    }
    m_imgPaths = getFilesInDirectory(calibDataDirPath);
    if (m_imgPaths.size() < static_cast<size_t>(batchSize)) {
        throw std::runtime_error("There are fewer calibration images than the specified batch size!");
    }

    // Randomize the order of the calibration data
    auto rd = std::random_device{};
    auto rng = std::default_random_engine{rd()};
    std::shuffle(std::begin(m_imgPaths), std::end(m_imgPaths), rng);
}
int32_t Int8EntropyCalibrator2::getBatchSize() const noexcept {
    // Return the batch size
    return m_batchSize;
}
bool Int8EntropyCalibrator2::getBatch(void** bindings, const char** names, int32_t nbBindings) noexcept {
    // This method reads a batch of images into GPU memory and places the pointer to that GPU memory in the bindings variable.
    if (m_imgIdx + m_batchSize > static_cast<int>(m_imgPaths.size())) {
        // There are not enough images left to satisfy an entire batch
        return false;
    }

    // Read the calibration images into memory for the current batch
    std::vector<cv::cuda::GpuMat> inputImgs;
    for (int i = m_imgIdx; i < m_imgIdx + m_batchSize; i++) {
        std::cout << "Reading image " << i << ": " << m_imgPaths[i] << std::endl;
        auto cpuImg = cv::imread(m_imgPaths[i]);
        if (cpuImg.empty()) {
            std::cout << "Fatal error: Unable to read image at path: " << m_imgPaths[i] << std::endl;
            return false;
        }

        cv::cuda::GpuMat gpuImg;
        gpuImg.upload(cpuImg);
        cv::cuda::cvtColor(gpuImg, gpuImg, cv::COLOR_BGR2RGB);

        // TODO: Define any preprocessing code here, such as resizing.
        // In this example, we assume the calibration images are already of the correct size.
        inputImgs.emplace_back(std::move(gpuImg));
    }

    // Convert the batch from NHWC to NCHW.
    // Also apply normalization, scaling, and mean subtraction.
    auto mfloat = Engine::blobFromGpuMats(inputImgs, m_subVals, m_divVals, m_normalize);
    auto* dataPointer = mfloat.ptr<void>();

    // Copy the GPU buffer to the member variable so that it persists
    checkCudaErrorCode(cudaMemcpyAsync(m_deviceInput, dataPointer, m_inputCount * sizeof(float), cudaMemcpyDeviceToDevice));

    m_imgIdx += m_batchSize;
    if (std::string(names[0]) != m_inputBlobName) {
        std::cout << "Error: Incorrect input name provided!" << std::endl;
        return false;
    }
    bindings[0] = m_deviceInput;
    return true;
}
const void* Int8EntropyCalibrator2::readCalibrationCache(size_t& length) noexcept {
    std::cout << "Searching for calibration cache: " << m_calibTableName << std::endl;
    m_calibCache.clear();

    std::ifstream input(m_calibTableName, std::ios::binary);
    input >> std::noskipws;
    if (m_readCache && input.good()) {
        std::cout << "Reading calibration cache: " << m_calibTableName << std::endl;
        std::copy(std::istream_iterator<char>(input), std::istream_iterator<char>(), std::back_inserter(m_calibCache));
    }
    length = m_calibCache.size();
    return length ? m_calibCache.data() : nullptr;
}

void Int8EntropyCalibrator2::writeCalibrationCache(const void* ptr, std::size_t length) noexcept {
    std::cout << "Writing calib cache: " << m_calibTableName << " Size: " << length << " bytes" << std::endl;
    std::ofstream output(m_calibTableName, std::ios::binary);
    output.write(reinterpret_cast<const char*>(ptr), length);
}

Int8EntropyCalibrator2::~Int8EntropyCalibrator2() {
    checkCudaErrorCode(cudaFree(m_deviceInput));
}
Once again, I understand the example above is not complete (e.g. the definitions of checkCudaErrorCode and blobFromGpuMats are not shown), but again I plead with you to look at the GitHub repo before dismissing this question. The implementation above is here (note you will need to check out the int8 branch).
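For context, checkCudaErrorCode is just a thin wrapper that throws when a CUDA runtime call fails. A minimal sketch of the idea (the real definition lives in the repo, so treat this as an approximation):

#include <cuda_runtime.h>
#include <stdexcept>
#include <string>

// Throws a descriptive exception if a CUDA runtime call did not return cudaSuccess
inline void checkCudaErrorCode(cudaError_t code) {
    if (code != cudaSuccess) {
        throw std::runtime_error("CUDA operation failed with code " + std::to_string(static_cast<int>(code)) +
                                 " (" + cudaGetErrorString(code) + ")");
    }
}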
What I find is that the feature vector produced when running int8 inference is very different from that generated using FP16.
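To make "very different" concrete, the two vectors can be compared with cosine similarity using a small standalone helper like the one below (this is just for illustration; it is not part of the repo):

#include <cmath>
#include <vector>

// Returns the cosine similarity between two feature vectors of equal length
float cosineSimilarity(const std::vector<float>& a, const std::vector<float>& b) {
    float dot = 0.f, normA = 0.f, normB = 0.f;
    for (size_t i = 0; i < a.size(); ++i) {
        dot += a[i] * b[i];
        normA += a[i] * a[i];
        normB += b[i] * b[i];
    }
    return dot / (std::sqrt(normA) * std::sqrt(normB) + 1e-12f);
}

For a well-calibrated INT8 engine I would expect this value to stay close to 1.0 relative to the FP16 output.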
Here are the steps to reproduce for yourself:
1. Navigate to the GitHub repo, clone recursively, check out the int8 branch, install the dependencies listed in the readme, and compile.
2. Follow the Sanity check section of the readme to obtain the arcface model.
3. Run the executable and provide the path to the arcface model. It should generate the following feature vector. This is the FP16 feature vector:
-0.050293 -0.0993042 0.181152 0.144531 0.222656 0.217529 -0.290283 -0.0638428 0.234375 -0.176636 ...
4. Navigate to this line and change it from Precision::FP16 to Precision::INT8.
5. Download and extract the calibration data, available here.
6. Provide the path to the calibration data to the Engine::build method here.
7. Recompile and run. The resulting feature vector this time is:
-0.175003 -0.00527599 -0.128431 -0.147636 0.278055 0.0584708 -0.083089 -0.0100119 -0.185134 0.0172769 ...
As can be seen, the int8 feature vector is quite different from the FP16 feature vector. Any thoughts on where I'm going wrong? There doesn't seem to be much documentation or sample code on int8 calibration.