I am trying to rewrite the OpenCV dilate function to practice SIMD programming. For simplicity, only the non-separable case is considered. Much of the code follows the OpenCV implementation, yet the result shows that OpenCV is more than 10 times faster.
** Input **
-- Image size: [5472 x 3648]
-- Dilate structuring element:
[1, 1, 1, 1, 1, 1, 1;
1, 1, 1, 1, 1, 1, 1;
1, 1, 1, 1, 1, 1, 1;
1, 1, 1, 0, 1, 1, 1;
1, 1, 1, 1, 1, 1, 1;
1, 1, 1, 1, 1, 1, 1;
1, 1, 1, 1, 1, 1, 1]
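For reference, the element above can be built like this (a small sketch, not part of the timed code; the all-ones kernel with a zero center is not one of the standard getStructuringElement shapes, so it is constructed by hand):

Mat de = Mat::ones(7, 7, CV_8U);  // 7x7, all ones
de.at<uchar>(3, 3) = 0;           // zero out the center element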
** Result **
OpenCV: 0.043147 sec
My Function: 0.49147 sec
The code:

#include <opencv2/opencv.hpp>
#include <emmintrin.h>   // SSE2 intrinsics
#include <vector>

using namespace cv;
using namespace std;

Mat mydilate(const Mat& src, const Mat& de)
{
    int xb = de.cols / 2;
    int yb = de.rows / 2;

    // Zero-pad the source so every kernel offset stays inside the image.
    Mat img_whole = Mat::zeros(src.rows + 2 * yb, src.cols + 2 * xb, CV_8U);
    src.copyTo(img_whole(Rect(xb, yb, src.cols, src.rows)));

    // Collect the coordinates of the non-zero kernel elements.
    vector<Point> coords;
    for (int i = 0; i < de.rows; i++)
        for (int j = 0; j < de.cols; j++)
            if (de.ptr(i)[j] == 1)
                coords.push_back(Point(j, i));

    Mat result(src.rows, src.cols, CV_8U);
    int width = result.cols;
    int nz = (int)coords.size();
    vector<const uchar*> sptr(nz);

    for (int y = 0; y < result.rows; y++) {
        uchar* pdst = result.ptr(y);

        // One source pointer per kernel point, shifted for the current row.
        for (int k = 0; k < nz; k++)
            sptr[k] = img_whole.ptr(y + coords[k].y) + coords[k].x;
        const uchar** ssptr = &sptr[0];

        // Vector part: 16 pixels per iteration, running max over all kernel points.
        int x = 0;
        for (; x + 16 <= width; x += 16) {
            __m128i s0 = _mm_loadu_si128((const __m128i*)(ssptr[0] + x));
            for (int i = 1; i < nz; i++) {
                __m128i x0 = _mm_loadu_si128((const __m128i*)(ssptr[i] + x));
                s0 = _mm_max_epu8(s0, x0);
            }
            _mm_storeu_si128((__m128i*)(pdst + x), s0);
        }

        // Scalar tail for the remaining width % 16 columns.
        for (; x < width; x++) {
            uchar m = ssptr[0][x];
            for (int i = 1; i < nz; i++)
                m = std::max(m, ssptr[i][x]);
            pdst[x] = m;
        }
    }
    return result;
}
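A minimal harness for reproducing the comparison could look like the following (a sketch, not the exact code behind the numbers above; it assumes cv::getTickCount for timing and that cv::dilate's default border value behaves like the zero padding for CV_8U, so the two outputs can be compared directly):

#include <iostream>  // in addition to the headers above

int main()
{
    // Hypothetical input path; any large 8-bit grayscale image will do.
    Mat src = imread("input.png", IMREAD_GRAYSCALE);

    // Structuring element from the "Input" section.
    Mat de = Mat::ones(7, 7, CV_8U);
    de.at<uchar>(3, 3) = 0;

    Mat ref, mine;
    double t0 = (double)getTickCount();
    dilate(src, ref, de);                // OpenCV
    double t1 = (double)getTickCount();
    mine = mydilate(src, de);            // my function
    double t2 = (double)getTickCount();

    cout << "OpenCV:      " << (t1 - t0) / getTickFrequency() << " sec" << endl;
    cout << "My Function: " << (t2 - t1) / getTickFrequency() << " sec" << endl;

    // Sanity check: count pixels where the two results differ.
    Mat diff;
    absdiff(ref, mine, diff);
    cout << "Differing pixels: " << countNonZero(diff) << endl;
    return 0;
}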
What is the bottleneck of this function? I also tried to align the source pointer to a multiple of 16, but saw no gain in performance.
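The alignment attempt was roughly the following drop-in replacement for the inner x loop (a sketch of the idea, not the exact code; ssptr, pdst, nz, and width are as in mydilate, and only the first source pointer can be aligned, since the other kernel points are shifted by coords[k].x):

// Scalar prologue until ssptr[0] + x is 16-byte aligned.
int x = 0;
int prologue = (int)((16 - ((size_t)ssptr[0] & 15)) & 15);
for (; x < prologue && x < width; x++) {
    uchar m = ssptr[0][x];
    for (int i = 1; i < nz; i++)
        m = std::max(m, ssptr[i][x]);
    pdst[x] = m;
}

// Aligned load for the first kernel point, unaligned loads for the rest.
for (; x + 16 <= width; x += 16) {
    __m128i s0 = _mm_load_si128((const __m128i*)(ssptr[0] + x));
    for (int i = 1; i < nz; i++)
        s0 = _mm_max_epu8(s0, _mm_loadu_si128((const __m128i*)(ssptr[i] + x)));
    _mm_storeu_si128((__m128i*)(pdst + x), s0);
}

// Scalar tail for the remaining columns, same as in mydilate.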