I'm trying to apply filter 1x3, 3x1 repectively in 3D-structure(volume).
For example, if there is 20(cols) x 10(rows) x 10(depth) structure,
for(int depth = 0; depth < 10; depth++)
Apply image filter(depth);
Apply filter to 2d image(20x10) 10 times. every slice of image is different.
first, i allocate 3D structure like
// COLS = 450, ROWS = 375, MAX_DISPARITY = 60
cudaPitchedPtr volume;
cudaExtent volumeExtent = make_cudaExtent(COLS, ROWS, MAX_DISPARITY);
HANDLE_ERROR(cudaMalloc3D(&volume, volumeExtent ));
and memory set to zero for stable output. so far so good, until copy image into volume.
when applying 3x1 filter like below, it's computation time was 6 mSec.
Apply_3by1 << <ROWS, COLS, COLS>> > (volume, COLS, ROWS);
__global__ void Apply_3by1 (cudaPitchedPtr src, unsigned int COLS, unsigned int ROWS)
{
const unsigned int x = threadIdx.x;
const unsigned int y = blockIdx.x;
extern __shared__ unsigned char SharedMemory[];
for (int dispCnt = 0; dispCnt < MAX_DISPARITY; dispCnt++)
{
if (x < dispCnt) continue;//exception for my algorithm.
unsigned char dst_val = *GET_UCHAR_PTR_3D(src, x, y, dispCnt);
SharedMemory[x] = dst_val;
__syncthreads();
unsigned char left;
int leftIdx = x - 3;
if (leftIdx < 0)//index underflow
left = 0;
else
left = SharedMemory[leftIdx];
unsigned char right;//index overflow
int rightIdx = x + 3;
if (COLS < rightIdx)
right = 0;
else
right = SharedMemory[rightIdx];
*GET_UCHAR_PTR_3D(src, x, y, dispCnt) += left + right;
}
}
but when i apply vertical direction 1x3 filter, it's computation time was 46mSec.
Apply_1by3 << <COLS, ROWS, ROWS >> > (volume, COLS, ROWS);
__global__ void Apply_1by3 (cudaPitchedPtr src, unsigned int COLS, unsigned int ROWS)
{
const unsigned int x = threadIdx.x;
const unsigned int y = blockIdx.x;
extern __shared__ unsigned char SharedMemory[];
for (int dispCnt = 0; dispCnt < MAX_DISPARITY; dispCnt++)
{
unsigned char my_val = *GET_UCHAR_PTR_3D(src, y, x, dispCnt);
SharedMemory[x] = my_val;
__syncthreads();
if (y < dispCnt) continue;
int topIdx = x - 3;
unsigned char top_value;
if (topIdx < 0)
top_value = 0;
else
top_value = SharedMemory[topIdx];
int bottomIdx = x + 3;
unsigned char bottom_value;
if (ROWS <= bottomIdx)
bottom_value = 0;
else
bottom_value = SharedMemory[bottomIdx];
*GET_UCHAR_PTR_3D(src, y, x, dispCnt) += bottom_value + top_value;
}
}
I have no idea why vertical direction access is slower than horizontal access, almost 8 times. if you know why it's access time is different, please enlighten me.
My apology, i forgot to add
#define GET_UCHAR_PTR_3D(pptr, x, y, d) \
(unsigned char*)((char*)(pptr).ptr + (sizeof(unsigned char)* x) + ((pptr).pitch * y) + ((pptr).pitch * (pptr).ysize * d))