I want to write a simple tiled convolution code. It is based on the Coursera lecture Heterogeneous Parallel Programming. The lecture provides a simple convolution code that uses the tiling method, but the code was not complete, so I filled in the blanks; the result is below.
The goal of this code is to calculate a 2D convolution.
Input dimension: (24 by 24),
Kernel size: (9 by 9),
Output dimension: (16 by 16).
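Since both the input and the kernel are filled with 1.0 in the code below, every element of the output should be 9 * 9 = 81.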
Also, in main, I added code that measures the computation time so that I can compare it with a CPU version.
The problem is that the result is different every time I run this code. I tried to find the cause for several days, but nothing worked. I also found similar code on a blog, and it has the same problem as mine. I do not know why the result changes from run to run. Some say it is due to a race condition, but I could not find anything about that (my best guess so far is at the bottom, after the code).
Here are example results of the convolution (16 by 16). This is one run:
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
0 0 0 0 0 0 0 0 0 0 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
And this is another run:
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 81 81 81 81 81 81
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
81 81 81 81 81 81 81 81 81 81 0 0 0 0 0 0
My device info: CUDA Driver = CUDART, CUDA Driver Version = 7.5, CUDA Runtime Version = 7.5, NumDevs = 1, Device0 = GeForce GT 630. I use Ubuntu 14.04.
Thanks in advance!
#include <stdio.h>
#include <cuda.h>
#include <time.h>

#define O_TILE_WIDTH 10
#define MASK_WIDTH   9
// each O_TILE_WIDTH x O_TILE_WIDTH output tile needs an input tile of
// (O_TILE_WIDTH + MASK_WIDTH - 1) x (O_TILE_WIDTH + MASK_WIDTH - 1)
#define I_TILE_WIDTH (O_TILE_WIDTH+MASK_WIDTH-1)

__global__ void Convolution2DBasicKernel(float *out, float *in, int in_height, int in_width, const float *__restrict__ mask, int output_dim)
{
    int tx = threadIdx.x;
    int ty = threadIdx.y;
    // each block computes one O_TILE_WIDTH x O_TILE_WIDTH output tile
    int row_o = blockIdx.y*O_TILE_WIDTH + ty;
    int col_o = blockIdx.x*O_TILE_WIDTH + tx;
    int row_i = row_o;
    int col_i = col_o;

    __shared__ float Ns[I_TILE_WIDTH][I_TILE_WIDTH];

    //////////////////// reading input data ///////////////////
    // load the input tile into shared memory, zero-padding outside the input
    if( (row_i>=0) && (row_i<in_height) && (col_i>=0) && (col_i<in_width) )
    {
        Ns[ty][tx] = in[row_i*in_width + col_i];
    }
    else
    {
        Ns[ty][tx] = 0.0f;
    }
    __syncthreads();

    //////////////////// calculating convol ///////////////////
    // only the first O_TILE_WIDTH x O_TILE_WIDTH threads compute a value
    float output = 0.0f;
    if( (tx<O_TILE_WIDTH) && (ty<O_TILE_WIDTH) )
    {
        for(int i=0; i<MASK_WIDTH; i++)
        {
            for(int j=0; j<MASK_WIDTH; j++)
            {
                output += Ns[ty+i][tx+j]*mask[i*MASK_WIDTH+j];
            }
        }
    }
    __syncthreads();

    // write the result back to global memory
    if( (row_o<output_dim) && (col_o<output_dim) )
    {
        out[row_o*output_dim + col_o] = output;
    }
}
int main()
{
    int input_dim  = 24;
    int kernel_dim = 9;
    int output_dim = 16;

    float *input  = new float[input_dim*input_dim];
    float *kernel = new float[kernel_dim*kernel_dim];
    float *output = new float[output_dim*output_dim];

    float *d_input;
    float *d_kernel;
    float *d_output;
    cudaMalloc(&d_input,  sizeof(float)*input_dim*input_dim);
    cudaMalloc(&d_kernel, sizeof(float)*kernel_dim*kernel_dim);
    cudaMalloc(&d_output, sizeof(float)*output_dim*output_dim);

    // fill input and kernel with ones, so every output element should be 81
    for(int i=0; i<input_dim*input_dim; i++)
    {
        input[i] = 1.0;
    }
    for(int i=0; i<kernel_dim*kernel_dim; i++)
    {
        kernel[i] = 1.0;
    }

    cudaMemcpy(d_input,  input,  sizeof(float)*input_dim*input_dim,   cudaMemcpyHostToDevice);
    cudaMemcpy(d_kernel, kernel, sizeof(float)*kernel_dim*kernel_dim, cudaMemcpyHostToDevice);

    // 18x18 threads per block (one input tile); 2x2 blocks cover the 16x16 output
    dim3 dimBlock(I_TILE_WIDTH, I_TILE_WIDTH, 1);
    dim3 dimGrid((output_dim-1)/O_TILE_WIDTH+1, (output_dim-1)/O_TILE_WIDTH+1, 1);

    clock_t begin, end;
    double time_spent;
    begin = clock();
    for(int iteration=0; iteration<1; iteration++) //100000
    {
        Convolution2DBasicKernel<<<dimGrid, dimBlock>>>(d_output, d_input, input_dim, input_dim, d_kernel, output_dim);
    }
    // note: kernel launches are asynchronous, so this only times the launches;
    // the cudaMemcpy below synchronizes before the result is read back
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("time: %f\n", time_spent);

    cudaMemcpy(output, d_output, sizeof(float)*output_dim*output_dim, cudaMemcpyDeviceToHost);

    for(int y=0; y<output_dim; y++)
    {
        for(int x=0; x<output_dim; x++)
            printf("%d\t", int(output[y*output_dim+x]));
        printf("\n");
    }
}
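Based on the race-condition comments, here is my current guess (not verified): every thread in the 18 by 18 block executes the final write as long as (row_o, col_o) is inside the 16 by 16 output, so threads with tx >= O_TILE_WIDTH or ty >= O_TILE_WIDTH, which skipped the computation, write their default output value of 0 to positions that an overlapping neighbor block writes 81 to, and whichever block happens to write last wins. If that is the cause, I think the write-back would need the same guard as the computation, something like this sketch:

    // my guess (unverified): restrict the write to the threads that actually
    // computed a value, so overlapping blocks cannot clobber each other's
    // results with the default 0.0f
    if( (tx<O_TILE_WIDTH) && (ty<O_TILE_WIDTH) && (row_o<output_dim) && (col_o<output_dim) )
    {
        out[row_o*output_dim + col_o] = output;
    }

Is this the right explanation, or is something else going on?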