v0.1 - Naive implementation
Here's my first, naive attempt at making this work:
// Sliding dot product of a small 3-D array Y over a larger 3-D array X.
//
// The thread with global index (i, j) -- i from the x launch dimension,
// j from the y launch dimension -- accumulates one output element:
//
//   out[j + i*outdims[1]] +=
//       sum over Yi < Ydims[0], Yj < Ydims[1], k < Ydims[2] of
//           X[k + (j+Yj)*Xdims[2] + (i+Yi)*Xdims[2]*Xdims[1]]
//         * Y[k +  Yj   *Ydims[2] +  Yi   *Ydims[2]*Ydims[1]]
//
// Parameters (all pointers are dereferenced inside the kernel):
//   out     - result array, indexed as j + i*outdims[1]. The kernel ADDS to
//             the existing contents, so the caller must zero it beforehand
//             to obtain the plain dot product.
//   outdims - two ints: extent of the output along i and along j.
//   X       - flattened input array; last index (k) varies fastest.
//   Xdims   - three ints; only Xdims[1] and Xdims[2] are read here.
//   Y       - flattened sliding-window array, same layout as X.
//   Ydims   - three ints: extents of Y.
//
// Threads with i >= outdims[0] or j >= outdims[1] do nothing, so the grid
// may overshoot the output size. Reads from X are NOT bounds-checked: the
// caller must guarantee that every index formed above lies inside X.
// NOTE(review): the indexing uses k < Ydims[2] with stride Xdims[2], which
// presumably assumes Xdims[2] == Ydims[2] -- confirm against the caller.
__global__ void sliding_dot(float *out, int *outdims, float *X, int *Xdims, float *Y, int *Ydims )
{
    const int i = threadIdx.x + blockDim.x * blockIdx.x;
    const int j = threadIdx.y + blockDim.y * blockIdx.y;

    // Threads that fall outside the output do no work.
    if (i >= outdims[0] || j >= outdims[1])
        return;

    // Read the extents once. Left as pointer loads inside the loops they
    // would have to be re-fetched after every store through `out`.
    const int outCols = outdims[1];
    const int yD0 = Ydims[0];
    const int yD1 = Ydims[1];
    const int yD2 = Ydims[2];
    const int xD1 = Xdims[1];
    const int xD2 = Xdims[2];

    // Accumulate in a register instead of doing a global-memory
    // read-modify-write on out[...] for every product.
    float total = 0.0f;
    for (int Yi = 0; Yi < yD0; Yi++)
    {
        for (int Yj = 0; Yj < yD1; Yj++)
        {
            // Offsets of the first element (k == 0) of the current rows.
            const int yRow = (Yi * yD1 + Yj) * yD2;
            const int xRow = ((i + Yi) * xD1 + (j + Yj)) * xD2;
            for (int k = 0; k < yD2; k++)
            {
                total += X[xRow + k] * Y[yRow + k];
            }
        }
    }

    // Single global update; keeps the original "add to out" behaviour.
    out[j + i * outCols] += total;
}
So far the results are less than desirable. With block size (32,32,1) and grid dimensions (p, q) chosen such that p*32 >= outdims[0] and q*32 >= outdims[1], the profiler reports:
method=[ sliding_dot ] gputime=[ 7013.280 ] cputime=[ 18.000 ] occupancy=[ 0.667 ]
method=[ sliding_dot ] gputime=[ 6945.184 ] cputime=[ 7.000 ] occupancy=[ 0.667 ]
method=[ sliding_dot ] gputime=[ 6990.816 ] cputime=[ 6.000 ] occupancy=[ 0.667 ]
method=[ sliding_dot ] gputime=[ 6931.648 ] cputime=[ 6.000 ] occupancy=[ 0.667 ]
v0.2 - texture<float,1>
I hope everybody is learning as much from this as I am! I followed @aland's suggestions and got a considerable speed-up:
// File-scope 1-D texture references with float elements. The dotconv kernel
// reads them with tex1Dfetch(), using flattened integer element indices.
// NOTE(review): the host code that binds device memory to these references
// is not shown here -- they must be bound before the kernel is launched.
texture<float,1> X;
texture<float,1> Y;
// Sliding dot product of Y over X, with both inputs read through the
// file-scope 1-D texture references X and Y.
//
// The thread at global (x, y) launch position (row, col) writes
//   out[col + row*outdims.y] =
//       sum over yi < Ydims.x, yj < Ydims.y, c < Ydims.z of
//           X[c + (col+yj)*Xdims.z + (row+yi)*Xdims.z*Xdims.y]
//         * Y[c +  yj     *Ydims.z +  yi     *Ydims.z*Ydims.y]
//
// Parameters:
//   out     - result array; each in-range thread overwrites one element.
//   outdims - output extents (.x along row, .y along col).
//   Xdims   - extents of X; only .y and .z are used for index strides.
//   Ydims   - extents of the sliding window Y.
//
// Threads outside outdims exit without touching out. Fetch indices into X
// are not range-checked here.
__global__ void dotconv(float *out, int2 outdims, int3 Xdims, int3 Ydims )
{
    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    const int col = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard: the launch grid may be larger than the output.
    if (row >= outdims.x || col >= outdims.y)
        return;

    float acc = 0.0f;
    for (int yi = 0; yi < Ydims.x; ++yi)
    {
        for (int yj = 0; yj < Ydims.y; ++yj)
        {
            // Flattened offsets of element c == 0 in the current rows.
            const int yBase = yj * Ydims.z + yi * Ydims.z * Ydims.y;
            const int xBase = (col + yj) * Xdims.z + (row + yi) * Xdims.z * Xdims.y;

            for (int c = 0; c < Ydims.z; ++c)
            {
                acc += tex1Dfetch(X, c + xBase) * tex1Dfetch(Y, c + yBase);
            }
        }
    }

    out[col + row * outdims.y] = acc;
}
But we're still not running as quickly as the CPU:
method=[ dotconv ] gputime=[ 2224.928 ] cputime=[ 24.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2222.592 ] cputime=[ 7.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2225.216 ] cputime=[ 10.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2222.752 ] cputime=[ 10.000 ] occupancy=[ 0.667 ]
v0.3 - texture<float,3>
// File-scope 3-D texture references with float elements, declared with
// cudaReadModeElementType. The dotconv kernel samples them with tex3D().
// NOTE(review): the host code that binds storage to these references and
// sets their addressing/filtering modes is not shown here.
texture<float,3,cudaReadModeElementType> X;
texture<float,3,cudaReadModeElementType> Y;
// Sliding dot product of Y over X, with both inputs sampled through the
// file-scope 3-D texture references X and Y.
//
// The thread at global (x, y) launch position (row, col) writes
//   out[col + row*outdims.y] =
//       sum over yi < Ydims.x, yj < Ydims.y, c < Ydims.z of
//           tex3D(X, c, col+yj, row+yi) * tex3D(Y, c, yj, yi)
//
// Parameters:
//   out     - result array; each in-range thread overwrites one element.
//   outdims - output extents (.x along row, .y along col).
//   Xdims   - not read by this version of the kernel.
//   Ydims   - extents of the sliding window Y (loop bounds).
//
// NOTE(review): integer coordinates are passed straight to tex3D(); this
// presumably relies on unnormalized coordinates with point sampling --
// confirm against the host-side texture setup.
__global__ void dotconv(float *out, int2 outdims, int3 Xdims, int3 Ydims )
{
    const int row = threadIdx.x + blockDim.x * blockIdx.x;
    const int col = threadIdx.y + blockDim.y * blockIdx.y;

    // Guard: the launch grid may be larger than the output.
    if (row >= outdims.x || col >= outdims.y)
        return;

    float acc = 0.0f;
    for (int yi = 0; yi < Ydims.x; ++yi)
    {
        const int xSlab = row + yi;   // third texture coordinate in X
        for (int yj = 0; yj < Ydims.y; ++yj)
        {
            const int xLine = col + yj;   // second texture coordinate in X
            for (int c = 0; c < Ydims.z; ++c)
            {
                acc += tex3D(X, c, xLine, xSlab) * tex3D(Y, c, yj, yi);
            }
        }
    }

    out[col + row * outdims.y] = acc;
}
This is actually a little slower than v0.2:
method=[ dotconv ] gputime=[ 2403.360 ] cputime=[ 35.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2392.160 ] cputime=[ 15.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2396.448 ] cputime=[ 15.000 ] occupancy=[ 0.667 ]
method=[ dotconv ] gputime=[ 2398.880 ] cputime=[ 16.000 ] occupancy=[ 0.667 ]
Thanks for your suggestions!