I'm writing a kernel using PyCUDA. My GPU device only supports compute capability 1.1 (arch sm_11) and so I can only use floats in my code. I've taken great effort to ensure I'm doing everything with floats, but despite that, there is a particular line in my code that keeps causing a compiler error.
The chunk of code is:
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
Here, idx()
is a __device__
helper function that returns a linear index based on pixel indices i
and j
, and it only works with integers. I use it throughout and it doesn't give errors anywhere else, so I strongly suspect it's not idx()
. The sqrt()
call is just from the standard C math functions which support floats. All of the arrays involved, x_gradient
, y_gradient
, and gradient_mag
are all float*
and they are part of the input to my function (i.e. declared in Python, then converted to device variables, etc.).
I've tried removing the extra cast to float in my code above, with no luck. I've also tried doing something completely stupid like this:
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j > 0 && j < im_width && i > 0 && i < im_height){
gradient_mag[idx(i,j)] = 3.0f; // also tried float(3.0) here
}
All of these variations give the same error:
pycuda.driver.CompileError: nvcc said it demoted types in source code it compiled--this is likely not what you want.
[command: nvcc --cubin -arch sm_11 -I/usr/local/lib/python2.7/dist-packages/pycuda-2011.1.2-py2.7-linux-x86_64.egg/pycuda/../include/pycuda kernel.cu]
[stderr:
ptxas /tmp/tmpxft_00004329_00000000-2_kernel.ptx, line 128; warning : Double is not supported. Demoting to float
]
Any ideas? I've debugged many errors in my code and was hoping to get it working tonight, but this has proved to be a bug that I cannot understand.
Added -- Here is a truncated version of the kernel that produces the same error above on my machine.
every_pixel_hog_kernel_source = \
"""
#include <math.h>
#include <stdio.h>
__device__ int idx(int ii, int jj){
return gridDim.x*blockDim.x*ii+jj;
}
__device__ int bin_number(float angle_val, int total_angles, int num_bins){
float angle1;
float min_dist;
float this_dist;
int bin_indx;
angle1 = 0.0;
min_dist = abs(angle_val - angle1);
bin_indx = 0;
for(int kk=1; kk < num_bins; kk++){
angle1 = angle1 + float(total_angles)/float(num_bins);
this_dist = abs(angle_val - angle1);
if(this_dist < min_dist){
min_dist = this_dist;
bin_indx = kk;
}
}
return bin_indx;
}
__device__ int hist_number(int ii, int jj){
int hist_num = 0;
if(jj >= 0 && jj < 11){
if(ii >= 0 && ii < 11){
hist_num = 0;
}
else if(ii >= 11 && ii < 22){
hist_num = 3;
}
else if(ii >= 22 && ii < 33){
hist_num = 6;
}
}
else if(jj >= 11 && jj < 22){
if(ii >= 0 && ii < 11){
hist_num = 1;
}
else if(ii >= 11 && ii < 22){
hist_num = 4;
}
else if(ii >= 22 && ii < 33){
hist_num = 7;
}
}
else if(jj >= 22 && jj < 33){
if(ii >= 0 && ii < 11){
hist_num = 2;
}
else if(ii >= 11 && ii < 22){
hist_num = 5;
}
else if(ii >= 22 && ii < 33){
hist_num = 8;
}
}
return hist_num;
}
__global__ void every_pixel_hog_kernel(float* input_image, int im_width, int im_height, float* gaussian_array, float* x_gradient, float* y_gradient, float* gradient_mag, float* angles, float* output_array)
{
/////
// Setup the thread indices and linear offset.
/////
int i = blockDim.y * blockIdx.y + threadIdx.y;
int j = blockDim.x * blockIdx.x + threadIdx.x;
int ang_limit = 180;
int ang_bins = 9;
float pi_val = 3.141592653589f; //91
/////
// Compute a Gaussian smoothing of the current pixel and save it into a new image array
// Use sync threads to make sure everyone does the Gaussian smoothing before moving on.
/////
if( j > 1 && i > 1 && j < im_width-2 && i < im_height-2 ){
// Hard-coded unit standard deviation 5-by-5 Gaussian smoothing filter.
gaussian_array[idx(i,j)] = float(1.0/273.0) *(
input_image[idx(i-2,j-2)] + float(4.0)*input_image[idx(i-2,j-1)] + float(7.0)*input_image[idx(i-2,j)] + float(4.0)*input_image[idx(i-2,j+1)] + input_image[idx(i-2,j+2)] +
float(4.0)*input_image[idx(i-1,j-2)] + float(16.0)*input_image[idx(i-1,j-1)] + float(26.0)*input_image[idx(i-1,j)] + float(16.0)*input_image[idx(i-1,j+1)] + float(4.0)*input_image[idx(i-1,j+2)] +
float(7.0)*input_image[idx(i,j-2)] + float(26.0)*input_image[idx(i,j-1)] + float(41.0)*input_image[idx(i,j)] + float(26.0)*input_image[idx(i,j+1)] + float(7.0)*input_image[idx(i,j+2)] +
float(4.0)*input_image[idx(i+1,j-2)] + float(16.0)*input_image[idx(i+1,j-1)] + float(26.0)*input_image[idx(i+1,j)] + float(16.0)*input_image[idx(i+1,j+1)] + float(4.0)*input_image[idx(i+1,j+2)] +
input_image[idx(i+2,j-2)] + float(4.0)*input_image[idx(i+2,j-1)] + float(7.0)*input_image[idx(i+2,j)] + float(4.0)*input_image[idx(i+2,j+1)] + input_image[idx(i+2,j+2)]);
}
__syncthreads();
/////
// Compute the simple x and y gradients of the image and store these into new images
// again using syncthreads before moving on.
/////
// X-gradient, ensure x is between 1 and width-1
if( j > 0 && j < im_width){
x_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i,j-1)]);
}
else if(j == 0){
x_gradient[idx(i,j)] = float(0.0);
}
// Y-gradient, ensure y is between 1 and height-1
if( i > 0 && i < im_height){
y_gradient[idx(i,j)] = float(input_image[idx(i,j)] - input_image[idx(i-1,j)]);
}
else if(i == 0){
y_gradient[idx(i,j)] = float(0.0);
}
__syncthreads();
// Gradient magnitude, so 1 <= x <= width, 1 <= y <= height.
if( j < im_width && i < im_height){
gradient_mag[idx(i,j)] = float(sqrt(x_gradient[idx(i,j)]*x_gradient[idx(i,j)] + y_gradient[idx(i,j)]*y_gradient[idx(i,j)]));
}
__syncthreads();
/////
// Compute the orientation angles
/////
if( j < im_width && i < im_height){
if(ang_limit == 360){
angles[idx(i,j)] = float((atan2(y_gradient[idx(i,j)],x_gradient[idx(i,j)])+pi_val)*float(180.0)/pi_val);
}
else{
angles[idx(i,j)] = float((atan( y_gradient[idx(i,j)]/x_gradient[idx(i,j)] )+(pi_val/float(2.0)))*float(180.0)/pi_val);
}
}
__syncthreads();
// Compute the HoG using the above arrays. Do so in a 3x3 grid, with 9 angle bins for each grid.
// forming an 81-vector and then write this 81 vector as a row in the large output array.
int top_bound, bot_bound, left_bound, right_bound, offset;
int window = 32;
if(i-window/2 > 0){
top_bound = i-window/2;
bot_bound = top_bound + window;
}
else{
top_bound = 0;
bot_bound = top_bound + window;
}
if(j-window/2 > 0){
left_bound = j-window/2;
right_bound = left_bound + window;
}
else{
left_bound = 0;
right_bound = left_bound + window;
}
if(bot_bound - im_height > 0){
offset = bot_bound - im_height;
top_bound = top_bound - offset;
bot_bound = bot_bound - offset;
}
if(right_bound - im_width > 0){
offset = right_bound - im_width;
right_bound = right_bound - offset;
left_bound = left_bound - offset;
}
int counter_i = 0;
int counter_j = 0;
int bin_indx, hist_indx, glob_col_indx, glob_row_indx;
int row_width = 81;
for(int pix_i = top_bound; pix_i < bot_bound; pix_i++){
for(int pix_j = left_bound; pix_j < right_bound; pix_j++){
bin_indx = bin_number(angles[idx(pix_i,pix_j)], ang_limit, ang_bins);
hist_indx = hist_number(counter_i,counter_j);
glob_col_indx = ang_bins*hist_indx + bin_indx;
glob_row_indx = idx(i,j);
output_array[glob_row_indx*row_width + glob_col_indx] = float(output_array[glob_row_indx*row_width + glob_col_indx] + float(gradient_mag[idx(pix_i,pix_j)]));
counter_j = counter_j + 1;
}
counter_i = counter_i + 1;
counter_j = 0;
}
}
"""