I have a pycuda program here that reads in an image from the command line and saves a version back with the colors inverted:
import pycuda.autoinit
import pycuda.driver as device
from pycuda.compiler import SourceModule as cpp
import numpy as np
import sys
import cv2
modify_image = cpp("""
__global__ void modify_image(int pixelcount, unsigned char* inputimage, unsigned char* outputimage)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id >= pixelcount)
return;
outputimage[id] = 255 - inputimage[id];
}
""").get_function("modify_image")
print("Loading image")
image = cv2.imread(sys.argv[1], cv2.IMREAD_UNCHANGED).astype(np.uint8)
print("Processing image")
pixels = image.shape[0] * image.shape[1]
newchannels = []
for channel in cv2.split(image):
output = np.zeros_like(channel)
modify_image(
device.In(np.int32(pixels)),
device.In(channel),
device.Out(output),
block=(1024,1,1), grid=(pixels // 1024 + 1, 1))
newchannels.append(output)
finalimage = cv2.merge(newchannels)
print("Saving image")
cv2.imwrite("processed.png", finalimage)
print("Done")
It works perfectly fine, even on larger images. However, in trying to expand the functionality of the program, I came across a really strange issue wherein adding a second variable argument to the kernel causes the program to completely fail, simply saving a completely black image. The following code does not work;
import pycuda.autoinit
import pycuda.driver as device
from pycuda.compiler import SourceModule as cpp
import numpy as np
import sys
import cv2
modify_image = cpp("""
__global__ void modify_image(int pixelcount, int width, unsigned char* inputimage, unsigned char* outputimage)
{
int id = threadIdx.x + blockIdx.x * blockDim.x;
if (id >= pixelcount)
return;
outputimage[id] = 255 - inputimage[id];
}
""").get_function("modify_image")
print("Loading image")
image = cv2.imread(sys.argv[1], cv2.IMREAD_UNCHANGED).astype(np.uint8)
print("Processing image")
pixels = image.shape[0] * image.shape[1]
newchannels = []
for channel in cv2.split(image):
output = np.zeros_like(channel)
modify_image(
device.In(np.int32(pixels)),
device.In(np.int32(image.shape[0])),
device.In(channel),
device.Out(output),
block=(1024,1,1), grid=(pixels // 1024 + 1, 1))
newchannels.append(output)
finalimage = cv2.merge(newchannels)
print("Saving image")
cv2.imwrite("processed.png", finalimage)
print("Done")
where the only difference is on two lines, the kernel header and it's call. The actual code of the kernel itself is unchanged, and yet this small addition completely breaks the program. Neither the compiler nor interpreter throw any errors. I have no idea how to begin to debug it, and am thoroughly confused.