I am trying to convert PyCUDA code to PyOpenCL. It follows a previous, working version of NVIDIA GPU code. The code inverts a large number of 3x3 matrices.
Here is the working PyCUDA version:
$ cat t14.py
import numpy as np
# import matplotlib.pyplot as plt
import pycuda.driver as cuda
from pycuda.compiler import SourceModule
import pycuda.autoinit
# kernel
kernel = SourceModule("""
// peel off the next 4-bit index from the packed pattern
__device__ unsigned getoff(unsigned &off){
  unsigned ret = off & 0x0F;
  off >>= 4;
  return ret;
}
// in-place is acceptable i.e. out == in
// T = float or double only
const int block_size = 288;
typedef double T; // *** can set to float or double
__global__ void inv3x3(const T * __restrict__ in, T * __restrict__ out, const size_t n, const unsigned * __restrict__ pat){
  __shared__ T si[block_size];
  size_t idx = threadIdx.x+blockDim.x*blockIdx.x;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (threadIdx.x / 9)*9;  // base of this thread's 3x3 matrix in shared memory
  unsigned lane = threadIdx.x - sibase;   // cheaper modulo
  si[threadIdx.x] = det;
  __syncthreads();
  unsigned off = pat[lane];
  T a = si[sibase + getoff(off)];
  a *= si[sibase + getoff(off)];
  T b = si[sibase + getoff(off)];
  b *= si[sibase + getoff(off)];
  a -= b;  // 2x2 cofactor for this lane
  __syncthreads();
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  __syncthreads();
  // determinant by cofactor expansion along the first row
  det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}
""")
# host code
def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
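    # Each hex digit of hpat[lane] indexes one of the 9 matrix entries held in
    # shared memory; getoff() consumes one digit per call, so e.g. lane 0
    # (0x07584) computes a = m[4]*m[8] - m[5]*m[7], the first entry of the adjugate.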
    # Convert parameters into numpy arrays
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n*9), dtype=np.float64)
    # Get kernel function
    matinv3x3 = kernel.get_function("inv3x3")
    # Define block, grid and compute
    blockDim = (288,1,1) # do not change: 288 threads = 32 matrices x 9 threads
    gridDim = ((n//32)+1,1,1) # integer division (also correct under Python 3)
    # Kernel function
    matinv3x3(
        cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
        block=blockDim, grid=gridDim)
    return output
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
$ python t14.py
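For reference, this prints the same two inverses shown in the Update below.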
And here below is my attempt at an equivalent PyOpenCL version:
$ cat gpu_opencl.py
from __future__ import absolute_import, print_function
import numpy as np
import pyopencl as cl
# Initialization
n = 2
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
a_in = np.array(n*9).astype(np.float64)
res_np = np.zeros(n*9).astype(np.float64)
mf = cl.mem_flags
a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a_in)
res_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=res_np)
# kernel
prg = cl.Program(ctx, """
unsigned getoff(unsigned &off){
unsigned ret = off & 0x0F;
off >>= 4;
return ret;
}
// OpenCL
// T = float or double only
typedef double T;
const int block_size = 288;
__kernel
void
inv3x3(
__global const T* restrict in,
__global T* restrict out,
const size_t n,
const unsigned* restrict pat
)
{
unsigned int tid = get_local_id(0);
unsigned int gid = get_global_id(0);
unsigned int groupid = get_group_id(0);
unsigned int localSize = get_local_size(0);
__local T si[block_size];
size_t idx = tid + groupid*localSize;
T det = 1;
if (idx < n*9)
det = in[idx];
unsigned sibase = (tid / 9)*9;
unsigned lane = tid - sibase; // cheaper modulo
si[tid] = det;
// Synchronize to make sure data is available for processing
barrier(CLK_LOCAL_MEM_FENCE);
unsigned off = pat[lane];
T a = si[sibase + getoff(off)];
a *= si[sibase + getoff(off)];
T b = si[sibase + getoff(off)];
b *= si[sibase + getoff(off)];
a -= b;
barrier(CLK_LOCAL_MEM_FENCE);
if (lane == 0) si[sibase+3] = a;
if (lane == 3) si[sibase+4] = a;
if (lane == 6) si[sibase+5] = a;
barrier(CLK_LOCAL_MEM_FENCE);
det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
if (idx < n*9)
out[idx] = a / det;
}
""").build()
def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
    hpatd = np.array(hpat, dtype=np.uint32)
    # Convert parameters into numpy array
    # *** change next line between float32 and float64 to match float or double
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # *** change next line between float32 and float64 to match float or double
    output = np.empty((n*9), dtype=np.float64)
    # Get kernel function
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_in.nbytes)
    prg.inv3x3(queue, a_g, res_g, np.uint64(n), htpatd)
    res_np = np.empty_like(a_in)
    cl.enqueue_copy(queue, res_np, res_g)
    return res_np
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
n = 2
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
$ python gpu_opencl.py
But when I execute the latter with PyOpenCL, I get the following build errors:
Traceback (most recent call last):
File "gpu_opencl_inv_3x3.py", line 69, in <module>
""").build()
File "/Users/fab/Library/Python/2.7/lib/python/site-packages/pyopencl/__init__.py", line 510, in build
options_bytes=options_bytes, source=self._source)
File "/Users/fab/Library/Python/2.7/lib/python/site-packages/pyopencl/__init__.py", line 554, in _build_and_catch_errors
raise err
pyopencl._cl.RuntimeError: clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE - clBuildProgram failed: BUILD_PROGRAM_FAILURE
Build on <pyopencl.Device 'AMD Radeon Pro Vega 20 Compute Engine' on 'Apple' at 0x1021d00>:
<program source>:5:26: error: expected ')'
unsigned getoff(unsigned &off){
^
<program source>:5:16: note: to match this '('
unsigned getoff(unsigned &off){
^
<program source>:5:10: warning: no previous prototype for function 'getoff'
unsigned getoff(unsigned &off){
^
<program source>:5:26: error: parameter name omitted
unsigned getoff(unsigned &off){
^
<program source>:6:18: error: use of undeclared identifier 'off'
unsigned ret = off & 0x0F;
^
<program source>:7:3: error: use of undeclared identifier 'off'
off >>= 4;
^
<program source>:15:11: error: global variables must have a constant address space qualifier
const int block_size = 288;
^
<program source>:23:26: error: kernel pointer arguments must have a global, local, or constant address space qualifier
const unsigned* restrict pat
^
I don't know exactly how to write the equivalent for PyOpenCL. Could someone spot an error in my PyOpenCL code? I am looking for clues to make this PyOpenCL code work like the PyCUDA version.
Update
Thanks to @talonmies's comment, I have finally found a solution that seems to work. The key changes are that OpenCL C is C, not C++, so getoff() cannot take a reference parameter (it now takes a pointer), and both global variables and kernel pointer arguments need address space qualifiers (hence the #define for block_size and the __global qualifiers). Here is the code:
import numpy as np
import pyopencl as cl
# Initialization
n = 2
# Numpy arrays (note: np.empty(n*9), not np.array(n*9), which would create a 0-d array)
a_np = np.empty(n*9).astype(np.float64)
res_np = np.zeros(n*9).astype(np.float64)
# Select a device
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
# Allocate memory on the device (re-allocated with the proper size inside gpuinv3x3 below)
mf = cl.mem_flags
res_g = cl.Buffer(ctx, mf.WRITE_ONLY, a_np.nbytes)
# kernel
prg = cl.Program(ctx, """
#define block_size 288
// OpenCL C is C, not C++: off is passed by pointer, not by reference
static unsigned getoff(unsigned* off){
  unsigned ret = *off & 0x0F;
  *off = *off >> 4;
  return ret;
}
// OpenCL
// T = float or double only
typedef double T;
//__constant int block_size = 288;
__kernel
void
inv3x3(
    __global const T* in,
    __global T* out,
    const size_t n,
    __global const unsigned* pat
    )
{
  unsigned tid = get_local_id(0);
  unsigned groupid = get_group_id(0);
  unsigned localSize = get_local_size(0);
  //Get kernel function
  //Define block and grid : NOT USED
  //blockDim = (288,1,1) # do not change
  //gridDim = ((n/32)+1,1,1)
  __local T si[block_size];
  size_t idx = tid + groupid*localSize;
  T det = 1;
  if (idx < n*9)
    det = in[idx];
  unsigned sibase = (tid / 9)*9;  // base of this work-item's 3x3 matrix in local memory
  unsigned lane = tid - sibase;   // cheaper modulo
  si[tid] = det;
  // Synchronize to make sure data is available for processing
  barrier(CLK_LOCAL_MEM_FENCE);
  unsigned off = pat[lane];
  T a = si[sibase + getoff(&off)];
  a *= si[sibase + getoff(&off)];
  T b = si[sibase + getoff(&off)];
  b *= si[sibase + getoff(&off)];
  a -= b;  // 2x2 cofactor for this lane
  barrier(CLK_LOCAL_MEM_FENCE);
  if (lane == 0) si[sibase+3] = a;
  if (lane == 3) si[sibase+4] = a;
  if (lane == 6) si[sibase+5] = a;
  barrier(CLK_LOCAL_MEM_FENCE);
  // determinant by cofactor expansion along the first row
  det = si[sibase]*si[sibase+3]+si[sibase+1]*si[sibase+4]+si[sibase+2]*si[sibase+5];
  if (idx < n*9)
    out[idx] = a / det;
}
""").build()
def gpuinv3x3(inp, n):
    # internal constants not to be modified
    hpat = (0x07584, 0x08172, 0x04251, 0x08365, 0x06280, 0x05032, 0x06473, 0x07061, 0x03140)
    # Convert parameters into numpy arrays
    inpd = np.array(inp, dtype=np.float64)
    hpatd = np.array(hpat, dtype=np.uint32)
    # Create device buffers
    a_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=inpd)
    res_g = cl.Buffer(ctx, mf.WRITE_ONLY, inpd.nbytes)
    hpatd_g = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=hpatd)
    # Get kernel function
    my_knl = prg.inv3x3
    # NB: size_t is not a portable kernel argument type; declaring n as uint in
    # the kernel and using np.uint32 here would be safer
    my_knl.set_scalar_arg_dtypes([None, None, np.int32, None])
    # global size = inpd.shape = (n*9,) work-items, one per matrix element;
    # local size None lets the implementation choose
    my_knl(queue, inpd.shape, None, a_g, res_g, n, hpatd_g)
    # Copy the results from the device into the numpy array
    cl.enqueue_copy(queue, res_np, res_g)
    return res_np
inp = (1.0, 1.0, 1.0, 0.0, 0.0, 3.0, 1.0, 2.0, 2.0, 1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0)
result = gpuinv3x3(inp, n)
print(result.reshape(2,3,3))
That produces the two expected matrices:
[[[ 2.         -0.         -1.        ]
  [-1.         -0.33333333  1.        ]
  [-0.          0.33333333 -0.        ]]

 [[ 1.          0.          0.        ]
  [ 0.          1.          0.        ]
  [ 0.          0.          1.        ]]]
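(As a quick sanity check, the second input matrix is the 3x3 identity, which is correctly returned as its own inverse.)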
Now I am trying to understand the kernel launch parameters used in the PyCUDA version, i.e.:
# Global and local sizes
blockDim = (288,1,1)
gridDim = ((n/32)+1,1,1)
As you can see, I have not used these parameters in my working version above. How can I include them in the kernel call?
I tried the following in the call:
my_knl(queue, gridDim, blockDim, a_g, res_g, n, hpatd_g)
It seems that the right syntax is:
my_knl(queue, globalSize, localSize, a_g, res_g, n, hpatd_g)
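If I understand the mapping correctly, OpenCL's global size is CUDA's grid size multiplied elementwise by the block size, and the local size is the block size itself. Assuming that reading is right (globalSize and localSize below are just the placeholder names from the line above), a sketch of the equivalent launch would be:
# CUDA launch: block=(288,1,1), grid=((n//32)+1,1,1)
localSize = (288, 1, 1)                # = CUDA blockDim
gridDim = ((n // 32) + 1, 1, 1)        # = CUDA gridDim (32 matrices x 9 threads per block)
globalSize = tuple(g * b for g, b in zip(gridDim, localSize))  # total work-items
my_knl(queue, globalSize, localSize, a_g, res_g, n, hpatd_g)
The if (idx < n*9) guard in the kernel should keep the extra work-items of the rounded-up global size harmless.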
In the CUDA version, the call with these sizes is:
# Kernel function
matinv3x3 (
cuda.In(inpd), cuda.Out(output), np.uint64(n), cuda.In(hpatd),
block=blockDim, grid=gridDim)
How can I proceed?