
Comparing the same batch of matrix multiplications in pyopencl and pycuda shows a clear difference in performance.

System:

Ubuntu 14.04 with a GeForce 920M
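
To verify that the OpenCL kernel runs on the GeForce and not on a CPU device, the available platforms and devices can be listed first (a minimal sketch using pyopencl's introspection API, not part of the benchmark):

import pyopencl as cl

# list every platform/device pair the OpenCL runtime exposes
for platform in cl.get_platforms():
    for device in platform.get_devices():
        print("%s | %s | %s" % (platform.name, device.name,
                                cl.device_type.to_string(device.type)))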

Pyopencl code:

#-*- coding: utf-8 -*-
import pyopencl as cl
import pyopencl.array
from jinja2 import Template
import time
import numpy as np

KERNEL = Template("""
    {{header}}

    #include <pyopencl-complex.h>

    __kernel
    void complex_mat_mul(__global const {{complex_type}} *a, __global const {{complex_type}} *b, __global {{complex_type}} *res)
    {
        int row = get_local_id(1);
        int col = get_local_id(0);

        int mat_id = get_group_id(0) * get_num_groups(0) + get_group_id(1);
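        // note: this linear id is only unique because the launch grid is square
        // (mats_count x mats_count); in general the stride should be get_num_groups(1)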

        //printf("mat_id: %d, row: %d, col: %d ----- ", mat_id, row, col);

        {{complex_type}} entry = 0;
        for (int e = 0; e < {{mat_dim}}; ++e) {
            entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] *  b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
        }
        res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
    }
""")

def get_ctx_queue(devices=[0]):
    """
    obtain context and queue for specified devices
    """
    platform         = cl.get_platforms()[0]
    platform_devices = platform.get_devices()

    ctx = cl.Context(devices=[platform_devices[x] for x in devices])
    return (ctx, cl.CommandQueue(ctx))

data_types = {
    'cfloat_t': np.complex64,
    'cdouble_t': np.complex128,
    'float': np.float32,
    'double': np.float64
}

def render_kernel(complex_type, real_type, mat_dim):
    header = ""
    if data_types[complex_type] == np.complex128:
        header = """
            #pragma OPENCL EXTENSION cl_khr_fp64 : enable
            #define PYOPENCL_DEFINE_CDOUBLE
        """
    templ = KERNEL.render(
        header=header,
        complex_type=complex_type,
        real_type=real_type,
        mat_dim=mat_dim,
    )
    print(templ)

    return templ

complex_type = 'cdouble_t'
real_type    = 'double'
mat_dim      = 25
mats_count   = 200 # total number of matrices = mats_count**2

ctx, queue = get_ctx_queue()

program = cl.Program(ctx, render_kernel(complex_type, real_type, mat_dim)).build()

mats_1 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
mats_2 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])

start = time.time()
numpy_result = np.array([np.dot(mats_1[i], mats_2[i]) for i in range(mats_count**2)])
print("numpy time: %.3f" % (time.time()-start))

a = cl.array.to_device(queue, mats_1)
b = cl.array.to_device(queue, mats_2)
c = cl.array.to_device(queue, np.zeros((mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type]))

start = time.time()
program.complex_mat_mul(queue, (mats_count*mat_dim, mats_count*mat_dim, 1), (mat_dim, mat_dim, 1), a.data,b.data,c.data)
queue.finish()
result = c.get()
print("opencl time: %.3f" % (time.time()-start))

assert np.allclose(numpy_result.flatten(), result.flatten(), atol=0), "FAIL opencl"
print("Success")

Pycuda code:

#-*- coding: utf-8 -*-
import pycuda.autoinit
import pycuda.driver as drv
from pycuda.compiler import SourceModule
from jinja2 import Template
import time
import numpy as np

KERNEL = Template("""
    #include <stdio.h>
    #include <pycuda-complex.hpp>

    typedef pycuda::complex<float> scmplx;
    typedef pycuda::complex<double> dcmplx;

    __global__ void complex_mat_mul(const {{complex_type}} *a, const {{complex_type}} *b, {{complex_type}} *res)
    {
        int row = threadIdx.y;
        int col = threadIdx.x;

        int mat_id = blockIdx.x * gridDim.x + blockIdx.y;
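        // note: this linear id is only unique because the grid is square;
        // in general the stride should be gridDim.y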

        //printf("mat_id: %d, row: %d, col: %d ----- ", mat_id, row, col);

        {{complex_type}} entry = 0;
        for (int e = 0; e < {{mat_dim}}; ++e) {
            entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] *  b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
        }
        res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
    }
""")

data_types = {
    'scmplx': np.complex64,
    'dcmplx': np.complex128,
    'float': np.float32,
    'double': np.float64
}

def render_kernel(complex_type, real_type, mat_dim, block, grid):
    templ = KERNEL.render(
        complex_type=complex_type,
        real_type=real_type,
        mat_dim=mat_dim,
        blockDim_x=block[0],
        blockDim_y=block[1]
    )
    print(templ)

    return templ

complex_type = 'dcmplx'
real_type    = 'double'
mat_dim      = 25
mats_count   = 200 # total number of matrices = mats_count**2

block = (mat_dim, mat_dim, 1)
grid  = (mats_count, mats_count)

program = SourceModule(render_kernel(complex_type, real_type, mat_dim, block, grid))

complex_mat_mul = program.get_function("complex_mat_mul")

mats_1 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
mats_2 = np.array(np.random.rand(mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])
result = np.zeros((mats_count**2, mat_dim, mat_dim), dtype=data_types[complex_type])

start = time.time()
numpy_result = np.array([np.dot(mats_1[i], mats_2[i]) for i in range(mats_count**2)])
print("numpy time: %.3f" % (time.time()-start))

a = drv.In(mats_1)
b = drv.In(mats_2)
c = drv.Out(result)

start = time.time()
complex_mat_mul(a, b, c, block=block, grid=grid)
print("cuda time: %.3f" % (time.time()-start))

assert np.allclose(numpy_result.flatten(), result.flatten()), "FAIL cuda"
print("Success")

In both single and double precision, pyopencl performs at least twice as fast. Changing the number of matrices doesn't change this outcome.

Both kernels are launched the same number of times, and I believe the timed sections of the benchmark are placed appropriately.
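
To rule out host-side timing artifacts, both kernels can also be timed on the device itself. A minimal sketch (not part of the original benchmark), assuming the objects from the two scripts above are available in one session:

import pyopencl as cl
import pyopencl.array
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

# pyopencl: a profiling-enabled queue attaches timestamps to the kernel event
pqueue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
a = cl.array.to_device(pqueue, mats_1)
b = cl.array.to_device(pqueue, mats_2)
c = cl.array.zeros(pqueue, mats_1.shape, mats_1.dtype)
evt = program.complex_mat_mul(pqueue, (mats_count*mat_dim, mats_count*mat_dim, 1),
                              (mat_dim, mat_dim, 1), a.data, b.data, c.data)
evt.wait()
# event timestamps are in nanoseconds
print("opencl kernel: %.3f s" % ((evt.profile.end - evt.profile.start) * 1e-9))

# pycuda: CUDA events bracket the launch; time_till() returns milliseconds
a_gpu, b_gpu = gpuarray.to_gpu(mats_1), gpuarray.to_gpu(mats_2)
c_gpu = gpuarray.zeros_like(a_gpu)
start_evt, end_evt = drv.Event(), drv.Event()
start_evt.record()
complex_mat_mul(a_gpu.gpudata, b_gpu.gpudata, c_gpu.gpudata, block=block, grid=grid)
end_evt.record()
end_evt.synchronize()
print("cuda kernel:   %.3f s" % (start_evt.time_till(end_evt) * 1e-3))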

What am I missing?

  • This might be a silly question, but you sure that the opencl kernel is running over the GPU and not over the CPU? – Patrick Trentin Mar 20 '16 at 12:47
  • I thought about that too, because when I increase the number of matrices I would expect no change in performance, since they should be executed in parallel. In reality, the more matrices, the slower it gets. I put this down to the increase in buffer size and the time it takes to transfer that buffer to the GPU. Also, `cl.get_platforms()[0].get_devices()` yields only the nvidia device. – Jesse Mar 20 '16 at 12:53
  • try just doubling (or 10x) the floating point operations loop and see whether there is a correspondingly larger decrease in performance with CUDA. If the floating point ops are the issue, you will find it easily. What drivers are you using? – Patrick Trentin Mar 20 '16 at 15:46
  • When I include the array.to_device and get() operations in pyopencl and the drv.In, drv.Out operations in pycuda, and print a small piece of the result before stopping the timer, I consistently get better results with pycuda. So the first performance test was not fair, since I excluded the buffer read and write operations. Also, pycuda does complex number multiplication the right way and does it fast! Pyopencl slows down when custom complex_mul functions are used (pyopencl 2014.1); see the sketch below. – Jesse Mar 24 '16 at 13:14
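
For reference, a hedged sketch of how the multiply-accumulate in the pyopencl template above could be written with the helper functions from pyopencl-complex.h (double precision assumed) instead of the overloaded `*`, which on a plain double2 multiplies component-wise:

# Hypothetical Jinja fragment for the pyopencl kernel above; cdouble_new,
# cdouble_mul and cdouble_add are provided by pyopencl-complex.h.
LOOP_BODY = """
    cdouble_t entry = cdouble_new(0, 0);
    for (int e = 0; e < {{mat_dim}}; ++e) {
        entry = cdouble_add(entry,
            cdouble_mul(a[mat_id*{{mat_dim}}*{{mat_dim}} + row*{{mat_dim}} + e],
                        b[mat_id*{{mat_dim}}*{{mat_dim}} + e*{{mat_dim}} + col]));
    }
"""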

1 Answer


Running more tests gave the following results.

Using the following kernel (the opencl version is equivalent except for how row, col and mat_id are determined):

int row = threadIdx.y;
int col = threadIdx.x;

int mat_id = blockIdx.x * gridDim.x + blockIdx.y;

for (int i = 0; i < 10; i++) {
    {{complex_type}} entry = 0;
    for (int e = 0; e < {{mat_dim}}; ++e) {
        entry += a[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + e] * b[mat_id*{{mat_dim}}*{{mat_dim}} + e * {{mat_dim}} + col];
    }
    res[mat_id*{{mat_dim}}*{{mat_dim}} + row * {{mat_dim}} + col] = entry;
}
  • Added the extra outer loop to execute ten times more floating point operations
  • Measuring the performance includes writing and reading the buffers on the GPU (compared to the actual execution time, those transfer operations are very fast; a sketch of this split timing follows this list)
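
A sketch of that timing scheme (pycuda variant; mats_1, mats_2, result, complex_mat_mul, block and grid as in the question's script), with the transfers measured separately:

import time
import pycuda.driver as drv

start = time.time()
a_gpu = drv.mem_alloc(mats_1.nbytes)
b_gpu = drv.mem_alloc(mats_2.nbytes)
c_gpu = drv.mem_alloc(result.nbytes)
drv.memcpy_htod(a_gpu, mats_1)
drv.memcpy_htod(b_gpu, mats_2)
upload = time.time() - start

start = time.time()
complex_mat_mul(a_gpu, b_gpu, c_gpu, block=block, grid=grid)
drv.Context.synchronize()          # kernel launches are asynchronous
kernel = time.time() - start

start = time.time()
drv.memcpy_dtoh(result, c_gpu)
download = time.time() - start

print("upload: %.3f s, kernel: %.3f s, download: %.3f s" % (upload, kernel, download))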

For different matrix dimensions (constant number of matrices: 22500):

[plot: runtime for varying matrix dimension]

For a different number of matrices (constant matrix dimension: 25):

[plot: runtime for varying number of matrices]

As discussed in the comments, I briefly got better results with pycuda when I included the initial buffer operations. But with the extra loop that increases the floating point work, the pycuda head start vanished. I am therefore leaving this question open, since we would expect pycuda to perform better.

  • There are still a number of unanswered questions: 1) what driver/video card/library are you using? Maybe there is a known issue. 2) did you try fixing both the memory/matrix size and number, and increasing only the number of F.P. operations on the same data, possibly playing nice with data locality? Let's put aside correctness for a minute and focus on F.P. operations only. 3) Is it somehow possible to get the compiled code of the two kernels? 4) Have you checked that the cuda kernel plays nice with Nvidia's caching system, and that OpenCL doesn't have an optimization in place in this regard? – Patrick Trentin Mar 25 '16 at 13:43
  • 5) Would your test-bench work out of the box? Maybe if you release it somewhere we can start a bounty and ask S.O.'s users to run tests on their platforms. At least we would understand if these are consistent results. – Patrick Trentin Mar 25 '16 at 13:47
  • I created a simpler example that again yields a different result: [code](https://gist.github.com/Jesse-jApps/fb6e0a9a83e35a39c3f6). I still have to find out why this behaviour occurs now. I am using the nvidia 352.63 driver. – Jesse Mar 27 '16 at 17:09