
I have been trying to implement naive matrix multiplication in SYCL, but I keep running into a segmentation fault once the kernel starts. My code is as follows -

class naive_MatMul_kernel;
class sharedMatrixMultiplication_kernel;
typedef cl::sycl::buffer<float, 1> sycl_buffer;


void naiveMatrixMultiplication(sycl_buffer MatA, sycl_buffer MatB, sycl_buffer result, size_t M, size_t N, size_t K,
                               queue deviceQueue, int numThreads){

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout<<"Starting Matrix Multiplication"<<std::endl;
    nd_range<2> launchParams = nd_range<2>(cl::sycl::range<2>(M / numThreads + 1, K / numThreads + 1),
                                           cl::sycl::range<2>(numThreads, numThreads));

    deviceQueue.submit([&MatA, &MatB, &result, M, N, K, launchParams](handler& cgh){

        auto MatA_accessor = MatA.get_access<access::mode::read>(cgh);
        auto MatB_accessor = MatB.get_access<access::mode::read>(cgh);
        auto result_accessor = result.get_access<access::mode::read_write>(cgh);


        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem){

            auto column_index = ndItem.get_group(1) * ndItem.get_local_range(1) + ndItem.get_local_id(1);
            auto row_index = ndItem.get_group(0) * ndItem.get_local_range(0) + ndItem.get_local_id(0);

            if(row_index < M && column_index < K){
                float sum = 0.0f;
                for (int i = 0; i < N; i++) {
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[ i * N + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout<<"Done with Matmul"<<std::endl;
}
 

int main() {

    size_t M  = 512;
    size_t N = 512;
    size_t K = 512;

    auto matA = (float*) malloc(M * N * sizeof(float ));
    auto matB = (float*) malloc(N * K * sizeof(float ));
    auto result =  (float*) malloc(M * K * sizeof(float ));

    for (int i=0; i< M*N; i++)
         matA[i] = 2.0f;
    for (int i=0; i< N*K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M*K; ++i)
        result[i] = 69.0f;

    queue Queue;

    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<cl::sycl::info::device::max_work_group_size>();
    std::cout<<device.get_info<cl::sycl::info::device::name>()<<std::endl;
    auto thread_max  = int(std::sqrt(max_work_group_size));
    std::cout<<thread_max<<std::endl;


    buffer<float, 1> mata_buffer(matA, range<1>(M * N * sizeof(float )));
    buffer<float, 1> matb_buffer(matB, range<1>(N * K * sizeof(float )));
    buffer<float, 1> result_buffer(result, range<1>(M * K * sizeof(float )));

    auto mata_shared = std::make_shared<buffer<float, 1>>(mata_buffer);
    auto matb_shared = std::make_shared<buffer<float, 1>>(matb_buffer);
    auto result_shared = std::make_shared<buffer<float, 1>>(result_buffer);

    naiveMatrixMultiplication(mata_buffer, matb_buffer, result_buffer, M, N, K, Queue, thread_max);

    Queue.submit([result_shared, result](handler& cgh){
       auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
       cgh.copy(resultAccessor, result);
    });
    Queue.wait();

    std::cout<<"Here";

    for(int i=0; i<100; i++)
        std::cout<<result[i]<<"  ";
    std::cout<<std::endl;

}



and the output is as follows -

Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
Segmentation fault (core dumped)

I am unable to figure out where the segmentation fault is originating. Any help is appreciated.

Thanks in Advance

Edit - after passing -g as a compiler flag to get debugging symbols, the output is as follows -

Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'
Aborted (core dumped)

and here is the output from running it under GDB -

For help, type "help".
Type "apropos word" to search for commands related to "word"...
Reading symbols from ./computecpp_test...
(gdb) r
Starting program: /home/atharva/CLionProjects/computecpp_test/computecpp_test 
[Thread debugging using libthread_db enabled]
Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1".
[New Thread 0x7ffff7066700 (LWP 18128)]
[New Thread 0x7ffff62e5700 (LWP 18133)]
Intel(R) Gen9 HD Graphics NEO
16
Starting Matrix Multiplication
terminate called after throwing an instance of 'cl::sycl::invalid_object_error'

Thread 1 "computecpp_test" received signal SIGABRT, Aborted.
__GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
50      ../sysdeps/unix/sysv/linux/raise.c: No such file or directory.
(gdb) 

and here is my CMakeLists.txt, just for reference, so you know the compiler flags being passed -

cmake_minimum_required(VERSION 3.17)
project(computecpp_test)

set(CMAKE_CXX_COMPILER /home/atharva/ComputeCPP/computeCPP/bin/compute++)
set(CMAKE_CXX_FLAGS -sycl-driver)
set(CMAKE_CXX_FLAGS -g)

set(CMAKE_MODULE_PATH /home/atharva/computecpp-sdk/cmake/Modules/)
#include(FindComputeCpp)
find_package(ComputeCpp)

include_directories($(COMPUTECPP_INCLUDE_DIRECTORY))

add_executable(computecpp_test main.cpp)
target_link_libraries(computecpp_test PUBLIC ComputeCpp::ComputeCpp)

UPDATE - During debugging, I changed all the indices to 0, but the segmentation fault (invalid object error if the -g compiler flag is used) is still thrown, which leads me to believe that data access is not the issue but something else is.

The backtrace is as follows -

#0  __GI_raise (sig=sig@entry=6) at ../sysdeps/unix/sysv/linux/raise.c:50
#1  0x00007ffff73c8859 in __GI_abort () at abort.c:79
#2  0x00007ffff779d911 in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#3  0x00007ffff77a938c in ?? () from /lib/x86_64-linux-gnu/libstdc++.so.6
#4  0x00007ffff77a93f7 in std::terminate() () from /lib/x86_64-linux-gnu/libstdc++.so.6
#5  0x00007ffff77a96a9 in __cxa_throw () from /lib/x86_64-linux-gnu/libstdc++.so.6
#6  0x00007ffff7c63d61 in void cl::sycl::detail::handle_sycl_log<cl::sycl::invalid_object_error>(std::unique_ptr<cl::sycl::detail::sycl_log, std::default_delete<cl::sycl::detail::sycl_log> >&&) ()
   from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#7  0x00007ffff7c5d0bd in cl::sycl::detail::trigger_sycl_log(cl::sycl::log_type, char const*, int, int, cl::sycl::detail::cpp_error_code, cl::sycl::detail::context const*, char const*) ()
   from /home/atharva/ComputeCPP/computeCPP/lib/libComputeCpp.so
#8  0x000000000040ab25 in cl::sycl::program::create_program_for_kernel<naive_MatMul_kernel> (c=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/program.h:510
#9  0x000000000040552b in cl::sycl::handler::parallel_for_impl<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1}>(cl::sycl::detail::nd_range_base const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1} const&, int) (this=0x6b1d40, ndRange=..., functor=..., dimensions=2)
    at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:423
#10 0x0000000000405485 in cl::sycl::handler::parallel_for<naive_MatMul_kernel, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1}, 2>(cl::sycl::nd_range<2> const&, naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const::{lambda(cl::sycl::nd_item<2>)#1} const&) (this=0x6b1d40, ndRange=..., functor=...)
    at /home/atharva/ComputeCPP/computeCPP/include/SYCL/apis.h:471
#11 0x000000000040536e in naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0::operator()(cl::sycl::handler&) const (
    this=0x7fffffffd500, cgh=...) at main.cpp:49
#12 0x000000000040518f in cl::sycl::detail::command_group::submit_handler<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0, std::shared_ptr<cl::sycl::detail::queue> const&, cl::sycl::detail::standard_handler_tag) (this=0x7fffffffd738, cgf=..., fallbackQueue=std::shared_ptr<class cl::sycl::detail::queue> (empty) = {...}) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/command_group.h:179
#13 0x000000000040391f in cl::sycl::queue::submit<naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0>(naiveMatrixMultiplication(cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, cl::sycl::buffer<float, 1, cl::sycl::detail::aligned_mem::aligned_allocator>, unsigned long, unsigned long, unsigned long, cl::sycl::queue, int)::$_0) (this=0x7fffffffdaa8, cgf=...) at /home/atharva/ComputeCPP/computeCPP/include/SYCL/queue.h:519
#14 0x00000000004037bb in naiveMatrixMultiplication (MatA=..., MatB=..., result=..., M=512, N=512, K=512, deviceQueue=..., numThreads=16) at main.cpp:42
#15 0x0000000000404adb in main () at main.cpp:220

Essentially, it stops at this code in program.h -

      COMPUTECPP_CL_ERROR_CODE_MSG(
          CL_SUCCESS, detail::cpp_error_code::KERNEL_NOT_FOUND_ERROR,
          c.get_impl().get(),
          "Unable to retrieve kernel function, is integration header included?")
    }

Apparently, it is not able to retrieve the kernel function.

  • I'd recommend running the program using the gdb debugger and getting a backtrace. That should give you some hints about where the segmentation fault is happening. Could you add the information from that? – Rod Burns Jun 25 '21 at 11:00
  • Thank you for your reply @RodBurns. I did run the program under gdb on my side but it wasn't really helpful. I am using Codeplay's compiler, could you recommend the correct flags to use to run it under gdb, I suspect that's why GDB wasn't really helpful in the first try – Atharva Dubey Jun 25 '21 at 11:20
  • I have added the GDB output as suggested. Interestingly, after passing -g as a compiler flag the error went from segmentation fault to invalid object error, and gdb says that some file is missing – Atharva Dubey Jun 25 '21 at 11:33
  • Might be helpful to get the backtrace using the bt command. – Rod Burns Jun 25 '21 at 18:08
  • Also what is your environment? E.g. Linux or Windows. Windows in debug mode is failing with ComputeCpp at the moment – Rod Burns Jun 25 '21 at 18:10
  • I am using Ubuntu 20.04 with ComputeCpp version 2.3.0. – Atharva Dubey Jun 25 '21 at 18:20
  • @RodBurns I have added the backtrace, Thanks for your time – Atharva Dubey Jun 26 '21 at 04:26
  • I think the queue is not valid. Do you need to pass the queue by reference to the function perhaps? I'd be inclined to have it as a member variable to avoid passing it around. – Rod Burns Jun 26 '21 at 10:00
  • @RodBurns, passing the queue parameter is not the issue. That had occurred to me and wrote a function which only accepted the array pointers, and the buffers and queue are created in that function. Still I get the error message - `Error: [ComputeCpp:RT0102] The requested kernel name could not be found (Unable to retrieve kernel function, is integration header included? ) ` Irrespective of the queue. Also I have passed queues are parameter before and never ran into any problem – Atharva Dubey Jun 26 '21 at 13:32

1 Answer


Here are a few issues I've found in your code:

  1. The following:
 Queue.submit([result_shared, result](handler& cgh){
   auto resultAccessor = result_shared->get_access<access::mode::read>(cgh);
   cgh.copy(resultAccessor, result);
 });
 Queue.wait(); 

is unnecessary, as sycl::buffers are meant to do the synchronization for you. You are guaranteed that memory has been copied back to the host once the buffer is destroyed (otherwise it's in an undefined state, I believe).
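
A minimal sketch of relying on that behaviour (the function and variable names here are made up for illustration, they are not taken from your code):

#include <CL/sycl.hpp>
#include <vector>
using namespace cl::sycl;

void scale_on_device(queue& q, std::vector<float>& data) {
    {   // buffer scope: the buffer borrows data.data() for its lifetime
        buffer<float, 1> buf(data.data(), range<1>(data.size()));
        q.submit([&](handler& cgh) {
            auto acc = buf.get_access<access::mode::read_write>(cgh);
            cgh.parallel_for<class scale_kernel>(range<1>(data.size()),
                                                 [=](id<1> i) { acc[i] *= 2.0f; });
        });
    }   // buffer destroyed here: this blocks until the result is written back to `data`
    // no explicit copy or wait() is needed before reading `data` on the host
}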

  2. You've declared your buffers as buffer<float, 1>, which means the SYCL buffer already knows the type of the underlying data. When constructing a buffer you only need to pass the number of elements, NOT its size in bytes. This is why your code crashes when submitting the kernel (this is where the implicit copy to the device happens).

Just write:

buffer<float, 1> mata_buffer(matA, range<1>(M * N));
buffer<float, 1> matb_buffer(matB, range<1>(N * K));
buffer<float, 1> result_buffer(result, range<1>(M * K));
  3. It turns out that the default queue you get from queue Queue; is not necessarily the host device. This behaviour allows you, on some implementations, to change the device you're running on using environment variables. On my implementation, queue Queue; returns a GPU, and your original code fails (as it needs to do the mentioned copy). But when running on the host device with queue Queue{host_selector{}}; it does not fail, as the SYCL implementation I'm running does not (hopefully) perform a memcpy from the host to the host.
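
A minimal sketch of picking the device explicitly with the SYCL 1.2.1 selectors (as shipped with ComputeCpp), rather than relying on the default:

#include <CL/sycl.hpp>
#include <iostream>
using namespace cl::sycl;

int main() {
    queue default_q;                  // implementation-defined default device
    queue gpu_q{gpu_selector{}};      // throws if no GPU device is available
    queue host_q{host_selector{}};    // the host device is always available

    std::cout << default_q.get_device().get_info<info::device::name>() << "\n";
    std::cout << gpu_q.get_device().get_info<info::device::name>() << "\n";
    std::cout << host_q.get_device().get_info<info::device::name>() << "\n";
}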

  4. You're using max_work_group_size as if it were guaranteed to be the real work group size. It is not; it's just a hint and can be literally anything from 0 to 2**64-1. Consider doing some bounds checking.
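
For example, a sketch of such a check (pick_tile_size is a hypothetical helper, not part of your code), clamping the tile to both the total work-group limit and the per-dimension limits:

#include <CL/sycl.hpp>
#include <algorithm>
#include <cmath>
using namespace cl::sycl;

size_t pick_tile_size(const device& dev) {
    const size_t max_wg      = dev.get_info<info::device::max_work_group_size>();
    const id<3>  max_per_dim = dev.get_info<info::device::max_work_item_sizes>();

    // largest square tile that still fits in one work-group
    size_t tile = static_cast<size_t>(std::sqrt(static_cast<double>(max_wg)));
    tile = std::min({tile, max_per_dim[0], max_per_dim[1]});
    return std::max<size_t>(tile, 1);   // never return 0
}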

  5. You've mixed up the arguments in your nd_range<2>. The signature is:

sycl::nd_range<2>(sycl::range<2> globalSize, sycl::range<2> localSize);

Each dimension of globalSize must be a multiple of the corresponding dimension of localSize.

So you should do

auto local_range = sycl::range<2>(numThreads, numThreads);
auto global_range = sycl::range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
sycl::nd_range<2> launchParams = nd_range<2>(global_range, local_range);

The purpose of multiplying by local_range is to get the "real" global range your device will be working on, which can be a little bigger than what you expect. For example, with M = 512 and numThreads = 16 the global range becomes (512 / 16 + 1) * 16 = 528, which is why the bounds check inside the kernel is still needed.

Last remarks: I'm not really sure why you're wrapping your buffers in shared pointers. First, they are not "heavy constructs"; a buffer is a wrapper that does not hold the memory itself. You might have noticed it does not even take a device. Furthermore, accessing a single buffer from various places (the purpose of the shared pointer, I guess) could lead to UB.
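
A small sketch of what I mean, assuming the usual SYCL reference semantics for buffers (copying a buffer is a cheap handle copy and both copies refer to the same underlying allocation), so passing buffers by value already does what the shared_ptr was presumably for:

#include <CL/sycl.hpp>
#include <vector>
using namespace cl::sycl;

void takes_a_buffer(buffer<float, 1> buf) {   // by value: copies the handle, not the data
    // `buf` refers to the same underlying storage as the caller's buffer
}

int main() {
    std::vector<float> host(1024, 1.0f);
    buffer<float, 1> buf(host.data(), range<1>(host.size()));
    takes_a_buffer(buf);   // no deep copy of the 1024 floats happens here
}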

Finally you don't need to do the offset computing by hand, you can just use

row_index = ndItem.get_global_id(0);

With these suggestions, your code becomes:


#include <CL/sycl.hpp>
#include <cmath>
#include <cstdlib>
#include <iostream>

using namespace cl::sycl;

class naive_MatMul_kernel;

void naiveMatrixMultiplication(float* MatA, float* MatB, float* result, size_t M, size_t N, size_t K, queue deviceQueue, size_t numThreads) {

    /*
     * Naive Matrix Multiplication of MxN and NxK
     * */

    std::cout << "Starting Matrix Multiplication" << std::endl;

    buffer<float, 1> mata_buffer(MatA, range<1>(M * N));
    buffer<float, 1> matb_buffer(MatB, range<1>(N * K));
    buffer<float, 1> result_buffer(result, range<1>(M * K));

    auto local_range = range<2>(numThreads, numThreads);
    auto global_range = range<2>(M / numThreads + 1, K / numThreads + 1) * local_range;
    auto launchParams = nd_range<2>(global_range, local_range);

    deviceQueue.submit([&, M, N, K, launchParams](handler &cgh) {
        auto MatA_accessor = mata_buffer.get_access<access::mode::read>(cgh);
        auto MatB_accessor = matb_buffer.get_access<access::mode::read>(cgh);
        auto result_accessor = result_buffer.get_access<access::mode::write>(cgh);
        cgh.parallel_for<naive_MatMul_kernel>(launchParams, [MatA_accessor, MatB_accessor, result_accessor, M, N, K]
                (nd_item<2> ndItem) {

            auto column_index = ndItem.get_global_id(1);
            auto row_index = ndItem.get_global_id(0);

            if (row_index < M && column_index < K) {
                float sum = 0.0f;
                for (int i = 0; i < N; i++) {
                    // MatB is N x K (row-major), so its row stride is K
                    sum += MatA_accessor[N * row_index + i] * MatB_accessor[i * K + column_index];
                }
                result_accessor[K * row_index + column_index] = sum;
            }
        });
    });
    deviceQueue.wait();
    std::cout << "Done with Matmul" << std::endl;
}


int main() {
    size_t M = 512;
    size_t N = 512;
    size_t K = 512;
    auto matA = (float *) malloc(M * N * sizeof(float));
    auto matB = (float *) malloc(N * K * sizeof(float));
    auto result = (float *) malloc(M * K * sizeof(float));

    for (int i = 0; i < M * N; i++)
        matA[i] = 2.0f;
    for (int i = 0; i < N * K; i++)
        matB[i] = 2.0f;
    for (int i = 0; i < M * K; ++i)
        result[i] = 69.0f;

    queue Queue{gpu_selector{}};

    auto device = Queue.get_device();
    auto max_work_group_size = device.get_info<info::device::max_work_group_size>();
    std::cout << device.get_info<info::device::name>() << std::endl;
    auto thread_max = static_cast<size_t>(std::sqrt(max_work_group_size));
    std::cout << thread_max << std::endl;

    naiveMatrixMultiplication(matA, matB, result, M, N, K, Queue, thread_max);
    std::cout << "Here";

    for (int i = 0; i < 100; i++)
        std::cout << result[i] << "  ";
    std::cout << std::endl;
}

Edit: I'd add that there is a matrix multiplication sample written in SYCL on the computecpp-sdk repository (for further inspiration).
