I am doing matrix multiplication in sycl, but having some problems. I am using 2 (4x4) Matrices for multiplication and on first iteration of for loop it works but on second iteration when i = 1 it works fine until C[11] = A[11]*B[15] but then it skips 1 multiplication and move forward. I know the problem why it skips but unfortunately i have been unable to change matrix B index properly. Kindly if someone can help i will greatly appreciate it. Thanks
Here is the code Matsize= 4, Blocksize = 4 also i know for loop will be equal to matsize it is 2 just to get clear idea of execution flow
{
range<1> dimensions(matSize * matSize);
const property_list props = { property::buffer::use_host_ptr() };
buffer<T> A_buf(MA, dimensions, props);
buffer<T> B_buf(MB, dimensions, props);
buffer<T> C_buf(MC, dimensions, props);
myQueue.submit([&](handler& cgh) {
auto A_ptr = A_buf.template get_access<access::mode::read>(cgh);
auto B_ptr = B_buf.template get_access<access::mode::read_write>(cgh);
auto C_ptr = C_buf.template get_access<access::mode::write>(cgh);
auto localRange = range<1>(blockSize* blockSize);
accessor<T, 1, access::mode::read_write, access::target::local>
C(matSize * matSize, cgh);
cgh.parallel_for<mxm_kernel>(
nd_range<2>(range<2>{matSize, matSize},
range<2>{blockSize, blockSize}),
[=](nd_item<2> item) {
const auto id_x = item.get_global_id(0);
const auto id_y = item.get_global_id(1);
const auto width = item.get_group_range(0) * item.get_local_range(0);
const auto index = id_x * width + id_y;
const auto index2 = id_y * width + id_x;
for (int i = 0; i < 2 ; i++) {
C[index] += A_ptr[index] * B_ptr[index2 + i ];
}
out << "C is!" << C[index] << sycl::endl;
item.barrier(cl::sycl::access::fence_space::local_space);
C_ptr[index] = C[index];
});
});
}