Segmentation fault (core dumped) in OpenCL

Question

I am very new to OpenCL, but I have been doing parallel programming for more than a year now. I am writing my first OpenCL program (matrix multiplication) and wrote the following code:

//#include<stdio.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <string.h>
#include <SDKCommon.hpp>
#include <SDKApplication.hpp>
#include <SDKCommandArgs.hpp>
#include <SDKFile.hpp>
#include <CL/cl.h>

#define MAX_SOURCE_SIZE (0x100000)
#define MATSIZE 16


void initmat(float *Aa,float *Bb,float *Cc,int row,int colrow,int col);

/* Number of elements in a rows x cols matrix; 0 when either dimension is
 * not positive. Computed in size_t so the product cannot overflow int. */
static size_t initmat_count(int rows, int cols)
{
    if (rows <= 0 || cols <= 0) {
        return 0;
    }
    return (size_t)rows * (size_t)cols;
}

/* Store `value` into the first `count` elements of `dst`; a NULL `dst` is skipped. */
static void initmat_fill(float *dst, size_t count, float value)
{
    size_t i;

    if (dst == NULL) {
        return;
    }
    for (i = 0; i < count; i++) {
        dst[i] = value;
    }
}

/*
 * Fill the three matrices of a product C = A * B with fixed test values.
 *
 *   Aa     - row x colrow elements, every element set to 1
 *   Bb     - colrow x col elements, every element set to 2
 *   Cc     - row x col elements,    every element set to 0
 *   row    - number of rows of A and C
 *   colrow - number of columns of A, which is also the number of rows of B
 *   col    - number of columns of B and C
 *
 * A matrix with a non-positive dimension is treated as empty and left
 * untouched. The previous version compared an unsigned loop index with the
 * signed product, so a negative dimension became a huge unsigned bound and
 * the loop wrote far outside the buffer.
 */
void initmat(float *Aa,float *Bb,float *Cc,int row,int colrow,int col)
{
    initmat_fill(Aa, initmat_count(row, colrow), 1.0f);
    initmat_fill(Bb, initmat_count(colrow, col), 2.0f);
    initmat_fill(Cc, initmat_count(row, col), 0.0f);
}


int main(void)
{
printf(" Here: 1 \n");
// Load the kernel source code into the array source_str
    FILE *fp;
    char *source_str;
    size_t source_size;

    fp = fopen("matmul.cl", "r");
    if (!fp) {
        fprintf(stderr, "Failed to load kernel.\n");
        exit(1);
    }
    source_str = (char*)malloc(MAX_SOURCE_SIZE);
    source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp);
    fclose( fp );
printf(" Here: 2 \n");
// matrix declaration
float *A;
float *B;
float *C;

// set dimesions
int Arow,AcolBrow,Bcol;

Arow=AcolBrow=Bcol=MATSIZE;

// no. of elements in matrix
int sizea, sizeb, sizec;

// Error code from opencl

int err;

// Setting up matrices
sizea= Arow*AcolBrow;
sizeb= AcolBrow*Bcol;
sizec= Arow*Bcol;

A = (float *) malloc(sizeof(float)*sizea);
B = (float *) malloc(sizeof(float)*sizeb);
C = (float *) malloc(sizeof(float)*sizec);
printf(" Here: 3 \n");
initmat(A,B,C,Arow,AcolBrow,Bcol);
// Displaying inputs

unsigned long int ii;

printf("Input A: \n");
for(ii=0;ii<sizea;ii++)
printf("%f  ",A[ii]);

printf("\n \n \n \n");


printf("Input B: \n");
for(ii=0;ii<sizeb;ii++)
printf("%f  ",B[ii]);
printf("\n \n \n");

// get platform id & device id

cl_uint numplatform;
cl_platform_id platformid=NULL;
cl_device_id deviceid=NULL;

err= clGetPlatformIDs(1,&platformid,&numplatform);
err=clGetDeviceIDs(platformid,CL_DEVICE_TYPE_GPU,1,&deviceid,NULL);

cl_context_properties properties[]= 
{ 
    CL_CONTEXT_PLATFORM, (cl_context_properties)platformid,0 
};

// create context
cl_context context= clCreateContext(properties,1,&deviceid,NULL,NULL,&err);

/* when more than one gpu is installed on the system than we make use of the approach as we stated in the  notes !! */
printf(" Here: 4 \n");
// create command queue

cl_command_queue queue = clCreateCommandQueue(context,deviceid,0,&err); // I have disabled profiling option


// Allocate buffer object for Ad,Bd,Cd

cl_mem Ad = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(cl_float)*sizea,NULL,NULL);
cl_mem Bd = clCreateBuffer(context,CL_MEM_READ_ONLY,sizeof(cl_float)*sizeb,NULL,NULL);
cl_mem Cd = clCreateBuffer(context,CL_MEM_WRITE_ONLY,sizeof(cl_float)*sizec,NULL,NULL);
printf(" Here: 5 \n");
// We are not explicitely making kernel. We are putting the kernel code here itself (see notes)


cl_program program= clCreateProgramWithSource(context,1,(const char **)&source_str, (const size_t *)&source_size,&err);
printf(" Here: 6 \n");
// Build program using program object just created

err = clBuildProgram(program,0,NULL,NULL,NULL,NULL);

if(err !=CL_SUCCESS)
{

size_t len;
char buffer[2048];
printf("ERROR: Failed to build executable \n ");
clGetProgramBuildInfo(program,deviceid,CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer , &len);
printf("%s \n",buffer);
//return FAILURE;

}
printf(" Here: 7 \n");
// Create kernel object

cl_kernel kernel = clCreateKernel(program,"matmul",NULL);
printf(" Here: 8 \n");
// set kernel argument values

err=0;
err= clSetKernelArg(kernel,0,sizeof(int),&Arow);
err|= clSetKernelArg(kernel,1,sizeof(int),&AcolBrow);
err|= clSetKernelArg(kernel,2,sizeof(int),&Bcol);
err|= clSetKernelArg(kernel,3,sizeof(cl_mem),&Ad);
err|= clSetKernelArg(kernel,4,sizeof(cl_mem),&Bd);
err|= clSetKernelArg(kernel,5,sizeof(cl_mem),&Cd);
printf(" Here: 9 \n");
// Write to device buffers. Ad=A and Bd=B   : Equivalent to CUDAmemcpy

err=clEnqueueWriteBuffer(queue,Ad,CL_TRUE,0,sizeof(cl_float)*sizea,A,0,NULL,NULL);
err=clEnqueueWriteBuffer(queue,Bd,CL_TRUE,0,sizeof(cl_float)*sizeb,B,0,NULL,NULL);
printf(" Here: 10 \n");
// since we have set the copy as synchronous we will be creating event
cl_event event;

// Execute the kernel over entire range of C matrix

size_t global[2];
size_t local[2];
cl_uint * ndim; // no. of dimension in ND range. 3rd parameter in kernel call signifies the dimension.

global[0]=(size_t)Arow;
global[1]=(size_t)Bcol;

* ndim=2; // because we want 2-D multiplication. Gives n

/* no local size declaration cause we are not making work groups ie blocks.We are just make making oneblock where everythread takes one element of A,B and computes C */
printf(" Here: 11 \n");
err = clEnqueueNDRangeKernel(queue,kernel,*ndim,NULL,global,NULL,0,NULL,&event); // the NULL position after global is for passing local dimension. In this case we don't have one.
clFinish(queue); // wait for kernel to finish before we begin copying the result back on host
printf(" Here: 12 \n");
//read back the result

err=clEnqueueReadBuffer(queue,Cd,CL_TRUE,0,sizeof(cl_float)*sizec,C,0,NULL,NULL);


// Checking whether the computations done are on CPU or GPU

cl_device_type dev_type;
clGetDeviceInfo(deviceid, CL_DEVICE_TYPE, sizeof(dev_type), &dev_type, NULL);
if (dev_type == CL_DEVICE_TYPE_GPU) {
    printf("Following code was complied on GPU ! \n \n \n \n");
}
else
    printf("Following code was complied on CPU ! \n \n \n \n");
printf(" Here: 13 \n \n \n");
// Displaying results
printf("Result is: \n");
for(ii=0;ii<sizec;ii++)
printf("%f  ",C[ii]);

printf("\n \n \n");

// free all memory
printf(" Here: 14 \n");
clReleaseProgram(program);
clReleaseKernel(kernel);
clReleaseMemObject(Ad);
clReleaseMemObject(Bd);
clReleaseMemObject(Cd);
clReleaseCommandQueue(queue);
clReleaseContext(context);
printf(" Here: 15 \n");
return 0;
printf(" Here: 16 \n");

}

My kernel code is as follows,

/*
 * Placeholder for the matrix-multiplication kernel: every work-item writes
 * the constant 3 into its own element of C, so the host can verify that the
 * launch and the read-back work. A and B are not read yet.
 *
 *   Mdim - rows of A and C      (host passes Arow as argument 0)
 *   Ndim - columns of A/rows of B (host passes AcolBrow as argument 1)
 *   Pdim - columns of B and C   (host passes Bcol as argument 2)
 *
 * The host launches a 2-D range of Arow x Bcol work-items, so dimension 0
 * indexes rows of C and dimension 1 indexes columns of C. C is stored
 * row-major with Pdim elements per row; the previous version bounded and
 * indexed with Ndim/Mdim, which is only correct when all three dimensions
 * are equal.
 */
__kernel void matmul(const int Mdim, const int Ndim,const int Pdim,__global float* A,__global float* B,__global float* C)
{
    const int i = get_global_id(0); /* row of C */
    const int j = get_global_id(1); /* column of C */

    /* Guard against work-items outside the matrix (logical AND, not bitwise). */
    if ((i < Mdim) && (j < Pdim)) {
        C[i * Pdim + j] = 3;
    }
}

I could compile it successfully, and it created a binary in the ../bin/x86/ folder. When I try to run it using ./matmul, it fails with the following error:

93 > Sun Mar 17 : 04:22 PM : samkit@samkit:~/AMD/AMD-APP-SDK-v2.8-RC-lnx32/samples/opencl/bin/x86$ ./matmul Segmentation fault (core dumped)

This is my sample output:

   135 > Sun Mar 17 : 07:49 PM : samkit@samkit:~/AMD/AMD-APP-SDK-v2.8-RC-lnx32/samples/opencl/bin/x86$ ./matmul
 Here: 1 
 Here: 2 
 Here: 3 
 Here: 4 
 Here: 5 
 Here: 6 
 Here: 7 
 Here: 8 
 Here: 9 
 Here: 10 
 Here: 11 
 Here: 12 
Following code was complied on GPU ! 



 Here: 13 


Result is: 
Segmentation fault (core dumped)

I know the kernel's functionality is nowhere near that of a matrix multiplication kernel, but I did this just to check whether there is some error in my kernel. Please give suggestions or advice that can help me make my code run.

Thanks in advance.

You need way more error checking here. Don't just get the error code, also print out a message if something goes wrong! I think the kernel should be using `constant`, not `const` (which is not a memory space qualifier) though CLC should be able to correct it. So, yeah, add some debugging code to get some more information (where exactly does it fail, etc..) — Thomas, Mar 17 '13 at 22:58
I have edited the code to add progress flags. Using constant didn't help; I am still getting the error. Something strange is happening. If I comment out the loops that print the elements of A and B (while keeping unsigned long int ii; uncommented), only the flags up to Here: 13 are displayed. Here: 14 and Here: 15 are not displayed, and neither are the elements of C! If I uncomment these loops, then all flags up to Here: 15 are displayed and the elements of C are displayed too, but I still get a segmentation fault. Can you run this code on your machine and share the output? Please help. — samkit, Mar 18 '13 at 00:58
You did not allocate memory for `ndim`. I'm guessing this is trashing your stack. Can you print out the value of `sizec` at the beginning of the program, and then right before displaying the elements of `C`? Also print the value of `C` (as a pointer). Definitely memory corruption. — Thomas, Mar 18 '13 at 01:15
Oh, good! Yes, you are right: it was the pointer *ndim that was causing the problem. I forgot to allocate memory for it. I just did that and everything is on track! Thanks a lot @Thomas: I could never have debugged this. :) Thanks once again. — samkit, Mar 18 '13 at 01:35

segmentation fault(core dumped) in opencl

0 Answers0

Linked