3

I was wondering if anybody can shed some light on this behaviour with the new operator within a kernel.. Following is the code

#include <stdio.h>
#include "cuda_runtime.h"
#include "cuComplex.h"
using namespace std;
__global__ void test()
{

    cuComplex *store;
    store= new cuComplex[30000];
    if (store==NULL) printf("Unable to allocate %i\n",blockIdx.y);
    delete store;
    if (threadIdx.x==10000) store->x=0.0;
}

int main(int argc, char *argv[])
{
    float timestamp;
    cudaEvent_t event_start,event_stop;
    // Initialise


    cudaEventCreate(&event_start);
    cudaEventCreate(&event_stop);
    cudaEventRecord(event_start, 0);
    dim3 threadsPerBlock;
    dim3 blocks;
    threadsPerBlock.x=1;
    threadsPerBlock.y=1;
    threadsPerBlock.z=1;
    blocks.x=1;
    blocks.y=500;
    blocks.z=1;

    cudaEventRecord(event_start);
    test<<<blocks,threadsPerBlock,0>>>();
    cudaEventRecord(event_stop, 0);
    cudaEventSynchronize(event_stop);
    cudaEventElapsedTime(&timestamp, event_start, event_stop);
    printf("test took  %fms \n", timestamp);
}

Running this on a GTX680 Cuda 5 and investigating the output one will notice that randomly memory is not allocated :( I was thinking that maybe it is because all global memory is finished but I have 2GB of memory and since the maximum amount of active blocks is 16 the amount of memory allocated with this method should at maximum be 16*30000*8=38.4x10e6.. ie around 38Mb. So what else should I consider?

Daniel
  • 639
  • 1
  • 4
  • 17
  • The memory allocated should be `16*30000*sizeof(cuComplex)` – pQB Oct 25 '12 at 16:11
  • Also note that `threadIdx.x` would never be equal to `10000` – pQB Oct 25 '12 at 16:25
  • This is possibly just memory fragmentation on the runtime heap. Have you tried increasing the runtime heap size a bit? – talonmies Oct 25 '12 at 16:27
  • Yes you are right but cuComplex is only 8 bytes so the sum just goes up to 32Mb.. still there is more plenty of memory available (2GB). regarding threadIdx.x==1000 that is just to fool the compiler and not let it ignore the store variable – Daniel Oct 25 '12 at 16:30
  • @tolanmies.. i inserted in the code cudaDeviceSetLimit(cudaLimitMallocHeapSize,30000*8*400); and it worked... but how do I calculate to what size I need to increase this? – Daniel Oct 25 '12 at 17:10

1 Answers1

3

The problem is related with the size of the heap used by the malloc() and free() device system calls. See section 3.2.9 Call Stack and appendix B.16.1 Heap Memory Allocation in the NVIDIA CUDA C Programming Guide for more details.

Your test will work if you set the heap size to fit your kernel requirement

    cudaDeviceSetLimit(cudaLimitMallocHeapSize, 500*30000*sizeof(cuComplex));
pQB
  • 3,077
  • 3
  • 23
  • 49
  • But this will eat all my memory...and is not really required since only allocations from 6 blocks are possible... Is there a way to have finer calculation of the limit? – Daniel Oct 25 '12 at 17:38
  • Change `500` for `concurrently blocks x Streaming Multiprocessors` to get the minimum size that you need, being `SM = 8` in the gtx680 and the number of concurrent blocks depending of the kernel requirements in terms of registers or shared memory. – pQB Oct 25 '12 at 17:47