maximum supported size for cub library

Question

Does anyone know what is the maximum supported size for cub::scan ? I got core dump for input sizes over 500 million. I wanted to make sure I'm not doing anything wrong...

Here is my code:

#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;

bool                    g_verbose = false;  // Whether to display input/output to console
CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
typedef int mytype;

/**
 * Solve inclusive-scan problem
 */

static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype inclusive = 0;
    for (int i = 0; i < n; ++i) {
      inclusive += h_in[i];
      h_cpu[i] = inclusive;
    }
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    for (int i = 0; i < n; i++) {
      if (h_cpu[i] != h_o[i]) {
        return i + 1;
      }
    }
    return 0;
}

/**
 * Main
 */
int main(int argc, char** argv)
{
    cudaSetDevice(0);
    struct timeval start, end;
    int num_items = 1073741824;
    const int repetitions = 5;
    mytype *h_in, *h_out, *h_cpu;
    const int size = num_items * sizeof(mytype);
    // Allocate host arrays
    h_in = (mytype *)malloc(size);
    h_out = (mytype *)malloc(size);
    h_cpu = (mytype *)malloc(size);


    // Initialize problem and solution
    for (int i = 0; i < num_items; i++) {
        h_in[i] = i;
        h_out[i] = 0;
        h_cpu[i] = 0;
    }

    solve(h_in, h_cpu, num_items);

    // Allocate problem device arrays
    mytype *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));

    // Allocate device output array
    mytype *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run
    gettimeofday(&start, NULL);
    for (long i = 0; i < repetitions; i++) 
        DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaThreadSynchronize();
    gettimeofday(&end, NULL);
    double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;

    cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
    int cmp = compare(h_cpu, h_out, num_items);
    printf("%d\t", num_items);
    if (!cmp)
        printf("\t%7.4fs \n", ctime);
    printf("\n");
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (h_cpu) delete[] h_cpu;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    printf("\n\n");

    return 0;
}

What GPU are you using and how much memory does it have? Also what type are you performing the scan on? — talonmies, Oct 09 '15 at 12:14
I'm using TitanX and it has 12GB. I'm performing Exclusive Device Scan. — JacobS, Oct 09 '15 at 15:34
What **data type** are you performing the scan on ? `int`, `float`, `double`, ... ? And how much host memory do you have? — Robert Crovella, Oct 09 '15 at 17:55
[Here](http://pastebin.com/mMk3s8DW) is a worked example of a cub::ExclusiveSum on an array of `int` of size 1073741824 (over 1 Billion elements). It was done using CUDA 7.0 and CUB 1.4.1 on a RHEL 6.2 system with 64GB of host memory and a K40c, which also has 12GB. Ultimately, if you don't run into a memory allocation error, cub should be able to handle up to the maximum positive number that will fit in the `int num_items` variable - so around 2 Billion elements. But memory limitations may come into play before you hit this number, particularly depending on the data type you are using. — Robert Crovella, Oct 09 '15 at 18:40
I'm working on 32 bits integers and it works for 268435456 elements but above that I get Segmentation fault (core dumped). For 64 bit integers it only works for 134217728 integers and I get the same error for anything above that. I'm using CubDebugExit for allocation device input/output, so I have to get a more meaningful error message if it has something to do with the memory allocation right? — JacobS, Oct 09 '15 at 19:30
@RobertCrovella I ran your code and it worked! I used cub's sample code for device scan [here](http://pastebin.com/4r9035fD) I don't know why it's not working. — JacobS, Oct 09 '15 at 20:16
You should learn how to do basic debug of a host code seg fault. At a minimum, sprinkle printf through your code until you find the exact section of code that is causing the seg fault/core dump. Certainly one problem with the code you have posted is here: `const int size = num_items * sizeof(mytype);` Spend some time thinking about that line of code. If necessary, get out a calculator, compute what that number would be, then see if it will properly fit into an `int` variable. (This particular problem has nothing to do with CUDA, of course.) — Robert Crovella, Oct 09 '15 at 20:40
And by the way, the code you have posted [here](http://pastebin.com/4r9035fD) won't compile. I had to fix a few things. In the future, you should provide a proper, complete code like that, directly in your question. I'm going to edit your question to add a compilable version of that code. — Robert Crovella, Oct 09 '15 at 20:44
Why did you remove the code from your question? Did I not explain to you why it belongs there and why I was adding it? Please read [this](http://stackoverflow.com/help/on-topic) in particular: "Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error and the shortest code necessary to reproduce it in the question itself. Questions without a clear problem statement are not useful to other reader" — Robert Crovella, Oct 09 '15 at 22:06
Oh sorry, I did not notice that you uploaded it. I thought it is my old code which is not working so I was going to correct that and upload it again — JacobS, Oct 09 '15 at 22:14

Robert Crovella · Accepted Answer · 2015-10-09T22:12:22.130

The problem is here:

const int size = num_items * sizeof(mytype);

And it can be fixed by changing it to:

const size_t size = num_items * sizeof(mytype);

The value of num_items in the code is over 1 Billion. When we multiply that by sizeof(mytype) we are multiplying it by 4, so the result is over 4 Billion. This value cannot be stored in an int variable. If you try to use it anyway like that, then your subsequent host code will do bad things. This problem (the core dump) actually has nothing to do with CUDA. The code would core dump if you removed all the CUB elements.

When I modify the line of code above, and compile for the correct GPU (e.g. -arch=sm_35 in my case, or -arch=sm_52 for a Titan X GPU), then I get the correct answer (and no seg fault/core dump).

In general, the correct starting point when chasing a seg fault/core dump type error, is to recognize that this error arises from host code and you should attempt to localize the exact line of source code that is generating this error. This can be done trivially/tediously by putting many printf statements in your code, until you identify the line of your code after which you don't see any printf output, or by using a host code debugger, such as gdb on linux.

Also note that this code as written will require slightly more than 12GB of memory on the host, and slightly more than 8GB of memory on the GPU, so it will only run properly in such settings.

For reference, here is the fixed code (based on what OP posted here):

#define CUB_STDERR
#include <stdio.h>
#include "cub/util_allocator.cuh"
#include "cub/device/device_scan.cuh"
#include <sys/time.h>
using namespace cub;

bool                    g_verbose = false;  // Whether to display input/output to console
CachingDeviceAllocator  g_allocator(true);  // Caching allocator for device memory
typedef int mytype;

/**
 * Solve inclusive-scan problem
 */

static void solve(mytype *h_in, mytype *h_cpu, int n)
{
    mytype inclusive = 0;
    for (int i = 0; i < n; ++i) {
      inclusive += h_in[i];
      h_cpu[i] = inclusive;
    }
}
static int compare(mytype *h_cpu, mytype *h_o, int n)
{
    for (int i = 0; i < n; i++) {
      if (h_cpu[i] != h_o[i]) {
        return i + 1;
      }
    }
    return 0;
}

/**
 * Main
 */
int main(int argc, char** argv)
{
    cudaSetDevice(0);
    struct timeval start, end;
    int num_items = 1073741824;
    const int repetitions = 5;
    mytype *h_in, *h_out, *h_cpu;
    const size_t size = num_items * sizeof(mytype);
    // Allocate host arrays
    h_in = (mytype *)malloc(size);
    h_out = (mytype *)malloc(size);
    h_cpu = (mytype *)malloc(size);


    // Initialize problem and solution
    for (int i = 0; i < num_items; i++) {
        h_in[i] = i;
        h_out[i] = 0;
        h_cpu[i] = 0;
    }

    solve(h_in, h_cpu, num_items);

    // Allocate problem device arrays
    mytype *d_in = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_in, sizeof(mytype) * num_items));

    // Initialize device input
    CubDebugExit(cudaMemcpy(d_in, h_in, sizeof(mytype) * num_items, cudaMemcpyHostToDevice));

    // Allocate device output array
    mytype *d_out = NULL;
    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out, sizeof(mytype) * num_items));

    // Allocate temporary storage
    void            *d_temp_storage = NULL;
    size_t          temp_storage_bytes = 0;


    CubDebugExit(DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items));
    CubDebugExit(g_allocator.DeviceAllocate(&d_temp_storage, temp_storage_bytes));

    // Run
    gettimeofday(&start, NULL);
    for (long i = 0; i < repetitions; i++) 
        DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
    cudaThreadSynchronize();
    gettimeofday(&end, NULL);
    double ctime = end.tv_sec + end.tv_usec / 1000000.0 - start.tv_sec - start.tv_usec / 1000000.0;

    cudaMemcpy(h_out, d_out, sizeof(mytype) * num_items, cudaMemcpyDeviceToHost);
    int cmp = compare(h_cpu, h_out, num_items);
    printf("%d\t", num_items);
    if (!cmp)
        printf("\t%7.4fs \n", ctime);
    printf("\n");
    if (h_in) delete[] h_in;
    if (h_out) delete[] h_out;
    if (h_cpu) delete[] h_cpu;
    if (d_in) CubDebugExit(g_allocator.DeviceFree(d_in));
    if (d_out) CubDebugExit(g_allocator.DeviceFree(d_out));
    if (d_temp_storage) CubDebugExit(g_allocator.DeviceFree(d_temp_storage));

    printf("\n\n");

    return 0;
}

maximum supported size for cub library

1 Answers1