How can I implement a custom atomic function involving several variables?

Question

I'd like to implement this atomic function in CUDA:

__device__ float lowest;   // global var
__device__ int  lowIdx;    // global var
float realNum;   // thread reg var
int index;       // thread reg var

if(realNum < lowest) {
 lowest= realNum;  // the new lowest
 lowIdx= index;    // update the 'low' index
}

I don't believe I can do this with any of the atomic functions. I need to lock down a couple global memory loc's for a couple instructions. Might I be able to implement this with PTXAS (assembly) code?

I don't think there's a way (PTX or otherwise) to use any specific GPU hardware to atomically update more than one location at a time. Someone else may have a clever idea. Normally I think this type of problem would be solved using a "critical section" methodology, and you may want to use the search box in the upper right hand corner to search "cuda critical section" and see what is described in some of those questions. It appears that you might want to run this on a per-thread basis, and per-thread critical section management can be quite perilous/difficult. — Robert Crovella, Jul 01 '13 at 19:01
Actually, for this limited case where you have only two 32-bit quantities you're trying to manage, it may be possible to create a custom atomic function, perhaps built around `atomicCAS`, utilizing a 64-bit quantity (by cleverly combining the two 32-bit quantities), perhaps along the lines given for the [arbitrary atomic example](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions) given in the documentation. — Robert Crovella, Jul 01 '13 at 19:10

Robert Crovella · Accepted Answer · 2022-12-07T02:29:53.257

As I stated in my second comment above, it's possible to combine your two 32-bit quantities into a single 64-bit atomically managed quantity, and deal with the problem that way. We then manage the 64-bit quantity atomically using the arbitrary atomic example as a rough guide. Obviously you can't extend this idea beyond two 32-bit quantities. Here's an example:

#include <stdio.h>
#define DSIZE 5000
#define nTPB 256

#define cudaCheckErrors(msg) \
    do { \
        cudaError_t __err = cudaGetLastError(); \
        if (__err != cudaSuccess) { \
            fprintf(stderr, "Fatal error: %s (%s at %s:%d)\n", \
                msg, cudaGetErrorString(__err), \
                __FILE__, __LINE__); \
            fprintf(stderr, "*** FAILED - ABORTING\n"); \
            exit(1); \
        } \
    } while (0)

typedef union  {
  float floats[2];                 // floats[0] = lowest
  int ints[2];                     // ints[1] = lowIdx
  unsigned long long int ulong;    // for atomic update
} my_atomics;

__device__ my_atomics test;

__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
    my_atomics loc, loctest;
    loc.floats[0] = val1;
    loc.ints[1] = val2;
    loctest.ulong = *address;
    while (loctest.floats[0] >  val1) 
      loctest.ulong = atomicCAS(address, loctest.ulong,  loc.ulong);
    return loctest.ulong;
}


__global__ void min_test(const float* data)
{

    int idx = (blockDim.x * blockIdx.x) + threadIdx.x;
    if (idx < DSIZE)
      my_atomicMin(&(test.ulong), data[idx],idx);
}

int main() {

  float *d_data, *h_data;
  my_atomics my_init;
  my_init.floats[0] = 10.0f;
  my_init.ints[1] = DSIZE;

  h_data = (float *)malloc(DSIZE * sizeof(float));
  if (h_data == 0) {printf("malloc fail\n"); return 1;}
  cudaMalloc((void **)&d_data, DSIZE * sizeof(float));
  cudaCheckErrors("cm1 fail");
  // create random floats between 0 and 1
  for (int i = 0; i < DSIZE; i++) h_data[i] = rand()/(float)RAND_MAX;
  cudaMemcpy(d_data, h_data, DSIZE*sizeof(float), cudaMemcpyHostToDevice);
  cudaCheckErrors("cmcp1 fail");
  cudaMemcpyToSymbol(test, &(my_init.ulong), sizeof(unsigned long long int));
  cudaCheckErrors("cmcp2 fail");
  min_test<<<(DSIZE+nTPB-1)/nTPB, nTPB>>>(d_data);
  cudaDeviceSynchronize();
  cudaCheckErrors("kernel fail");

  cudaMemcpyFromSymbol(&(my_init.ulong), test, sizeof(unsigned long long int));
  cudaCheckErrors("cmcp3 fail");

  printf("device min result = %f\n", my_init.floats[0]);
  printf("device idx result = %d\n", my_init.ints[1]);

  float host_val = 10.0f;
  int host_idx = DSIZE;
  for (int i=0; i<DSIZE; i++)
    if (h_data[i] < host_val){
      host_val = h_data[i];
      host_idx = i;
      }

  printf("host min result = %f\n", host_val);
  printf("host idx result = %d\n", host_idx);
  return 0;
}

Here is a similar example that does atomic update of 2 float quantities.

Here is another custom atomic example.

score -1 · Answer 2 · answered Jul 03 '14 at 22:32

-1

@Robert Crovella: Excellent idea, but I think the function should be modified a little bit as follows:

__device__ unsigned long long int my_atomicMin(unsigned long long int* address, float val1, int val2)
{
    my_atomics loc, loctest, old;
    loc.floats[0] = val1;
    loc.ints[1] = val2;
    loctest.ulong = *address;
    old.ulong = loctest.ulong;
    while (loctest.floats[0] > val1){
        old.ulong = loctest.ulong;
        loctest.ulong = atomicCAS(address, loctest.ulong,  loc.ulong);
    }
    return old.ulong;
}

answered Jul 03 '14 at 22:32

Ehsan Shoja

443
3
6

1

I'm not sure why. It seems we only disagree on the return value of the function. In your case, the return value pattern *does not match* the pattern established by the [example given in the documentation](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions) which returns *the most recent value* returned from the `atomicCAS` function (assuming the while loop is entered). Your variety does not do this. – Robert Crovella Jul 04 '14 at 04:35
@RobertCrovella: Indeed, returning the last `atomicCAS` return value makes it an `atomic_fetch_min`, returning the previous contents of the memory location after updating it, or after realizing the value in memory is already lower. Returning `old` means you can return an earlier value than that, if you exit on `loctest.floats[0] > val1` being false after a failed CAS. That doesn't seem useful – Peter Cordes Dec 07 '22 at 03:37

How can I implement a custom atomic function involving several variables?

2 Answers2

Linked

Related