2

I am trying to modify an existing particle-method code to run on a GPU using OpenACC. The existing code uses a 2D dynamic array of structs in C. I need to copy the structure(s) to the GPU for further calculation. A code sample is given below:

/* One bucket of the domain together with the particles it holds. */
typedef struct {
  int *list;   /* list of particles in this bucket */
  int count;   /* total number of particles in the bucket */
} structBucket;


/* The bucket grid; individual buckets are addressed as bucket[iX][iY]. */
typedef struct {
  structBucket **bucket;     /* array of row pointers, one row per x index */
  int numberOfBuckets[2];    /* number of buckets in x- and y- dimensions */
} structDomain;

structDomain domain;

// Grid size: 10 buckets along x and 5 along y.
  domain.numberOfBuckets[XDIM] = 10;
  domain.numberOfBuckets[YDIM] = 5;
  int iX, iY, capacity;

// Allocate the array of row pointers (one entry per x index).
  domain.bucket = malloc(sizeof *domain.bucket * domain.numberOfBuckets[XDIM]);

// Allocate every row (one structBucket per y index).
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
    domain.bucket[iX] = malloc(sizeof *domain.bucket[iX] * domain.numberOfBuckets[YDIM]);
  }

// Calculate domain.bucket[iX][iY].count here using some logic
.
.
.
// Allocate memory for *list: one int per particle in each bucket.
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;

        // Empty buckets get a NULL list instead of an indeterminate pointer,
        // so any code that visits every bucket sees a well-defined value.
        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = malloc(sizeof *domain.bucket[iX][iY].list * capacity);
        }
        else
        {
          domain.bucket[iX][iY].list = NULL;
        }
    }
  }

After reviewing various sources on the internet, I have come up with the following solution (which might be utterly wrong):

// Create device memory for **bucket and for every *list on the GPU.
//
// Shallow copy of the domain struct itself (the bucket pointer value and
// numberOfBuckets); the memory bucket points to is not copied by this line.
#pragma acc enter data copyin(domain)
// NOTE(review): the bucket pointer member is already part of the struct copied
// above, so this separate copyin is presumably unnecessary -- confirm.
#pragma acc enter data copyin(domain.bucket)
// Device allocation for the 2D bucket array. "create" allocates without
// copying host contents, so host-computed count values are not transferred here.
#pragma acc enter data create(domain.bucket[0:domain.numberOfBuckets[XDIM]][0:domain.numberOfBuckets[YDIM]])
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        // Device allocation for this bucket's list, sized by the host-side count.
        // NOTE(review): issued for every bucket regardless of count -- confirm
        // that list holds a valid pointer when count is 0.
        #pragma acc enter data create(domain.bucket[iX][iY].list[0:domain.bucket[iX][iY].count])
    }
  }

I would like advice on the manual deep copy of **bucket and *list to GPU memory. Is my solution correct? Could someone suggest improvements or a better way to manually deep-copy these struct(s)?

I am using PGI 19.4 compiler on Windows 10. Many thanks

Ali Imran
  • 27
  • 4

1 Answers1

1

Close. The only things I'd do differently are to not copy in "domain.bucket" separately and to update each bucket's count so the device has this information. Also, since updates/copies are shallow, be sure to only update the list array or scalars in the structs. Otherwise you may overwrite device/host pointers. Here's an example. While I'm using Linux, other than the executable name, the code should be the same.

% cat test.c

#include <stdio.h>
#include <stdlib.h>

/* One bucket of the domain together with the particles it holds. */
typedef struct {
  int *list;  // list of particles in this bucket (count entries)
  int  count; // total number of particles in the bucket
} structBucket;


/* The bucket grid; individual buckets are addressed as bucket[iX][iY]. */
typedef struct {
  structBucket **bucket;
  int numberOfBuckets[2]; // number of buckets in x- and y- dimensions
} structDomain;

/*
 * Indices into structDomain.numberOfBuckets. They must be distinct and
 * smaller than the array length (2); using 64 for both indexed past the
 * end of the array and made the x and y sizes share one location.
 */
#define XDIM 0
#define YDIM 1

/* Allocate nbytes on the host; print a message and exit on failure. */
static void *allocOrExit(size_t nbytes)
{
  void *p = malloc(nbytes);
  if (p == NULL) {
    fprintf(stderr, "out of memory (%lu bytes)\n", (unsigned long)nbytes);
    exit(EXIT_FAILURE);
  }
  return p;
}

/*
 * Manual deep copy of domain -> bucket[][] -> list[] to the device using
 * OpenACC unstructured data directives. Every list is filled on the device
 * with 0..count-1, then the lists of the first 5x5 buckets are copied back
 * and printed. Always returns 0; exits with EXIT_FAILURE if a host
 * allocation fails.
 */
int main(void) {

  structDomain domain;
  int iX, iY, capacity;

// Allocate memory for **bucket
  domain.numberOfBuckets[XDIM] = 10; domain.numberOfBuckets[YDIM] = 5;

  domain.bucket = allocOrExit(sizeof *domain.bucket * domain.numberOfBuckets[XDIM]);

  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++) {
    domain.bucket[iX] = allocOrExit(sizeof *domain.bucket[iX] * domain.numberOfBuckets[YDIM]);
  }


// Calculate domain.bucket[iX][iY].count here using some logic
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
       domain.bucket[iX][iY].count = iX*domain.numberOfBuckets[YDIM]+iY;
       // Start from NULL so buckets that never get a list (count == 0)
       // do not carry an indeterminate pointer on the host.
       domain.bucket[iX][iY].list = NULL;
  }}
// Shallow copy of the domain struct, then device storage for the 2D bucket
// array. "create" does not copy host values, hence the per-bucket update of
// count in the loop below.
#pragma acc enter data copyin(domain)
#pragma acc enter data create(domain.bucket[:domain.numberOfBuckets[XDIM]][:domain.numberOfBuckets[YDIM]])
// Allocate memory for *list
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
// Only the scalar is updated: a shallow update of the whole struct would
// overwrite the device-side list pointer with the host address.
#pragma acc update device(domain.bucket[iX][iY].count)
        if (capacity > 0)
        {
          domain.bucket[iX][iY].list = allocOrExit(sizeof *domain.bucket[iX][iY].list * capacity);
#pragma acc enter data create(domain.bucket[iX][iY].list[:capacity])
        }
    }
  }

#pragma acc parallel loop gang collapse(2) present(domain)
  for (iX = 0; iX < domain.numberOfBuckets[XDIM]; iX++)
  {
    for (iY = 0; iY < domain.numberOfBuckets[YDIM]; iY++)
    {
        // Declared inside the loop body so every iteration has its own copy
        // rather than writing a variable shared with the host code.
        const int nParticles = domain.bucket[iX][iY].count;
        if (nParticles > 0) {
#pragma acc loop vector
           for (int i = 0; i < nParticles; ++i) {
                domain.bucket[iX][iY].list[i] = i;
           }
        }
   }}

  // Only the first 5x5 buckets are copied back and printed.
  for (iX = 0; iX < 5; iX++)
  {
    for (iY = 0; iY < 5; iY++)
    {
        capacity = domain.bucket[iX][iY].count;
        if (capacity > 0) {
#pragma acc update host(domain.bucket[iX][iY].list[:capacity])
           printf("iX=%d iY=%d Cnt=%d\n\t",iX,iY,capacity);
           for (int i = 0; i < capacity; ++i) {
                printf("%d ",domain.bucket[iX][iY].list[i]);
           }
           printf("\n");
        }
   }}

  // The demo ends here, so host and device allocations are not released
  // explicitly.
  return 0;
}
% pgcc test.c -ta=tesla -Minfo=accel -V19.4
main:
     40, Generating enter data copyin(domain)
     41, Generating enter data create(domain.bucket[:domain.numberOfBuckets][:domain.numberOfBuckets])
     49, Generating update device(domain.bucket->->count)
     52, Generating enter data create(domain.bucket->->list[:capacity])
     57, Generating present(domain)
         Generating Tesla code
         58, #pragma acc loop gang collapse(2) /* blockIdx.x */
         60,   /* blockIdx.x collapsed */
         65, #pragma acc loop vector(128) /* threadIdx.x */
     65, Accelerator restriction: size of the GPU copy of domain.bucket is unknown
         Loop is parallelizable
     78, Generating update self(domain.bucket->->list[:capacity])
% a.out
iX=0 iY=1 Cnt=1
        0
iX=0 iY=2 Cnt=2
        0 1
iX=0 iY=3 Cnt=3
        0 1 2
iX=0 iY=4 Cnt=4
        0 1 2 3
iX=1 iY=0 Cnt=5
        0 1 2 3 4
iX=1 iY=1 Cnt=6
        0 1 2 3 4 5
iX=1 iY=2 Cnt=7
        0 1 2 3 4 5 6
iX=1 iY=3 Cnt=8
        0 1 2 3 4 5 6 7
iX=1 iY=4 Cnt=9
        0 1 2 3 4 5 6 7 8
iX=2 iY=0 Cnt=10
        0 1 2 3 4 5 6 7 8 9
iX=2 iY=1 Cnt=11
        0 1 2 3 4 5 6 7 8 9 10
iX=2 iY=2 Cnt=12
        0 1 2 3 4 5 6 7 8 9 10 11
iX=2 iY=3 Cnt=13
        0 1 2 3 4 5 6 7 8 9 10 11 12
iX=2 iY=4 Cnt=14
        0 1 2 3 4 5 6 7 8 9 10 11 12 13
iX=3 iY=0 Cnt=15
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
iX=3 iY=1 Cnt=16
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
iX=3 iY=2 Cnt=17
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
iX=3 iY=3 Cnt=18
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
iX=3 iY=4 Cnt=19
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
iX=4 iY=0 Cnt=20
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19
iX=4 iY=1 Cnt=21
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
iX=4 iY=2 Cnt=22
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
iX=4 iY=3 Cnt=23
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
iX=4 iY=4 Cnt=24
        0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
Mat Colgrove
  • 5,441
  • 1
  • 10
  • 11
  • Sir, I can't thank you enough for your extremely helpful answer. It has not only solved this problem but has also given me some useful hints to address another related issue that I was, frankly, having a hard time even to describe in the question. Thanks a lot for your help. – Ali Imran Dec 25 '19 at 07:11