Genetic cellular automata with PyCuda, how to efficiently pass a lot of data per cell to CUDA kernel?

Question

I'm developing a genetic cellular automata using PyCuda. Each cell will have a lot of genome data, along with cell parameters. I'm wondering what could be a most efficient way to 1) pass cells data to a CUDA kernel, then 2) to process this data.

I began with one particularly bad (imo), yet still working solution. It was passing each parameter in a separate array, then process them with a switch-case and a lot of duplicate code.

Then, realized that I could quickly end up with pretty large number of parameters per kernel function, and decide to rewrite it.

Second solution was to store all bunch of cell's parameters in a single array with extra dimension. That was much more elegant in code, but surprisingly the code runs 10x slower!

To make it more clear, the full list of data I need to be stored per cell:

(Fc, Mc, Tc): 3x (int) - the cell's current 'flavor', mass and temperature
(Rfc, Rmc, Rtc): 3x (int) - the cell's current registers
(Fi, Mi, Ti) for each neighbour: 8*3x (int) - incoming values
(Rfi, Rmi, Rti) for each neighbour: 8*3x (int) - incoming values
gate orientation: 1x (uchar)
execution pointer: 1x (uchar)
current micro-operations memory: 32x (uchar)
last step's micro-operations memory: 32x (uchar)

I'm splitting an automata step in 2 phases. First (emit phase) is calculating (Fi, Mi, Ti) for each cell neighbours. Second (absorb phase) is blending 8x(Fi, Mi, Ti) values with current cells' states. No genome or registers implemented yet, but I need its data to be passed for future.

So, the code for my first solution was:

Mk = 64
Tk = 1000

emit_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h", """
    int x = i / h;
    int y = i % h;

    int3 cell = cells[i];
    float M = (float) cell.y;
    float T = (float) cell.z;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i].y -= Mi;
    cells[i].z -= (int) (T * fmin(1, T / Tk) / 1);

    int Fi = cell.x;
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int Ti = cell.z;
    int ii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      int c = (cc + Fi) % 9;
      if (c == 4) continue;
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo;
      if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo;
      if (xo >= w) yo = yo - h;
      ii = xo * h + yo;
      if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
      switch(c) {
        case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
        default: break;
      }

    } 
""", "ca_prepare", preamble="""
#define Tk %s
""" % Tk)

absorb_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h", """
    int3 cell = cells[i];

    int3 dcell = dcells0[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;

    dcell = dcells1[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells2[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells3[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells4[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells5[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells6[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    dcell = dcells7[i];
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;

    cells[i] = cell;
    img[i] = hsv2rgb(cell);

""", "ca_calc", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

__device__ int3 operator+(const int3 &a, const int3 &b) {
    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}

__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
    int dF = Fi - Fc;
    if (dF > 180) Fc += 360;
    if (dF < -180) Fc -= 360;
    float sM = Mi + Mc;
    if (sM != 0) sM = Mi / sM; else sM = 0;
    dF = (int) (Fi - Fc) * sM;
    int dM = Mi;
    int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
    return make_int3(dF, dM, dT);
}

__device__ uint hsv2rgb(int3 pixel) {
    // skipped for brevity
}
""" % (Mk, Tk, RAM))

The second and current solution:

Mk = 64
Tk = 1000
CELL_LEN = 120 # number of parameters per cell

emit_gpu = ElementwiseKernel("int *cells, int w, int h", """
    int x = i / h;
    int y = i % h;
    int ii = i * CN;

    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];
    float M = (float) Mc;
    float T = (float) Tc;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[ii+1] = Mc - Mi;
    cells[ii+2] = Tc - (int) (T * fmin(1, T / Tk));

    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int iii, xo, yo;

    for (int cc = 0; cc < 9; cc++) {
      int c = (cc + Fc) % 9;
      if (c == 4) continue;
      xo = x + c%3 - 1;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      yo = y + c/3 - 1;
      if (yo < 0) yo = h + yo; else if (xo >= w) yo = yo - h;
      if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
      if (c > 4) c--;
      iii = (xo * h + yo) * CN + 6 + c*3;

      cells[iii] = Fc;
      cells[iii+1] = Mbase + Madd;
      cells[iii+2] = Tc;

    } 
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))

absorb_gpu = ElementwiseKernel("int *cells, int *img, int w, int h", """
    int ii = i * CN;
    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];

    for (int c=0; c < 8; c++){
      int iii = ii + c * 3 + 6;
      int Fi = cells[iii];
      int Mi = cells[iii+1];
      int Ti = cells[iii+2];

      int dF = Fi - Fc;
      if (dF > 180) Fc += 360;
      if (dF < -180) Fc -= 360;
      float sM = Mi + Mc;
      if (sM != 0) sM = Mi / sM; else sM = 0;
      dF = (int) (Fi - Fc) * sM;
      int dM = Mi;
      int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
      Fc += dF;
      Mc += dM;
      Tc += dT;
      Fc = Fc % 360;
      if (Fc < 0) Fc += 360;
      if (Tc > Tk) Tc = Tk;
    }      

    cells[ii] = Fc;
    cells[ii+1] = Mc;
    cells[ii+2] = Tc;
    cells[ii+18] = (cells[ii+18] + 1) % 8;

    img[i] = hsv2rgb(Fc, Tc, Mc);

""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
#define CN %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // skipped for brevity
}
""" % (Mk, Tk, CELL_LEN))

Both variants produce exactly the same CA behaviour, but latter is running much slower.

GTX Titan:

Field size: 1900x1080 cells
Solution #1: ~200 steps/s
Solution #2: ~20 steps/s

GT 630M:

Field size: 1600x900 cells
Solution #1: ~7.8 steps/s
Solution #2: ~1.5 steps/s

Please feel free to play with both solutions' if you need:

Solution #1 full source

Solution #2 full source

Any clues or advises are welcome:

Why the performance is slowed down?
Is it possible to raise the performance of solution #2 at least to the level of #1?
Or another solution would be better?

are you surprised that a card with 2688 CUDA cores beats one with 96 cores? — AlexanderBrevig, Dec 16 '14 at 09:37
sorry, misread your question - it relates to solution 1 vs 2, not card one and two. Sorry. — AlexanderBrevig, Dec 16 '14 at 09:38
Exactly, the first thing to start optimization is to know why I have so drastic performance changes between two solutions. Different cards results are just to compare. Less cores leads to less slowing down. 10x slow on Titan VS 5x slow on GT630. — a5kin, Dec 16 '14 at 09:44
Disclaimer: I don't know Python. Overall, your code seems to be completely oblivious of the vector nature of GPUs, so you won't get good performance in either case. That said, your second kernel is worse in terms of memory access patterns than the first one, which may explain the slowdown. You seem to be processing cell attributes sequentially, therefore I would suggest packing related attributes in different arrays (as in your first solution) and put them in a structure to avoid parameter explosion. Doing the opposite is usually a performance penalty (classic SoA vs AoS). — user703016, Dec 16 '14 at 12:01
If I remember correctly pycuda is a python wrapper for cuda c/c++ (please correct me if i am wrong) so you may use nvprof or nsight to profile your executeable — Michael Haidl, Dec 16 '14 at 13:12
If this question gets closed, [codereview.stackexchange.com](http://codereview.stackexchange.com/) may be better suited for this sort of question. — Uyghur Lives Matter, Dec 16 '14 at 14:08
This is my first question, can you please explain what is so bad about it that it would be closed? I'm not asking there to fix my computations or the model. What I'm looking for is a pattern to efficiently pass a large amount of parameters to CUDA kernel, and then process and store them back. It's not only applicable to my particular problem, but rather to a large variety of problems, I believe. I posted my code and model only to illustrate a way I'm trying to solve a problem formulated in the question's title. — a5kin, Dec 16 '14 at 15:55
@cpburnz Zero questions tagged 'cuda' on codereview.stackexchange.com, why should I post in a place where no one interested in the main topic of a question? — a5kin, Dec 16 '14 at 18:20
@a5kin It's not so much that I think this question should be closed, but I noticed that there were already a couple close votes (now 4 which is 1 shy of closure). So I thought I would mention an alternative location where you might find an answer if the community here (which can be harsh) deems it off topic. — Uyghur Lives Matter, Dec 16 '14 at 18:28
@cpburnz Can you please suggest a way to improve the content of the question, or it's really just wrong place to ask? Anyway, thank you for the warning and for the link to codereview. — a5kin, Dec 16 '14 at 18:40
@a5kin I don't see anything wrong with the question. Though it may fall into a grey area because it fits both sites as indicated by: [Can I post questions about optimizing code on Stack Overflow?](http://meta.stackoverflow.com/q/261841/369450) — Uyghur Lives Matter, Dec 16 '14 at 19:02
@cpburnz Yes, it's exact the case Warren Dew mentioned in the tread you shown. My code is not just need to be optimized, it's the crucial thing to speed it up at least 15x. The current speed is like it's not working at all. I ran other CAs 150x faster using same GPU and same framework, but they had much fewer params. That's why I decide to ask am I passing and processing params correct. Anyway, I already have clues in comments to start with. — a5kin, Dec 16 '14 at 19:56

a5kin · Answer 1 · 2014-12-19T11:01:11.910

OK, I managed how to run second solution almost 15x faster. Following changes were made:

Convert main parameters array from int to int4. This made it even faster than solution with int3. Although, extra space left unused (.w dimension). [3x speedup]
Repack related parameters in WIDTHxHEIGHT groups. So, shape changed from (WIDTH, HEIGHT, N) to (N, WIDTH, HEIGHT). This made memory access more efficient, since elements inside groups tends to be processed together. [5x speedup]

The optimized code looks like:

Mk = 64
Tk = 1000

emit_gpu = ElementwiseKernel("int4 *cells, int w, int h, int cn", """
    int x = i / h;
    int y = i % h;

    int4 cell = cells[i];
    int Fc = cell.x;
    int Mc = cell.y;
    int Tc = cell.z;
    float M = (float) Mc;
    float T = (float) Tc;
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i] = make_int4(Fc, Mc - Mi, Tc - (int) (T * fmin(1, T / Tk)), 0);

    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int ii;
    int xo, yo;

    int cnn = 0;
    for (int dx = -1; dx < 2; dx++) {
      xo = x + dx;
      if (xo < 0) xo = w + xo; else if (xo >= w) xo = xo - w;
      for (int dy = -1; dy < 2; dy++) {
        if (dx == 0 && dy == 0) continue;
        cnn += cn;
        yo = y + dy;
        if (yo < 0) yo = h + yo; else if (yo >= h) yo = yo - h;
        if (Mpart > 0) { Madd = 1; Mpart--;} else Madd = 0;
        ii = (xo * h + yo) + cnn;
        cells[ii] = make_int4(Fc, Mbase + Madd, Tc, 0);
      }
    } 
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))

absorb_gpu = ElementwiseKernel("int4 *cells, int *img, int w, int h, int cn", """
    int ii = i;
    int4 cell = cells[i];
    int Fc = cell.x;
    int Mc = cell.y;
    int Tc = cell.z;

    for (int c=0; c < 8; c++){
      ii += cn;
      cell = cells[ii];
      int Fi = cell.x;
      int Mi = cell.y;
      int Ti = cell.z;

      int dF = Fi - Fc;
      if (dF > 180) Fc += 360;
      if (dF < -180) Fc -= 360;
      float sM = Mi + Mc;
      if (sM != 0) sM = Mi / sM; else sM = 0;
      dF = (int) (Fi - Fc) * sM;
      int dM = Mi;
      int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
      Fc += dF;
      Mc += dM;
      Tc += dT;
      Fc = Fc % 360;
      if (Fc < 0) Fc += 360;
      if (Tc > Tk) Tc = Tk;
    }      

    cells[i] = make_int4(Fc, Mc, Tc, 0);
    img[i] = hsv2rgb(Fc, Tc, Mc);

""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

__device__ uint hsv2rgb(int hue, int sat, int val) {
    // skipped for brevity
}
""" % (Mk, Tk))

Thanks to Park Young-Bae for clues on repacking and also to Alexey Shchepin for some optimization issues.

Genetic cellular automata with PyCuda, how to efficiently pass a lot of data per cell to CUDA kernel?

1 Answers1