I'm developing a genetic cellular automaton using PyCUDA. Each cell will carry a lot of genome data, along with its cell parameters. I'm wondering what would be the most efficient way to 1) pass the cells' data to a CUDA kernel, and then 2) process this data.
I began with one particularly bad (IMO), yet working, solution: it passed each parameter in a separate array, then processed them with a switch-case and a lot of duplicated code.
Then I realized that I could quickly end up with a pretty large number of parameters per kernel function, and decided to rewrite it.
The second solution was to store all of a cell's parameters in a single array with an extra dimension. That was much more elegant in code, but surprisingly the code runs 10x slower!
To make it clearer, here is the full list of data I need to store per cell:
- (Fc, Mc, Tc): 3x (int) - the cell's current 'flavor', mass and temperature
- (Rfc, Rmc, Rtc): 3x (int) - the cell's current registers
- (Fi, Mi, Ti) for each neighbour: 8*3x (int) - incoming values
- (Rfi, Rmi, Rti) for each neighbour: 8*3x (int) - incoming values
- gate orientation: 1x (uchar)
- execution pointer: 1x (uchar)
- current micro-operations memory: 32x (uchar)
- last step's micro-operations memory: 32x (uchar)
I'm splitting an automaton step into 2 phases. The first (emit phase) calculates (Fi, Mi, Ti) for each of a cell's neighbours. The second (absorb phase) blends the 8x (Fi, Mi, Ti) values with the current cell's state. No genome or registers are implemented yet, but I need their data to be passed for future use.
So, the code for my first solution was:
Mk = 64
Tk = 1000

# Emit phase, solution #1: one int3 output array per neighbour direction.
#
# For every cell i the kernel
#   * computes the mass Mi leaving the cell (scaled by min(1, T / Tk)),
#   * subtracts the emitted mass / temperature from cells[i] in place,
#   * writes an (F, M, T) triple into the dcellsN array of each of the eight
#     neighbours, wrapping around the field edges (torus topology).
#
# Arguments: `cells` holds (F, M, T) per cell, `dcells0..7` receive the
# incoming triples, `w` / `h` are the field dimensions. Cells are addressed as
# x * h + y, i.e. y is the fastest-varying coordinate.
emit_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int w, int h", """
    int x = i / h;
    int y = i % h;
    int3 cell = cells[i];
    float M = (float) cell.y;
    float T = (float) cell.z;

    // Mass leaving the cell this step; the emitted share is capped at 100%.
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[i].y -= Mi;
    cells[i].z -= (int) (T * fmin(1, T / Tk));

    int Fi = cell.x;
    // Split Mi evenly over 8 neighbours; the remainder is handed out one unit
    // at a time to the first Mpart neighbours visited.
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    // `cell` is a local copy, so this is the temperature before emission.
    int Ti = cell.z;
    int ii, xo, yo;

    // Walk the 3x3 neighbourhood starting at a slot that depends on the
    // flavour; slot 4 is the cell itself and is skipped.
    for (int cc = 0; cc < 9; cc++) {
        int c = (cc + Fi) % 9;
        if (c == 4) continue;

        // Neighbour coordinates with wrap-around on both axes.
        xo = x + c % 3 - 1;
        if (xo < 0) xo += w;
        else if (xo >= w) xo -= w;
        yo = y + c / 3 - 1;
        if (yo < 0) yo += h;
        else if (yo >= h) yo -= h;  // was `xo >= w`: y never wrapped at the bottom edge
        ii = xo * h + yo;

        if (Mpart > 0) { Madd = 1; Mpart--; } else Madd = 0;

        // Neighbourhood slots 0..3 and 5..8 map onto output arrays 0..7.
        switch (c) {
            case 0: dcells0[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 1: dcells1[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 2: dcells2[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 3: dcells3[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 5: dcells4[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 6: dcells5[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 7: dcells6[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            case 8: dcells7[ii] = make_int3(Fi, Mbase + Madd, Ti); break;
            default: break;
        }
    }
""", "ca_prepare", preamble="""
#define Tk %s
""" % Tk)
# Absorb phase, solution #1: blend the eight incoming (F, M, T) triples
# (dcells0..7[i]) into cells[i], one after another, then store the resulting
# colour in img[i].
#
# After every blend the flavour is wrapped into [0, 360) and the temperature is
# capped at Tk. The preamble is formatted with exactly two values (Mk, Tk),
# matching its two %s placeholders.
absorb_gpu = ElementwiseKernel("int3 *cells, int3 *dcells0, int3 *dcells1, int3 *dcells2, int3 *dcells3, int3 *dcells4, int3 *dcells5, int3 *dcells6, int3 *dcells7, int *img, int w, int h", """
    int3 cell = cells[i];
    cell = absorb_one(cell, dcells0[i]);
    cell = absorb_one(cell, dcells1[i]);
    cell = absorb_one(cell, dcells2[i]);
    cell = absorb_one(cell, dcells3[i]);
    cell = absorb_one(cell, dcells4[i]);
    cell = absorb_one(cell, dcells5[i]);
    cell = absorb_one(cell, dcells6[i]);
    cell = absorb_one(cell, dcells7[i]);
    cells[i] = cell;
    img[i] = hsv2rgb(cell);
""", "ca_calc", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s

// Component-wise sum of two int3 values.
__device__ int3 operator+(const int3 &a, const int3 &b) {
    return make_int3(a.x+b.x, a.y+b.y, a.z+b.z);
}

// Delta (dF, dM, dT) that one incoming triple (Fi, Mi, Ti) applies to a cell
// currently at (Fc, Mc, Tc).
__device__ int3 calc_d(int Fc, int Mc, int Tc, int Fi, int Mi, int Ti) {
    // Flavour is an angle in degrees: shift Fc by a full turn so that the
    // difference Fi - Fc is taken the short way round.
    int dF = Fi - Fc;
    if (dF > 180) Fc += 360;
    if (dF < -180) Fc -= 360;
    // Incoming mass as a fraction of the combined mass (0 when both are empty).
    float sM = Mi + Mc;
    if (sM != 0) sM = Mi / sM; else sM = 0;
    dF = (int) (Fi - Fc) * sM;
    int dM = Mi;
    int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
    return make_int3(dF, dM, dT);
}

// Blend one incoming triple into the cell, then wrap the flavour into
// [0, 360) and cap the temperature at Tk.
__device__ int3 absorb_one(int3 cell, int3 dcell) {
    cell = cell + calc_d(cell.x, cell.y, cell.z, dcell.x, dcell.y, dcell.z);
    cell.x = cell.x % 360;
    if (cell.x < 0) cell.x += 360;
    if (cell.z > Tk) cell.z = Tk;
    return cell;
}

__device__ uint hsv2rgb(int3 pixel) {
    // skipped for brevity
}
""" % (Mk, Tk))
The second and current solution:
Mk = 64
Tk = 1000
CELL_LEN = 120 # number of parameters per cell

# Emit phase, solution #2: every cell owns CN (= CELL_LEN) consecutive ints in
# a single `cells` array.
#
# Offsets used by this kernel inside one cell record:
#   0..2          current (F, M, T)
#   6 + 3*k + 0..2   incoming (Fi, Mi, Ti) from neighbour k, k = 0..7
#
# For every cell i the kernel subtracts the emitted mass / temperature from the
# cell's own record and writes an (F, M, T) triple into the matching incoming
# slot of each of its eight neighbours, wrapping around the field edges.
# Cells are addressed as x * h + y, i.e. y is the fastest-varying coordinate.
emit_gpu = ElementwiseKernel("int *cells, int w, int h", """
    int x = i / h;
    int y = i % h;
    int ii = i * CN;
    int Fc = cells[ii];
    int Mc = cells[ii+1];
    int Tc = cells[ii+2];
    float M = (float) Mc;
    float T = (float) Tc;

    // Mass leaving the cell this step; the emitted share is capped at 100%.
    int Mi = (int) (fmin(1, T / Tk) * M);
    cells[ii+1] = Mc - Mi;
    cells[ii+2] = Tc - (int) (T * fmin(1, T / Tk));

    // Split Mi evenly over 8 neighbours; the remainder is handed out one unit
    // at a time to the first Mpart neighbours visited.
    int Mbase = Mi / 8;
    int Mpart = Mi % 8;
    int Madd;
    int iii, xo, yo;

    // Walk the 3x3 neighbourhood starting at a slot that depends on the
    // flavour; slot 4 is the cell itself and is skipped.
    for (int cc = 0; cc < 9; cc++) {
        int c = (cc + Fc) % 9;
        if (c == 4) continue;

        // Neighbour coordinates with wrap-around on both axes.
        xo = x + c % 3 - 1;
        if (xo < 0) xo += w; else if (xo >= w) xo -= w;
        yo = y + c / 3 - 1;
        // was `xo >= w`: y never wrapped at the bottom edge
        if (yo < 0) yo += h; else if (yo >= h) yo -= h;

        if (Mpart > 0) { Madd = 1; Mpart--; } else Madd = 0;

        // Map neighbourhood slots 0..3, 5..8 onto incoming slots 0..7.
        if (c > 4) c--;
        iii = (xo * h + yo) * CN + 6 + c*3;
        cells[iii] = Fc;
        cells[iii+1] = Mbase + Madd;
        cells[iii+2] = Tc;
    }
""", "ca_emit", preamble="""
#define Tk %s
#define CN %s
""" % (Tk, CELL_LEN))
# Absorb phase, solution #2 (single packed `cells` array, CN ints per cell).
#
# For element i the kernel reads the cell's own (F, M, T) at offsets 0..2 of its
# record (record start = i * CN) and the eight incoming (Fi, Mi, Ti) triples at
# offsets 6 + 3*c, c = 0..7. The triples are blended one after another; after
# each blend the flavour is wrapped into [0, 360) and the temperature is capped
# at Tk. The result is written back to offsets 0..2 and a colour computed by
# hsv2rgb is stored in img[i].
#
# NOTE(review): the write to cells[ii+18] lands on offset 6 + 3*4, i.e. the Fi
# slot of incoming triple c == 4 -- confirm this offset is intended.
# NOTE(review): records of consecutive elements start CN ints apart, so
# neighbouring elements read addresses that are far from each other; this is
# presumably relevant to the reported slowdown -- confirm with a profiler.
absorb_gpu = ElementwiseKernel("int *cells, int *img, int w, int h", """
int ii = i * CN;
int Fc = cells[ii];
int Mc = cells[ii+1];
int Tc = cells[ii+2];
for (int c=0; c < 8; c++){
int iii = ii + c * 3 + 6;
int Fi = cells[iii];
int Mi = cells[iii+1];
int Ti = cells[iii+2];
int dF = Fi - Fc;
if (dF > 180) Fc += 360;
if (dF < -180) Fc -= 360;
float sM = Mi + Mc;
if (sM != 0) sM = Mi / sM; else sM = 0;
dF = (int) (Fi - Fc) * sM;
int dM = Mi;
int dT = fabs((float) (Fi - Fc)) * fmin((float) Mc, (float) Mi) / Mk + (Ti - Tc) * sM;
Fc += dF;
Mc += dM;
Tc += dT;
Fc = Fc % 360;
if (Fc < 0) Fc += 360;
if (Tc > Tk) Tc = Tk;
}
cells[ii] = Fc;
cells[ii+1] = Mc;
cells[ii+2] = Tc;
cells[ii+18] = (cells[ii+18] + 1) % 8;
img[i] = hsv2rgb(Fc, Tc, Mc);
""", "ca_absorb", preamble="""
#include <math.h>
#define Mk %s
#define Tk %s
#define CN %s
__device__ uint hsv2rgb(int hue, int sat, int val) {
// skipped for brevity
}
""" % (Mk, Tk, CELL_LEN))
Both variants produce exactly the same CA behaviour, but the latter runs much slower.
GTX Titan:
- Field size: 1900x1080 cells
- Solution #1: ~200 steps/s
- Solution #2: ~20 steps/s
GT 630M:
- Field size: 1600x900 cells
- Solution #1: ~7.8 steps/s
- Solution #2: ~1.5 steps/s
Please feel free to play with both solutions if you need to:
Any clues or advice are welcome:
- Why did the performance drop?
- Is it possible to raise the performance of solution #2 to at least the level of #1?
- Or would another solution be better?