0

I'm trying to write a multithreaded Nagel–Schreckenberg model simulation in c language and have some problems when a thread accesses the data which wasn't calculated yet.

Here is a working code which only parallelizes velocity calculation per line:

#define L 3000         // number of cells in row
#define num_iters 3000 // number of iterations
#define density 0.48  // how many positives
#define vmax 2
#define p 0.2

    for (int i = 0; i < num_iters - 1; i++)
    {
            int temp[L] = {0};

            #pragma omp parallel for
            for (int x = 0; x < L; x++)
            {
                if (iterations[i][x] > -1)
                {
                    int vi = iterations[i][x]; // velocity of previews iteration
                    int d = 1;                 // index of the next vehicle

                    while (iterations[i][(x + d) % L] < 0)
                        d++;

                    int vtemp = min(min(vi + 1, d - 1), vmax);    // increase speed, but avoid hitting the next car
                    int v = r2() < p ? max(vtemp - 1, 0) : vtemp; // stop the vehicle with probability p
                    temp[x] = v;
                }
            }
            
            for (int x = 0; x < L; x++) // write the velocities to the next line
            {
                if (iterations[i][x] > -1)
                {
                    int v = temp[x];
                    iterations[i + 1][(x + v) % L] = v;
                }
            }
    }

working code

This works fine, but it's not fast enough. I'm trying to use convolution to increase the performance, but it can't read neighbor thread's data half of the time because it wasn't calculated yet. Here is the code I used:

#include <omp.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <math.h>
#include <string.h>
#include <sys/time.h>

#define L 4000         // number of cells in row
#define num_iters 4000 // number of iterations
#define density 0.48  // how many positives
#define vmax 2
#define p 0.2
#define BLOCKS_Y 4
#define BLOCKS_X 4
#define BLOCKSIZEY (L / BLOCKS_Y)
#define BLOCKSIZEX (L / BLOCKS_X)

time_t t;

#ifndef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
#endif

#ifndef max
#define max(a, b) (((a) > (b)) ? (a) : (b))
#endif

void shuffle(int *array, size_t n)
{
  if (n > 1)
  {
    size_t i;
    for (i = 0; i < n - 1; i++)
    {
      size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
      int t = array[j];
      array[j] = array[i];
      array[i] = t;
    }
  }
}

double r2()
{
  return (double)rand() / (double)RAND_MAX;
}

void writeImage(int *iterations[], char filename[])
{
    int h = L;
    int w = num_iters;
    FILE *f;
    unsigned char *img = NULL;
    int filesize = 54 + 3 * w * h;

    img = (unsigned char *)malloc(3 * w * h);
    memset(img, 0, 3 * w * h);

    for (int i = 0; i < w; i++)
    {
        for (int j = 0; j < h; j++)
        {
            int x = i;
            int y = (h - 1) - j;
            int color = iterations[i][j] == 0 ? 0 : 255;
            img[(x + y * w) * 3 + 2] = (unsigned char)(color);
            img[(x + y * w) * 3 + 1] = (unsigned char)(color);
            img[(x + y * w) * 3 + 0] = (unsigned char)(color);
        }
    }

    unsigned char bmpfileheader[14] = {'B', 'M', 0, 0, 0, 0, 0, 0, 0, 0, 54, 0, 0, 0};
    unsigned char bmpinfoheader[40] = {40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 24, 0};
    unsigned char bmppad[3] = {0, 0, 0};

    bmpfileheader[2] = (unsigned char)(filesize);
    bmpfileheader[3] = (unsigned char)(filesize >> 8);
    bmpfileheader[4] = (unsigned char)(filesize >> 16);
    bmpfileheader[5] = (unsigned char)(filesize >> 24);

    bmpinfoheader[4] = (unsigned char)(w);
    bmpinfoheader[5] = (unsigned char)(w >> 8);
    bmpinfoheader[6] = (unsigned char)(w >> 16);
    bmpinfoheader[7] = (unsigned char)(w >> 24);
    bmpinfoheader[8] = (unsigned char)(h);
    bmpinfoheader[9] = (unsigned char)(h >> 8);
    bmpinfoheader[10] = (unsigned char)(h >> 16);
    bmpinfoheader[11] = (unsigned char)(h >> 24);

    f = fopen(filename, "wb");
    fwrite(bmpfileheader, 1, 14, f);
    fwrite(bmpinfoheader, 1, 40, f);
    for (int i = 0; i < h; i++)
    {
        fwrite(img + (w * (h - i - 1) * 3), 3, w, f);
        fwrite(bmppad, 1, (4 - (w * 3) % 4) % 4, f);
    }

    free(img);
    fclose(f);
}

void simulation()
{
    printf("L=%d, num_iters=%d\n", L, num_iters);
    int z = 0;
    z++;
    int current_index = 0;
    int success_moves = 0;

    const int cars_num = (int)(density * L);

    int **iterations = (int **)malloc(num_iters * sizeof(int *));
    for (int i = 0; i < num_iters; i++)
        iterations[i] = (int *)malloc(L * sizeof(int));

    for (int i = 0; i < L; i++)
    {
        iterations[0][i] = i <= cars_num ? 0 : -1;
    }
    shuffle(iterations[0], L);

    for (int i = 0; i < num_iters - 1; i++)
        for (int x = 0; x < L; x++)
            iterations[i + 1][x] = -1;

    double *randoms = (double *)malloc(L * num_iters * sizeof(double));
    for (int i = 0; i < L * num_iters; i++) {
        randoms[i] = r2();
    }

    #pragma omp parallel for collapse(2)
    for (int blocky = 0; blocky < BLOCKS_Y; blocky++)
    {
        for (int blockx = 0; blockx < BLOCKS_X; blockx++)
        {
            int ystart = blocky * BLOCKSIZEY;
            int yend = ystart + BLOCKSIZEY;
            int xstart = blockx * BLOCKSIZEX;
            int xend = xstart + BLOCKSIZEX;

            for (int y = ystart; y < yend; y++)
            {
                for (int x = xstart; x < xend; x++)
                {
                    if (iterations[y][x] > -1)
                    {
                        int vi = iterations[y][x];
                        int d = 1;

                        int start = (x + d) % L;
                        int i;
                        for (i = start; i < L && iterations[y][i] < 0; ++i);
                        d += i - start;
                        if (i == L)
                        {
                            for (i = 0; i < start && iterations[y][i] < 0; ++i);
                            d += i;
                        }

                        int vtemp = min(min(vi + 1, d - 1), vmax);
                        int v = randoms[x * y] < p ? max(vtemp - 1, 0) : vtemp;
                        iterations[y + 1][(x + v) % L] = v;
                    }
                }
            }
        }
    }

    if (L <= 4000)
        writeImage(iterations, "img.bmp");
    free(iterations);
}

void main() {
    srand((unsigned)time(&t));
    simulation();
}

the problem

As you can see, as the second block gets calculated the first one didn't probably calculate yet which produces that empty space.

I think it's possible to solve this with the convolution, but I'm just doing something wrong and I'm not sure what. If you could give any advice on how to fix this problem, I would really appreciate it.

ivan0biwan
  • 189
  • 2
  • 8
  • How big is `num_iters` and `L`? – Laci Dec 28 '21 at 22:01
  • they both are 3000 on the screenshot, but eventually will be 20000+ – ivan0biwan Dec 28 '21 at 22:10
  • Regarding the first code: It is memory bound (few calculations and lots of memory read/write), so it is not surprising if it not scales well with the number of cores used. Do you need to store the result of all previous iterations? Regarding the second code: I do not know anything about Nagel–Schreckenberg model, but if the bottleneck is memory read/write It may not be significantly faster. On the other hand I cannot see the connection between the first and second code. – Laci Dec 28 '21 at 22:33
  • 1. How many cores do you have and what speedup do you get? 2. Do you set `OMP_PROC_BIND`? 3. How is your first code correct? You parallelize over `x` but each index depends on others. – Victor Eijkhout Dec 28 '21 at 23:55
  • @VictorEijkhout The first code is OK, only `temp[x]` is written inside the parallel region (`iterations` is read only). – Laci Dec 29 '21 at 04:40
  • @Laci Got it. But then I don't see the problem with the second code. Apart from the fact that multiple locations in `iterations[i+1]` can be assigned (and some none at all?) there are also no dependencies. – Victor Eijkhout Dec 29 '21 at 05:52
  • 1
    Can you explicitly put the "same code as above" just to be sure there is no basic mistakes? It would help to have a minimal *reproducible* example. – Jérôme Richard Dec 29 '21 at 11:51
  • @VictorEijkhout I have no problem with the second code, I simply do not know what "same code as above" exactly means (e.g. how loop variables `x` and `y` are used). – Laci Dec 29 '21 at 12:58
  • I updated the code a little bit, adding the constants and replacing "the same code as above" with the actual code. Nagel–Schreckenberg model is a traffic simulation. On the x axis is the road with vehicles and on the y axis is how the vehicles move over the time. I think it needs to read the whole row to determine where is the next vehicle (the while loop). ```y``` in the second example is the same as ```i``` from the first – ivan0biwan Dec 29 '21 at 13:14
  • Does it work properly if OpenMP disabled? – Laci Dec 29 '21 at 13:25
  • the first one does, the second one looks the same – ivan0biwan Dec 29 '21 at 15:33

1 Answers1

0

There is a race condition in the second code because iterations can be read by a thread and written by another. More specifically, iterations[y + 1][(x + v) % L] = v set a value that another thread should read when checking iterations[y][x] or iterations[y][(x + d) % L] when two threads are working on consecutive y values (of two consecutive blocky values).

Moreover, the r2 function have to be thread-safe. It appears to be a random number generator (RNG), but such random function is generally implemented using global variables that are often not thread-safe. One simple and efficient solution is to use thread_local variables instead. An alternative solution is to explicitly pass in parameter a mutable state to the random function. The latter is a good practice when you design parallel applications since it makes visible the mutation of an internal state and it provides way to better control the determinism of the RNG.

Besides this, please note that modulus are generally expensive, especially if L is not a compile-time constant. You can remove some of them by pre-computing the remainder before a loop or splitting a loop so to perform checks only near the boundaries. Here is an (untested) example for the while:

int start = (x + d) % L;
int i;

for(i=start ; i < L && iterations[y][i] < 0 ; ++i);
d += i - start;

if(i == L) {
    for(i=0 ; i < start && iterations[y][i] < 0 ; ++i);
    d += i;
}

Finally, please note that the blocks should be divisible by 4. Otherwise, the current code is not valid (a min/max clamping is likely needed).

Jérôme Richard
  • 41,678
  • 6
  • 29
  • 59
  • Thanks a lot for a detailed answer, I updated the question with all the suggested changes, but it still doesn't function properly. I used pregenerated random just to be extra safe while testing. I'm aware of the race condition, but not sure how to get rid of it – ivan0biwan Jan 03 '22 at 13:42
  • The proposed change of the `while` section was to improve performance and not to solve the race condition. I think the main issue is still present in the line `iterations[y + 1][(x + v) % L] = ...`. You likely need *multiple arrays* or an additional *synchronization* (or even multiple steps) to prevent threads to access to the same array location at the same time. This highly depends of the underlying algorithm (which I do not fully understand yet). – Jérôme Richard Jan 03 '22 at 14:26
  • I updated the code, it should work now. The algorithm is a traffic simulation, it was good explained here https://de.zxc.wiki/wiki/Nagel-Schreckenberg-Modell I'm not sure multiple arrays would help because the array is already allocated, I think it's more about accessing order. I tried to apply the Convolution algorithm to it in the example, but I think I did it wrong – ivan0biwan Jan 03 '22 at 17:37