
I was trying to solve an OpenMP tutorial exercise in which I was required to parallelize the following serial code:

/*
**  PROGRAM: A simple serial producer/consumer program
**
**  One function generates (i.e. produces) an array of random values.  
**  A second function consumes that array and sums it.
**
**  HISTORY: Written by Tim Mattson, April 2007.
*/
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        10000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
   int i; 
   for (i=0;i<length;i++) {
     randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
     *(a+i) = ((double) randy)/((double) RAND_MOD);
   }   
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
   int i;  double sum = 0.0;
   for (i=0;i<length;i++)  sum += *(a+i);  
   return sum; 
}
  
int main()
{
  double *A, sum, runtime;
  int flag = 0;

  A = (double *)malloc(N*sizeof(double));

  runtime = omp_get_wtime();

  fill_rand(N, A);        // Producer: fill an array of data

  sum = Sum_array(N, A);  // Consumer: sum the array
   
  runtime = omp_get_wtime() - runtime;

  printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

I came up with the following solution for this exercise. First I parallelized both the producer and the consumer using #pragma omp parallel for, and then I added a barrier and a flush between the calls to the two functions. Here is my code:

/*
 **  PROGRAM: A simple serial producer/consumer program
 **
 **  One function generates (i.e. produces) an array of random values.  
 **  A second function consumes that array and sums it.
 **
 **  HISTORY: Written by Tim Mattson, April 2007.
 */
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        1000000000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
        int i;
        #pragma omp parallel for schedule(static)
        for (i=0;i<length;i++) {
                randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
                *(a+i) = ((double) randy)/((double) RAND_MOD);
        }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
        int i;  double sum = 0.0;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i=0;i<length;i++) sum += *(a+i);  
        return sum; 
}

int main()
{
        double *A, sum, runtime;
        int flag = 0;
        int __flag;

        A = (double *)malloc(N*sizeof(double));

        runtime = omp_get_wtime();
        
        fill_rand(N, A);        // Producer: fill an array of data
        
        #pragma omp barrier
        #pragma omp flush
        
        sum = Sum_array(N, A);  // Consumer: sum the array
        
        runtime = omp_get_wtime() - runtime;
        printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

However, there seems to be a race condition in this code: when I run it multiple times, I get slightly different values. The solution to this problem, as per the tutorial, uses the producer-consumer pattern. It first runs the fill_rand function. Then, after that function finishes executing, the code sets a flag variable to instruct the consumer to start executing. Of course, it also adds flushes between the producer and consumer sections. To me this code looks similar to my solution: as far as I understand it, both pieces of code first run the producer, then flush the array to memory, and then run the consumer to get the result. My code, however, runs twice as fast, probably due to fewer cache flushes, but it seems to have a race condition somewhere. Here is the provided solution, which does not have any race conditions:

/*
 **  PROGRAM: A simple serial producer/consumer program
 **
 **  One function generates (i.e. produces) an array of random values.  
 **  A second function consumes that array and sums it.
 **
 **  HISTORY: Written by Tim Mattson, April 2007.
 */
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        1000000000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
        int i;
        #pragma omp parallel for schedule(static)
        for (i=0;i<length;i++) {
                randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
                *(a+i) = ((double) randy)/((double) RAND_MOD);
        }   
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
        int i;  double sum = 0.0;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i=0;i<length;i++) sum += *(a+i);  
        return sum; 
}

int main()
{
        double *A, sum, runtime;
        int flag = 0;
        int __flag;

        A = (double *)malloc(N*sizeof(double));

        runtime = omp_get_wtime();
        
        #pragma omp parallel sections
        {
                #pragma omp section
                {
                        fill_rand(N, A);        // Producer: fill an array of data
                        #pragma omp flush
                       
                        #pragma omp atomic write
                        flag = 1;
                        #pragma omp flush(flag)
                }
                
                #pragma omp section
                {
                        #pragma omp flush(flag)
                        while(1)
                        {
                                #pragma omp flush(flag)
                                #pragma omp atomic read
                                __flag = flag;
                                if(__flag == 1) break;
                        }
                        
                        #pragma omp flush
                        sum = Sum_array(N, A);  // Consumer: sum the array
                }

                runtime = omp_get_wtime() - runtime;
        }
        printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

I can't figure out what I am doing wrong. Can someone please help me out?

I also tried running the following code, which adds the barrier and the flush inside the parallel region, but even this version has a race condition somewhere.

/*
 **  PROGRAM: A simple serial producer/consumer program
 **
 **  One function generates (i.e. produces) an array of random values.  
 **  A second function consumes that array and sums it.
 **
 **  HISTORY: Written by Tim Mattson, April 2007.
 */
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        10000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
        int i;
        #pragma omp parallel
        {
                #pragma omp for schedule(static)
                for (i=0;i<length;i++) {
                        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
                        *(a+i) = ((double) randy)/((double) RAND_MOD);
                }
                #pragma omp barrier
                #pragma omp flush
        }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
        int i;  double sum = 0.0;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i=0;i<length;i++) sum += *(a+i);  
        return sum; 
}

int main()
{
        double *A, sum, runtime;
        int flag = 0;
        int __flag;

        A = (double *)malloc(N*sizeof(double));

        runtime = omp_get_wtime();
        
        fill_rand(N, A);        // Producer: fill an array of data
       
        sum = Sum_array(N, A);  // Consumer: sum the array
        
        runtime = omp_get_wtime() - runtime;
        printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

Update: As suggested by some comments, the problem was with the global variable randy, which OpenMP was treating as a shared variable by default. So I changed that variable to firstprivate, and now the run-to-run variation is no longer there. I suppose the randy variable was the source of the race condition. However, after making this change in both the provided solution and my own solution, I am getting different answers from the two programs. Note that there is no run-to-run variation, but the answers that I get from the two versions are different. Moreover, the answer that I get when I run my version changes depending on the number of threads that I use. Again, I am not sure why this is happening. Here is the solution provided by the tutorial, which gives the same answer regardless of the number of threads:

/*
 **  PROGRAM: A simple serial producer/consumer program
 **
 **  One function generates (i.e. produces) an array of random values.  
 **  A second function consumes that array and sums it.
 **
 **  HISTORY: Written by Tim Mattson, April 2007.
 */
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        10000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
        int i;
        #pragma omp parallel for schedule(static) firstprivate(randy)
        for (i=0;i<length;i++) {
                randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
                *(a+i) = ((double) randy)/((double) RAND_MOD);
        }   
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
        int i;  double sum = 0.0;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i=0;i<length;i++) sum += *(a+i);  
        return sum; 
}

int main()
{
        double *A, sum, runtime;
        int flag = 0;
        int __flag;

        A = (double *)malloc(N*sizeof(double));

        runtime = omp_get_wtime();
        
        #pragma omp parallel sections
        {
                #pragma omp section
                {
                        fill_rand(N, A);        // Producer: fill an array of data
                        #pragma omp flush
                       
                        #pragma omp atomic write
                        flag = 1;
                        #pragma omp flush(flag)
                }
                
                #pragma omp section
                {
                        #pragma omp flush(flag)
                        while(1)
                        {
                                #pragma omp flush(flag)
                                #pragma omp atomic read
                                __flag = flag;
                                if(__flag == 1) break;
                        }
                        
                        #pragma omp flush
                        sum = Sum_array(N, A);  // Consumer: sum the array
                }

                runtime = omp_get_wtime() - runtime;
        }
        printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

And here is my solution, which gives different answers depending on the number of threads:

/*
 **  PROGRAM: A simple serial producer/consumer program
 **
 **  One function generates (i.e. produces) an array of random values.  
 **  A second function consumes that array and sums it.
 **
 **  HISTORY: Written by Tim Mattson, April 2007.
 */
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        10000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
        int i;
        #pragma omp parallel
        {
                #pragma omp for schedule(static) firstprivate(randy)
                for (i=0;i<length;i++) {
                        randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
                        *(a+i) = ((double) randy)/((double) RAND_MOD);
                }
                #pragma omp barrier
                #pragma omp flush
        }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
        int i;  double sum = 0.0;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i=0;i<length;i++) sum += *(a+i);  
        return sum; 
}

int main()
{
        double *A, sum, runtime;
        int flag = 0;
        int __flag;

        A = (double *)malloc(N*sizeof(double));

        runtime = omp_get_wtime();
        
        fill_rand(N, A);        // Producer: fill an array of data
       
        sum = Sum_array(N, A);  // Consumer: sum the array
        
        runtime = omp_get_wtime() - runtime;
        printf(" In %f seconds, The sum is %f \n",runtime,sum);
}

Update #2: The problem was indeed with the variable randy. Refer to my answer below for more information. Thanks!

  • Never use malloc.h, always use stdlib.h. – Lundin Aug 23 '22 at 10:50
  • And then maybe check the result of `malloc()` to see if it truly lets you allocate 8 GB of RAM just like that. – Lundin Aug 23 '22 at 10:53
  • @Lundin I checked the result of malloc and it was indeed letting me allocate that much memory. But to be on the safer side, I reduced N to 10000, and I added a check to see if malloc fails. When I ran the code again, malloc didn't fail (as expected) but the race condition was still there. – Setu Aug 23 '22 at 11:44
  • *However there is a race condition in this code as when I run it multiple times, I get slightly different values.* Can you be clear about the differences? Why should we think they are anything more than the usual indeterminacy of parallel floating-point arithmetic? – High Performance Mark Aug 23 '22 at 11:59
  • When I run the provided solution I always get the value of sum as 5030.674031. However, when I run my code I get different values such as 4997.657813, 4982.215627, 4994.405360, 5005.432989, 5018.018078, etc. That's why I think there might be a race condition. – Setu Aug 23 '22 at 16:35
  • `randy` is a global variable that is mutated in threads, causing a race condition. The best thing is to avoid global variables like the plague in parallel applications (due to side effects, spooky action at a distance, and software engineering issues), especially ones that are written (it almost always results in bugs and headaches in parallel code). OpenMP can copy the variable and/or make it private, but there is a sequential dependency chain here, so this part of the code cannot actually be parallelized. – Jérôme Richard Aug 23 '22 at 18:50
  • Besides, `*(a+i)` is cryptic: why not use `a[i]`? Nested parallel sections should also be avoided as much as possible; they are generally not efficient. Tasks are the way to go. Using a spin wait is also a bad practice (it uses a core for nearly nothing, can slow down other threads, and wastes energy, which is becoming precious nowadays). Wait conditions are better suited for this use-case. Finally, `#pragma omp barrier` and `#pragma omp flush` are useless since they are put outside any parallel OpenMP region. – Jérôme Richard Aug 23 '22 at 18:56
  • @JérômeRichard The problem might be due to `randy` as you suggested, but then why does it not show up in the provided solution? Also, if I understood you correctly, the `#pragma omp barrier` and `#pragma omp flush` are essentially nops in my code since they are not in a parallel section. However, this would also mean that the threads executing in the `fill_rand` function would have already finished before the threads for `Sum_array` are called, creating an implicit barrier. In that case, even if `#pragma omp barrier` is a nop, it should be fine. But then what am I doing wrong? – Setu Aug 24 '22 at 16:02
  • @JérômeRichard I also tried adding the barrier and the flush in the parallel region to ensure that they are not being treated as nops (see the updated version of the post), but even that didn't help. – Setu Aug 24 '22 at 16:09
  • The compiler can assume there are no race conditions and make crazy optimizations like removing all the code, not reloading variables, producing bogus code, or making the program simply crash (because of the undefined behavior). Besides, race conditions are often non-deterministic and depend on many low-level behaviors. Having a program that apparently works does not mean the program works correctly (*absence of evidence is not evidence of absence*). That being said, I think the second version might run sequentially due to nesting (explaining why there is no visible race condition). – Jérôme Richard Aug 24 '22 at 16:42
  • The barrier/flush is useless since a parallel section already includes an implicit barrier/flush. In fact, a parallel for already has an implicit barrier at the end (hence the existence of the `nowait` clause; see the sketch after these comments). This is why adding such directives changes nothing. – Jérôme Richard Aug 24 '22 at 16:45
  • @JérômeRichard But if the barrier/flush is implicit, then I don't understand why I am running into race conditions, since the implicit barrier/flush in `fill_rand` would ensure that all the data is written back to memory before `Sum_array` is even called. Apart from the array A, I don't see any other variable which could cause this issue. And this array A would be written back to memory by the implicit barrier/flush anyway. Then why am I getting different answers when I run the code multiple times? – Setu Aug 24 '22 at 17:32
  • @JérômeRichard your suspicion of the global variable `randy` was correct. Declaring it threadprivate solved the issue. Thanks! – Setu Aug 24 '22 at 17:57
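
To make the point in the last few comments concrete, here is a minimal sketch of the implicit barrier at the end of a worksharing loop and of the nowait clause that removes it. This is not code from the tutorial; the function and array names are made up for illustration:

#include <omp.h>

void demo(int n, double *a, double *b)
{
        #pragma omp parallel
        {
                #pragma omp for schedule(static)
                for (int i = 0; i < n; i++)
                        a[i] = 2.0 * i;
                /* implicit barrier + flush here: no thread continues
                   until every element of a[] has been written */

                #pragma omp for schedule(static) nowait
                for (int i = 0; i < n; i++)
                        b[i] = a[i] + 1.0;      /* safe: a[] is already complete */
                /* "nowait" removes the implicit barrier at the end of
                   this second loop, so threads may finish at different times */
        }
}

This is why the explicit #pragma omp barrier and #pragma omp flush in the listings above change nothing: each parallel worksharing loop already ends with an equivalent barrier and flush.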

1 Answer


As suggested by the comments, the problem was with the variable randy. Declaring it as threadprivate solved the issue. Thanks for the help, everyone! For the sake of completeness, here is the working code:

/*
 **  PROGRAM: A simple serial producer/consumer program
 **
 **  One function generates (i.e. produces) an array of random values.  
 **  A second function consumes that array and sums it.
 **
 **  HISTORY: Written by Tim Mattson, April 2007.
 */
#include <omp.h>
#ifdef APPLE
#include <stdlib.h>
#else
#include <malloc.h>
#endif
#include <stdio.h>

#define N        10000

/* Some random number constants from Numerical Recipes */
#define SEED       2531
#define RAND_MULT  1366
#define RAND_ADD   150889
#define RAND_MOD   714025
int randy = SEED;
#pragma omp threadprivate(randy)

/* function to fill an array with random numbers */
void fill_rand(int length, double *a)
{
        int i;
        #pragma omp parallel for schedule(static)
        for (i=0;i<length;i++) {
                randy = (RAND_MULT * randy + RAND_ADD) % RAND_MOD;
                *(a+i) = ((double) randy)/((double) RAND_MOD);
        }
}

/* function to sum the elements of an array */
double Sum_array(int length, double *a)
{
        int i;  double sum = 0.0;
        #pragma omp parallel for reduction(+:sum) schedule(static)
        for (i=0;i<length;i++) sum += *(a+i);  
        return sum; 
}

int main()
{
        double *A, sum, runtime;
        int flag = 0;
        int __flag;

        A = (double *)malloc(N*sizeof(double));

        runtime = omp_get_wtime();
        
        fill_rand(N, A);        // Producer: fill an array of data
       
        sum = Sum_array(N, A);  // Consumer: sum the array
        
        runtime = omp_get_wtime() - runtime;
        printf(" In %f seconds, The sum is %f \n",runtime,sum);
}
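
Note: all of these programs need to be built with OpenMP support enabled (with GCC, for example, `gcc -fopenmp prod_cons.c -o prod_cons`, where the file name is just a placeholder), and the number of threads can be controlled through the `OMP_NUM_THREADS` environment variable.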
  • Note that you could also use `#pragma omp parallel for schedule(static) firstprivate(randy)`. Also note that results are different from the sequential version since each thread has its own variable. Furthermore, all threads generate the same sequence of pseudo-random values due to the same seed. Initialising them with a different seed (e.g. something like `SEED+get_thread_num()`) should fix the problem. – Jérôme Richard Aug 24 '22 at 19:20
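
Following the comment above, here is a minimal sketch of the distinct-seed idea. It assumes the same SEED/RAND_* constants as the listings above; the helper name fill_rand_seeded and the exact seeding formula are made up for illustration, and `omp_get_thread_num()` is the standard OpenMP call the comment presumably refers to:

/* Sketch: each thread keeps its own generator state, seeded differently,
   so the threads no longer all emit the same pseudo-random sequence.
   As noted in the comment, the result still differs from the serial
   program and depends on the thread count, since the per-thread
   sequences themselves are different. */
void fill_rand_seeded(int length, double *a)
{
        #pragma omp parallel
        {
                int my_randy = SEED + omp_get_thread_num();   /* per-thread seed */

                #pragma omp for schedule(static)
                for (int i = 0; i < length; i++) {
                        my_randy = (RAND_MULT * my_randy + RAND_ADD) % RAND_MOD;
                        a[i] = ((double) my_randy) / ((double) RAND_MOD);
                }
        }
}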