I have this fragment of code that calculates the force experienced by each unique pair of particles due to a pair potential. I am trying to parallelize the nested for loops, but the parallel output differs from the correct serial output. The code is:
for (ic = 0; ic < (N_TOTAL-1); ic++)
{
    for (jc = ic+1; jc < N_TOTAL; jc++)
    {
        rc_x = X[ic] - X[jc];
        rc_z = Z[ic] - Z[jc];
        /* Minimum image convention */
        if (fabs(rc_x) > LxScaled/2.0)
        {
            rc_x = rc_x - SignR (LxScaled, rc_x);
        }
        if (fabs(rc_z) > LzScaled/2.0)
        {
            rc_z = rc_z - SignR (LzScaled, rc_z);
        }
        rij = sqrt(rc_x * rc_x + rc_z * rc_z);
        rr = rr + (rc_x * rc_x + rc_z * rc_z);
        /* Compute Force : Yukawa potential */
        second = exp (-rij * Xi) * ((1.0*(1.0))/rij);
        third = (Xi + 1.0/rij);
        a = second * third;
        a_x[ic] = a_x[ic] + (rc_x/rij)*a;
        a_x[jc] = a_x[jc] - (rc_x/rij)*a;
        a_z[ic] = a_z[ic] + (rc_z/rij)*a;
        a_z[jc] = a_z[jc] - (rc_z/rij)*a;
        uVal = second;
        uSum = uSum + (second);
    }
}
Here X[i] and Z[i] are the positions, and a_x[i] and a_z[i] are the x and z components of the force experienced by the i-th particle. I have OpenMP version 4.0, so I cannot use the reduction clause for the 1D arrays a_x[] and a_z[]. I therefore intend to use the #pragma omp atomic clause to avoid any race conditions, and I parallelize the loops as:
#pragma omp parallel for schedule(guided, 8) reduction(+:rr, uSum) default(shared) private(ic, jc, rc_x, rc_z, rij, uVal, second, third, a)
for (ic = 0; ic < (N_TOTAL-1); ic++)
{
    for (jc = ic+1; jc < N_TOTAL; jc++)
    {
        rc_x = X[ic] - X[jc];
        rc_z = Z[ic] - Z[jc];
        /* Minimum image convention */
        if (fabs(rc_x) > LxScaled/2.0)
        {
            #pragma omp atomic
            rc_x = rc_x - SignR (LxScaled, rc_x);
        }
        if (fabs(rc_z) > LzScaled/2.0)
        {
            #pragma omp atomic
            rc_z = rc_z - SignR (LzScaled, rc_z);
        }
        rij = sqrt(rc_x * rc_x + rc_z * rc_z);
        rr = rr + (rc_x * rc_x + rc_z * rc_z);
        /* Compute Force : Yukawa potential */
        second = exp (-rij * Xi) * ((1.0*(1.0))/rij);
        third = (Xi + 1.0/rij);
        a = second * third;
        #pragma omp atomic
        a_x[ic] = a_x[ic] + (rc_x/rij)*a;
        #pragma omp atomic
        a_x[jc] = a_x[jc] - (rc_x/rij)*a;
        #pragma omp atomic
        a_z[ic] = a_z[ic] + (rc_z/rij)*a;
        #pragma omp atomic
        a_z[jc] = a_z[jc] - (rc_z/rij)*a;
        uVal = second;
        uSum = uSum + (second);
    }
}
However, I am not getting the output that is expected when the fragment of code is run in serial. There are some deviations when the same forces are accumulated over a longer period of time.
Where is the mistake that I am making? I tried to use atomic updates of the force to eliminate any race condition that might occur. I am new to parallelization and this is the first project I am trying to parallelize. Can anyone spot the mistake and point me in the right direction?
Edit: I also tried to reduce the two nested for loops into one, however I am unable to reproduce the exact output of the serial version.
CodePudding user response:
Since I cannot comment at the moment, I answer here the additional question from your comment, "any suggestions in improving the speed of the code?"
Move all the atomic operations out of the inner loop. You can do that by declaring local pointers a_x_local and a_z_local and allocating them (with the same size as a_x and a_z) at the beginning of the outer loop. Since they are private, you no longer need atomic operations to update them in the inner loop. Then, after the inner loop, you update all the elements of a_x and a_z at once (with a critical section or atomic operations).
#pragma omp parallel for schedule(guided, 8) reduction(+:rr, uSum) default(shared) private(ic, jc, rc_x, rc_z, rij, uVal, second, third, a)
for (ic = 0; ic < (N_TOTAL-1); ic++)
{
    double *a_x_local, *a_z_local;
    a_x_local = (double *)malloc(N_TOTAL * sizeof(double));
    a_z_local = (double *)malloc(N_TOTAL * sizeof(double));
    for (jc = 0; jc < N_TOTAL; jc++) {
        a_x_local[jc] = 0.0;
        a_z_local[jc] = 0.0;
    }
    for (jc = ic+1; jc < N_TOTAL; jc++)
    {
        rc_x = X[ic] - X[jc];
        rc_z = Z[ic] - Z[jc];
        /* Minimum image convention */
        if (fabs(rc_x) > LxScaled/2.0)
            rc_x = rc_x - SignR (LxScaled, rc_x);
        if (fabs(rc_z) > LzScaled/2.0)
            rc_z = rc_z - SignR (LzScaled, rc_z);
        rij = sqrt(rc_x * rc_x + rc_z * rc_z);
        rr = rr + (rc_x * rc_x + rc_z * rc_z);
        /* Compute Force : Yukawa potential */
        second = exp (-rij * Xi) * ((1.0*(1.0))/rij);
        third = (Xi + 1.0/rij);
        a = second * third;
        a_x_local[ic] += (rc_x/rij)*a;
        a_x_local[jc] -= (rc_x/rij)*a;
        a_z_local[ic] += (rc_z/rij)*a;
        a_z_local[jc] -= (rc_z/rij)*a;
        uVal = second;
        uSum = uSum + (second);
    }
    #pragma omp critical
    for (jc = 0; jc < N_TOTAL; jc++) {
        a_x[jc] += a_x_local[jc];
        a_z[jc] += a_z_local[jc];
    }
    free(a_x_local);
    free(a_z_local);
}
And as pointed out in the comments, you don't need the atomic pragma to update rc_x and rc_z.
Also, you may omit the schedule clause at first, and test various scheduling options later on.
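A minimal sketch of that idea, reusing the loop and clause names from the code above: request schedule(runtime) and pick the actual schedule through the OMP_SCHEDULE environment variable, so different options can be compared without recompiling.
#pragma omp parallel for schedule(runtime) reduction(+:rr, uSum) default(shared) private(ic, jc, rc_x, rc_z, rij, uVal, second, third, a)
for (ic = 0; ic < (N_TOTAL-1); ic++)
{
    /* ... same pair loop body as above ... */
}
/* Then run, for example:  OMP_SCHEDULE="guided,8" ./your_program   or   OMP_SCHEDULE="dynamic,64" ./your_program */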
CodePudding user response:
As already explained in the comments, if you do floating-point operations in a different order you may get a different result. This is due to rounding errors; for more details please read the following document: What Every Computer Scientist Should Know About Floating-Point Arithmetic.
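A minimal standalone illustration of this (not part of your code): the same additions performed in a different order give a different double result, which is exactly what happens when a parallel reduction changes the summation order.
#include <stdio.h>

int main(void)
{
    /* Floating-point addition is not associative: reordering a sum changes
       how the intermediate results are rounded. */
    double big = 1.0e16, small = 1.0;
    double left  = (big + small) + small;  /* each small addend is rounded away    */
    double right = big + (small + small);  /* the small addends are combined first */
    printf("%.1f\n%.1f\n", left, right);   /* prints two slightly different values */
    return 0;
}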
- There is no race condition in your code, but there is a problem with your global variable uVal. In the parallel region its sharing attribute is private, so if you change it inside the parallel region, it does not change the global value. In your serial code it is related to the last calculated particle pair, which makes no sense to me. You mentioned that you use it in another part of the code; in my opinion it can easily be calculated when needed, so there is no need to update it in the parallel region.
- Variables rc_x and rc_z are private, so there is no need for atomic operations when you change them.
- I suggest using your variables in their minimum required scope. It may help the compiler to make better optimizations.
- Multiplications are generally faster than divisions, so you should change your code accordingly, e.g. instead of LxScaled/2.0 use LxScaled*0.5.
- You mentioned that you use OpenMP 4.0, which does not have array reduction. That is not a problem, you can easily do it manually.
Putting it all together, your code should look something like this:
#pragma omp parallel default(none) shared(X, Z, a_x, a_z, uSum, rr, Xi)
{
    double a_x_local[N_TOTAL];
    double a_z_local[N_TOTAL];
    for (int jc = 0; jc < N_TOTAL; jc++) {
        a_x_local[jc] = 0.0;
        a_z_local[jc] = 0.0;
    }
    #pragma omp for schedule(guided, 8) reduction(+:rr, uSum)
    for (int ic = 0; ic < (N_TOTAL-1); ic++)
    {
        for (int jc = ic+1; jc < N_TOTAL; jc++)
        {
            double rc_x = X[ic] - X[jc];
            double rc_z = Z[ic] - Z[jc];
            /* Minimum image convention */
            if (fabs(rc_x) > 0.5*LxScaled)
                rc_x -= SignR (LxScaled, rc_x);
            if (fabs(rc_z) > 0.5*LzScaled)
                rc_z -= SignR (LzScaled, rc_z);
            const double d_rr = rc_x * rc_x + rc_z * rc_z;
            rr += d_rr;
            const double rij = sqrt(d_rr);
            const double rij_1 = 1.0/rij;
            /* Compute Force : Yukawa potential */
            const double second = exp (-rij * Xi) * rij_1;
            const double second_x_third = second * (Xi + rij_1);
            const double d_rc_x = rc_x * rij_1 * second_x_third;
            const double d_rc_z = rc_z * rij_1 * second_x_third;
            a_x_local[ic] += d_rc_x;
            a_x_local[jc] -= d_rc_x;
            a_z_local[ic] += d_rc_z;
            a_z_local[jc] -= d_rc_z;
            //uVal = second; Should be calculated when needed
            uSum += second;
        }
    }
    for (int jc = 0; jc < N_TOTAL; jc++) {
        #pragma omp atomic
        a_x[jc] += a_x_local[jc];
        #pragma omp atomic
        a_z[jc] += a_z_local[jc];
    }
}
- On my laptop this code runs faster than yours or @PierU's parallel code (of course, I just used arbitrary data, and I have no idea what your SignR function does; the code is here):
Measured runtimes (N_TOTAL=10000):
Your serial code=0.520492 s
Your parallel code=0.169654 s
PierU's parallel code=0.140338 s
My parallel code=0.092958 s
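For reference, a minimal sketch of how such timings can be reproduced with omp_get_wtime(); compute_forces() below is only a hypothetical stand-in for the loop nest above, not the actual benchmark harness used here.
#include <omp.h>
#include <stdio.h>

/* Hypothetical placeholder for one of the force-computation variants above,
   only here to make the timing skeleton self-contained. */
static void compute_forces(void)
{
    volatile double s = 0.0;
    for (int i = 0; i < 100000000; i++) s += 1.0e-8;
}

int main(void)
{
    double t0 = omp_get_wtime();          /* wall-clock time before the kernel */
    compute_forces();
    double t1 = omp_get_wtime();          /* wall-clock time after the kernel  */
    printf("elapsed = %f s\n", t1 - t0);
    return 0;
}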