Is it possible to parallelize this for loop?

Question

Is it possible to parallelize this for loop?

I was given some code to parallelize using OpenMP and, among the various function calls, I noticed that this `for` loop accounts for a large share of the computation time.

  double U[n][n];
  double L[n][n];
  double Aprime[n][n];
  for(i=0; i<n; i++) {
    for(j=0; j<n; j++) {
      if (j <= i) {
          double s;
          s=0;
          for(k=0; k<j; k++) {
            s += L[j][k] * U[k][i];
          } 
          U[j][i] = Aprime[j][i] - s;
      } else if (j >= i) {
          double s;
          s=0;
          for(k=0; k<i; k++) {
            s += L[j][k] * U[k][i];
          }
          L[j][i] = (Aprime[j][i] - s) / U[i][i];
      }
    }

However, after trying to parallelize it and applying some semaphores here and there (with no luck), I came to realize that the `else if` branch has a strong dependency on the earlier `if` branch (`L[j][i]` is computed by dividing by `U[i][i]`, which may be set in the earlier `if`), which makes it, in my opinion, non-parallelizable due to race conditions.

Is it possible to parallelize this code in such a way that the `else if` branch is executed only after the earlier `if` branch has already completed?

+4

c++ c parallel-processing openmp

user2018675 · Sep 9 '16 at 3:08

1

Craig Estey · Accepted Answer · 2016-09-09T05:12:05+0000

, .

, if .

, , . .

See update #3: fix5, from update #2, runs about 3.9x faster than the original.

, .

omp. , , (, omp atomic update ..)

, :

double U[n][n];
double L[n][n];
double Aprime[n][n];

for (i = 0; i < n; i++) {
    for (j = 0; j < n; j++) {
        if (j <= i) {
            double s;

            s = 0;
            for (k = 0; k < j; k++) {
                s += L[j][k] * U[k][i];
            }
            U[j][i] = Aprime[j][i] - s;
        }
        else if (j >= i) {
            double s;

            s = 0;
            for (k = 0; k < i; k++) {
                s += L[j][k] * U[k][i];
            }
            L[j][i] = (Aprime[j][i] - s) / U[i][i];
        }
    }
}

The `else if (j >= i)` can be a plain `else`. The j loop can then be split into two loops, which eliminates the if/else:

// fix2.c -- split up j loop to eliminate if/else inside

double U[n][n];
double L[n][n];
double Aprime[n][n];

for (i = 0; i < n; i++) {
    for (j = 0; j <= i; j++) {
        double s = 0;
        for (k = 0; k < j; k++)
            s += L[j][k] * U[k][i];
        U[j][i] = Aprime[j][i] - s;
    }

    for (; j < n; j++) {
        double s = 0;
        for (k = 0; k < i; k++)
            s += L[j][k] * U[k][i];
        L[j][i] = (Aprime[j][i] - s) / U[i][i];
    }
}

`U[i][i]` is invariant within the second j loop, so its value can be saved off beforehand:

// fix3.c -- save off value of U[i][i]

double U[n][n];
double L[n][n];
double Aprime[n][n];

for (i = 0; i < n; i++) {
    for (j = 0; j <= i; j++) {
        double s = 0;
        for (k = 0; k < j; k++)
            s += L[j][k] * U[k][i];
        U[j][i] = Aprime[j][i] - s;
    }

    double Uii = U[i][i];

    for (; j < n; j++) {
        double s = 0;
        for (k = 0; k < i; k++)
            s += L[j][k] * U[k][i];
        L[j][i] = (Aprime[j][i] - s) / Uii;
    }
}

The matrix coordinates can also be transposed, with the aim of better memory/cache performance:

// fix4.c -- transpose matrix coordinates to get _much_ better memory/cache
// performance

double U[n][n];
double L[n][n];
double Aprime[n][n];

for (i = 0; i < n; i++) {
    for (j = 0; j <= i; j++) {
        double s = 0;
        for (k = 0; k < j; k++)
            s += L[k][j] * U[i][k];
        U[i][j] = Aprime[i][j] - s;
    }

    double Uii = U[i][i];

    for (; j < n; j++) {
        double s = 0;
        for (k = 0; k < i; k++)
            s += L[k][j] * U[i][k];
        L[i][j] = (Aprime[i][j] - s) / Uii;
    }
}

UPDATE:

k- k<j k<i ?

, . fix1.c, fix2-fix4, .

UPDATE #2:

.

, [ static], , , , , (, 8 MB)

VLA [ n ], . , , .

, , , , () #pragma omp shared(Aprime) shared(U) shared(L).

- s. fix4 U, L .

,

, , , .

, L , . . , .

, . , . , , .

// fix5.c -- further transpose to fix poor performance on s calc loops
//
// flip the U dimensions back to original

double U[n][n];
double L[n][n];
double Aprime[n][n];

double *Up;
double *Lp;
double *Ap;

for (i = 0; i < n; i++) {
    Ap = Aprime[i];
    Up = U[i];

    for (j = 0; j <= i; j++) {
        double s = 0;
        Lp = L[j];
        for (k = 0; k < j; k++)
            s += Lp[k] * Up[k];
        Up[j] = Ap[j] - s;
    }

    double Uii = Up[i];

    for (; j < n; j++) {
        double s = 0;
        Lp = L[j];
        for (k = 0; k < i; k++)
            s += Lp[k] * Up[k];
        Lp[i] = (Ap[j] - s) / Uii;
    }
}

, , . , , , , .

UPDATE #3:

Benchmark results with n = 1037 (elapsed time, then speedup relative to orig):

orig: 1.780916929 1.000x
fix1: 3.730602026 0.477x
fix2: 1.743769884 1.021x
fix3: 1.765769482 1.009x
fix4: 1.762100697 1.011x
fix5: 0.452481270 3.936x

.

, , . , ...

Is it possible to parallelize this for loop?

More articles: