, .
, if .
, , . .
. №3 , fix5, # 2, 3.9x.
, .
omp. , , (, omp atomic update ..)
, :
double U[n][n];
double L[n][n];
double Aprime[n][n];
for (i = 0; i < n; i++) {
for (j = 0; j < n; j++) {
if (j <= i) {
double s;
s = 0;
for (k = 0; k < j; k++) {
s += L[j][k] * U[k][i];
}
U[j][i] = Aprime[j][i] - s;
}
else if (j >= i) {
double s;
s = 0;
for (k = 0; k < i; k++) {
s += L[j][k] * U[k][i];
}
L[j][i] = (Aprime[j][i] - s) / U[i][i];
}
}
}
else if (j >= i) else. j , if/else:
// fix2.c -- split up j loop to eliminate if/else inside
double U[n][n];
double L[n][n];
double Aprime[n][n];
for (i = 0; i < n; i++) {
for (j = 0; j <= i; j++) {
double s = 0;
for (k = 0; k < j; k++)
s += L[j][k] * U[k][i];
U[j][i] = Aprime[j][i] - s;
}
for (; j < n; j++) {
double s = 0;
for (k = 0; k < i; k++)
s += L[j][k] * U[k][i];
L[j][i] = (Aprime[j][i] - s) / U[i][i];
}
}
U[i][i] j, :
// fix3.c -- save off value of U[i][i]
double U[n][n];
double L[n][n];
double Aprime[n][n];
for (i = 0; i < n; i++) {
for (j = 0; j <= i; j++) {
double s = 0;
for (k = 0; k < j; k++)
s += L[j][k] * U[k][i];
U[j][i] = Aprime[j][i] - s;
}
double Uii = U[i][i];
for (; j < n; j++) {
double s = 0;
for (k = 0; k < i; k++)
s += L[j][k] * U[k][i];
L[j][i] = (Aprime[j][i] - s) / Uii;
}
}
, , . , , :
// fix4.c -- transpose matrix coordinates to get _much_ better memory/cache
// performance
double U[n][n];
double L[n][n];
double Aprime[n][n];
for (i = 0; i < n; i++) {
for (j = 0; j <= i; j++) {
double s = 0;
for (k = 0; k < j; k++)
s += L[k][j] * U[i][k];
U[i][j] = Aprime[i][j] - s;
}
double Uii = U[i][i];
for (; j < n; j++) {
double s = 0;
for (k = 0; k < i; k++)
s += L[k][j] * U[i][k];
L[i][j] = (Aprime[i][j] - s) / Uii;
}
}
UPDATE:
k- k<j k<i ?
, . fix1.c, fix2-fix4, .
# 2:
.
, [ static], , , , , (, 8 MB)
VLA [ n ], . , , .
, , , , () #pragma omp shared(Aprime) shared(U) shared(L).
- s. fix4 U, L .
,
, , , .
, L , . . , .
, . , . , , .
// fix5.c -- further transpose to fix poor performance on s calc loops
//
// flip the U dimensions back to original
double U[n][n];
double L[n][n];
double Aprime[n][n];
double *Up;
double *Lp;
double *Ap;
for (i = 0; i < n; i++) {
Ap = Aprime[i];
Up = U[i];
for (j = 0; j <= i; j++) {
double s = 0;
Lp = L[j];
for (k = 0; k < j; k++)
s += Lp[k] * Up[k];
Up[j] = Ap[j] - s;
}
double Uii = Up[i];
for (; j < n; j++) {
double s = 0;
Lp = L[j];
for (k = 0; k < i; k++)
s += Lp[k] * Up[k];
Lp[i] = (Ap[j] - s) / Uii;
}
}
, , . , , , , .
# 3:
. n, 1037:
orig: 1.780916929 1.000x
fix1: 3.730602026 0.477x
fix2: 1.743769884 1.021x
fix3: 1.765769482 1.009x
fix4: 1.762100697 1.011x
fix5: 0.452481270 3.936x
.
, , . , ...