I calculate the average and variance of an array using the built-in SSE functions. In principle, this is a summation of values ββand their squares, which can be illustrated in the following program:
int main( int argc, const char* argv[] ) { union u { __m128 m; float f[4]; } x; // Allocate memory and initialize data: [1,2,3,...stSize+1] const size_t stSize = 1024; float *pData = (float*) _aligned_malloc(stSize*sizeof(float), 32); for ( size_t s = 0; s < stSize; ++s ) { pData[s] = s+1; } // Sum and sum of squares { // Accumlation using SSE intrinsics __m128 mEX = _mm_set_ps1(0.f); __m128 mEXX = _mm_set_ps1(0.f); for ( size_t s = 0; s < stSize; s+=4 ) { __m128 m = _mm_load_ps(pData + s); mEX = _mm_add_ps(mEX, m); mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m)); } // Final reduction xm = mEX; double dEX = xf[0] + xf[1] + xf[2] + xf[3]; xm = mEXX; double dEXX = xf[0] + xf[1] + xf[2] + xf[3]; std::cout << "Sum expected: " << (stSize * stSize + stSize) / 2 << std::endl; std::cout << "EX: " << dEX << std::endl; std::cout << "Sum of squares expected: " << 1.0/6.0 * stSize * (stSize + 1) * (2 * stSize + 1) << std::endl; std::cout << "EXX: " << dEXX << std::endl; } // Clean up _aligned_free(pData); }
Now, when I compile and run the program in Debug mode, I get the following (and correct) output:
Sum expected: 524800 EX: 524800 Sum of squares expected: 3.58438e+008 EXX: 3.58438e+008
However, compiling and running the program in Release mode produces the following (and incorrect) results:
Sum expected: 524800 EX: 524800 Sum of squares expected: 3.58438e+008 EXX: 3.49272e+012
Changing the accumulation order, that is, EXX is updated to EX, the results are in order:
Sum expected: 524800 EX: 524800 Sum of squares expected: 3.58438e+008 EXX: 3.58438e+008
Looks like "counterproductive" compiler optimization or why does execution order matter? Is this a known bug?
EDIT: I just looked at the assembler output. This is what I get (only relevant parts). To build the release with the /arch:AVX compiler flag, we have:
; 69 : // Second test: sum and sum of squares ; 70 : { ; 71 : __m128 mEX = _mm_set_ps1(0.f); vmovaps xmm1, XMMWORD PTR __xmm@0 mov ecx, 256 ; 00000100H ; 72 : __m128 mEXX = _mm_set_ps1(0.f); vmovaps xmm2, xmm1 npad 12 $LL3@main : ; 73 : for ( size_t s = 0; s < stSize; s+=4 ) ; 74 : { ; 75 : __m128 m = _mm_load_ps(pData + s); vmovaps xmm0, xmm1 ; 76 : mEX = _mm_add_ps(mEX, m); vaddps xmm1, xmm1, XMMWORD PTR [rax] add rax, 16 ; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m)); vmulps xmm0, xmm0, xmm0 vaddps xmm2, xmm0, xmm2 dec rcx jne SHORT $LL3@main
This is clearly wrong, since this (1) stores the accumulated EX result ( xmm1 ) in xmm0 (2) accumulates EX with the current value ( XMMWORD PTR [rax] ) and (3) the square of the accumulated EX result accumulates in EXX ( xmm2 ), previously saved in xmm0 .
In contrast, the version without /arch:AVX looks fine and as expected:
; 69 : // Second test: sum and sum of squares ; 70 : { ; 71 : __m128 mEX = _mm_set_ps1(0.f); movaps xmm1, XMMWORD PTR __xmm@0 mov ecx, 256 ; 00000100H ; 72 : __m128 mEXX = _mm_set_ps1(0.f); movaps xmm2, xmm1 npad 10 $LL3@main : ; 73 : for ( size_t s = 0; s < stSize; s+=4 ) ; 74 : { ; 75 : __m128 m = _mm_load_ps(pData + s); movaps xmm0, XMMWORD PTR [rax] add rax, 16 dec rcx ; 76 : mEX = _mm_add_ps(mEX, m); addps xmm1, xmm0 ; 77 : mEXX = _mm_add_ps(mEXX, _mm_mul_ps(m,m)); mulps xmm0, xmm0 addps xmm2, xmm0 jne SHORT $LL3@main
It really seems like a mistake. Can someone confirm or deny this problem with another version of the compiler? (I currently do not have permission to update the compiler)