MSVC uses its own definition of _MM_TRANSPOSE4_PS:
/*
 * MSVC's definition of _MM_TRANSPOSE4_PS: transposes, in place, the 4x4
 * float matrix whose rows are held in four __m128 values.
 *
 * row0..row3 are both read and assigned to, so each argument must be an
 * expression that may appear on the left-hand side of `=`. Each argument
 * is also evaluated more than once, so it should have no side effects.
 *
 * The expansion is a bare { } block (not do { } while (0)), so
 * `_MM_TRANSPOSE4_PS(a, b, c, d);` yields the block followed by an empty
 * statement.
 */
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) {                 \
            __m128 tmp3, tmp2, tmp1, tmp0;                          \
                                                                    \
            tmp0   = _mm_shuffle_ps((row0), (row1), 0x44);          \
            tmp2   = _mm_shuffle_ps((row0), (row1), 0xEE);          \
            tmp1   = _mm_shuffle_ps((row2), (row3), 0x44);          \
            tmp3   = _mm_shuffle_ps((row2), (row3), 0xEE);          \
                                                                    \
            (row0) = _mm_shuffle_ps(tmp0, tmp1, 0x88);              \
            (row1) = _mm_shuffle_ps(tmp0, tmp1, 0xDD);              \
            (row2) = _mm_shuffle_ps(tmp2, tmp3, 0x88);              \
            (row3) = _mm_shuffle_ps(tmp2, tmp3, 0xDD);              \
        }
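When _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f) is passed as row3, the last line of the macro expands to _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f) = _mm_shuffle_ps(tmp2,tmp3, 0XDD); , which compiles just fine in MSVC but fails with an lvalue error in GCC. I am not sure why MSVC allows this; a plausible explanation is that MSVC declares __m128 as a union, and C++ permits assigning to a temporary of class type, whereas GCC's __m128 is a built-in vector type.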
I looked at the assembly that MSVC 2013 generates for this code:
#include <immintrin.h> #include <stdio.h> int main() { __m128 rows[4]; //rows[0] = _mm_setr_ps( 1, 2, 3, 4); //rows[1] = _mm_setr_ps( 5, 6, 7, 8); rows[2] = _mm_setr_ps( 9,10,11,12); rows[3] = _mm_setr_ps(13,14,15,16); //_MM_TRANSPOSE4_PS(rows[0],rows[1],rows[2],rows[3]); //_MM_TRANSPOSE4_PS(rows[0],rows[1],rows[2],_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f)); rows[2] = _mm_shuffle_ps(rows[2], rows[3], 0x88); _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f) = _mm_shuffle_ps(rows[2],rows[3], 0XDD); }
Here is the corresponding generated assembly:
; MSVC x64 listing for the two shuffle statements of the test program.
; "Line 14" / "Line 15" are the compiler's source-line markers.

; Line 14 -- rows[2] = _mm_shuffle_ps(rows[2], rows[3], 0x88)
	mov	eax, 16
	imul	rax, 3					; rax = 48: byte offset of rows[3]
	mov	ecx, 16
	imul	rcx, 2					; rcx = 32: byte offset of rows[2]
	movups	xmm0, XMMWORD PTR rows$[rsp+rcx]	; xmm0 = rows[2]
	shufps	xmm0, XMMWORD PTR rows$[rsp+rax], 136	; 00000088H
	movaps	XMMWORD PTR $T6[rsp], xmm0		; shuffle result -> temporary $T6
	mov	eax, 16
	imul	rax, 2					; rax = 32: byte offset of rows[2]
	movaps	xmm0, XMMWORD PTR $T6[rsp]
	movups	XMMWORD PTR rows$[rsp+rax], xmm0	; result is written back into rows[2]

; Line 15 -- _mm_setr_ps(0,0,0,1) = _mm_shuffle_ps(rows[2], rows[3], 0xDD)
	mov	eax, 16
	imul	rax, 3					; rax = 48: byte offset of rows[3]
	mov	ecx, 16
	imul	rcx, 2					; rcx = 32: byte offset of rows[2]
	movups	xmm0, XMMWORD PTR rows$[rsp+rcx]	; xmm0 = rows[2]
	shufps	xmm0, XMMWORD PTR rows$[rsp+rax], 221	; 000000ddH
	movaps	XMMWORD PTR $T8[rsp], xmm0		; shuffle result -> temporary $T8
	movaps	xmm0, XMMWORD PTR __xmm@3f800000000000000000000000000000	; constant: 1.0f in the top lane, 0 elsewhere
	movaps	XMMWORD PTR $T7[rsp], xmm0		; constant -> temporary $T7 (the assignment's left-hand side)
	movaps	xmm0, XMMWORD PTR $T8[rsp]
	movaps	XMMWORD PTR $T7[rsp], xmm0		; shuffle result overwrites $T7; unlike line 14, nothing is stored to rows