The title may seem nonsensical, but let me explain. I was studying a program the other day when I came across the following assembly code:
; Fragment from the studied program: stores the four floats held at [rbp-30h]
; to the 16 bytes starting at rdi+1320h, one scalar store per element.
movaps xmm3, xmmword ptr [rbp-30h]   ; xmm3 = all four packed floats
lea rdx, [rdi+1320h]                 ; rdx = destination address
movaps xmm5, xmm3                    ; register copies of the vector, one per shuffle below
movaps xmm6, xmm3
movaps xmm0, xmm3
movss dword ptr [rdx], xmm3          ; dest[0] = element 0 (already in the low lane)
shufps xmm5, xmm3, 55h               ; every lane of xmm5 = element 1
shufps xmm6, xmm3, 0AAh              ; every lane of xmm6 = element 2
shufps xmm0, xmm3, 0FFh              ; every lane of xmm0 = element 3
movaps xmm4, xmm3                    ; another copy, consumed by the mulss below
movss dword ptr [rdx+4], xmm5        ; dest[1] = element 1
movss dword ptr [rdx+8], xmm6        ; dest[2] = element 2
movss dword ptr [rdx+0Ch], xmm0      ; dest[3] = element 3
mulss xmm4, xmm3                     ; low lane: element 0 * element 0 -- arithmetic on the same values continues after the copy
and it looks like it is just copying four floats from [rbp-30h] to [rdx]. Those shufps
are only used to select one of the four floats in xmm3
(for example, shufps xmm5, xmm3, 55h
selects the second float and puts it in xmm5
).
This makes me wonder if the compiler did this because shufps
is actually faster than memory access (something like movss xmm0, dword ptr [rbp-30h]
, movss dword ptr [rdx], xmm0
).
So, I wrote several tests to compare these two approaches and found that shufps
is always slower than a few memory accesses. Now I think that the use of shufps
has nothing to do with performance. It might just be there to obfuscate the code, so that decompilers cannot easily produce clean code (I tried IDA Pro, and the result was really too complicated).
Although I will probably never use shufps
explicitly (via _mm_shuffle_ps
for example) in any practical program, since the compiler is most likely smarter than me, I still want to know why the compiler that built this program generated such code. It is neither faster nor smaller. It makes no sense.
Anyway, here are the tests I wrote.
// Micro-benchmark comparing two ways of permuting the four floats of an array
// into a Struct: plain scalar loads/stores (shuffle1) versus one packed load
// followed by shufps-based lane extraction (shuffle2).
#include <Windows.h>
#include <xmmintrin.h>  // __m128, _mm_shuffle_ps, _mm_store_ss

#include <iostream>

// Calls routine(arg) `iterations` times and returns the elapsed time as the
// difference of two GetTickCount() readings (milliseconds).
// noinline keeps the timing loop a real function in the optimized build.
__declspec(noinline) DWORD profile_routine(void (*routine)(void *), void *arg, int iterations = 1) {
    const DWORD startTime = GetTickCount();
    // `> 0` so that a negative count runs nothing instead of counting down
    // past INT_MIN.
    while (iterations-- > 0) {
        routine(arg);
    }
    const DWORD timeElapsed = GetTickCount() - startTime;
    return timeElapsed;
}

// Four floats returned by value from the shuffle functions.
struct Struct {
    float x, y, z, w;
};

// Scalar version: returns {arr[3], arr[2], arr[0], arr[1]}.
// arr must point to at least four floats.
__declspec(noinline) Struct shuffle1(float *arr) {
    float x = arr[3];
    float y = arr[2];
    float z = arr[0];
    float w = arr[1];
    return {x, y, z, w};
}

// shufps immediates that replicate element N of the source into every lane
// (two selector bits per lane, e.g. 0x55 = 01 01 01 01b selects element 1).
#define SS0 (0x00)
#define SS1 (0x55)
#define SS2 (0xAA)
#define SS3 (0xFF)

// SSE version: returns the same permutation as shuffle1, {arr[3], arr[2],
// arr[0], arr[1]}, but loads all four floats at once and extracts each
// element with a shuffle.
// arr must point to four floats and be 16-byte aligned: the packed load
// below is an aligned load.
__declspec(noinline) Struct shuffle2(float *arr) {
    Struct r;
    __m128 packed = *reinterpret_cast<__m128 *>(arr);
    __m128 x = _mm_shuffle_ps(packed, packed, SS3);
    __m128 y = _mm_shuffle_ps(packed, packed, SS2);
    __m128 z = _mm_shuffle_ps(packed, packed, SS0);
    __m128 w = _mm_shuffle_ps(packed, packed, SS1);
    // _mm_store_ss writes only the low lane, i.e. the selected element.
    _mm_store_ss(&r.x, x);
    _mm_store_ss(&r.y, y);
    _mm_store_ss(&r.z, z);
    _mm_store_ss(&r.w, w);
    return r;
}

// Benchmark body for shuffle1. The result is accumulated back into the array
// so every call's output feeds the next call's input.
void profile_shuffle_r1(void *arg) {
    float *arr = static_cast<float *>(arg);
    Struct q = shuffle1(arr);
    arr[0] += q.w;
    arr[1] += q.z;
    arr[2] += q.y;
    arr[3] += q.x;
}

// Benchmark body for shuffle2; identical to profile_shuffle_r1 apart from the
// shuffle function being called.
void profile_shuffle_r2(void *arg) {
    float *arr = static_cast<float *>(arg);
    Struct q = shuffle2(arr);
    arr[0] += q.w;
    arr[1] += q.z;
    arr[2] += q.y;
    arr[3] += q.x;
}

// Fills two identical input arrays, times both variants over the same number
// of iterations and prints the two GetTickCount() deltas.
int main(int argc, char **argv) {
    // Seeded from argc, so the initial values are only known at run time.
    int n = argc + 3;
    // 16-byte alignment is required by the aligned packed load in shuffle2.
    alignas(16) float arr1[4];
    alignas(16) float arr2[4];
    for (int i = 0; i < 4; i++) {
        arr1[i] = static_cast<float>(n + i);
        arr2[i] = static_cast<float>(n + i);
    }
    int iterations = 20000000;
    DWORD time1 = profile_routine(profile_shuffle_r1, arr1, iterations);
    std::cout << "time1 = " << time1 << std::endl;
    DWORD time2 = profile_routine(profile_shuffle_r2, arr2, iterations);
    std::cout << "time2 = " << time2 << std::endl;
    return 0;
}
In the above test, I have two shuffle functions, shuffle1
and shuffle2
, that do the same thing. When compiled with MSVC at -O2, they produce the following code:
; Optimized code for the first pair of functions.
; In both, rdx holds arr and rcx holds the address of the Struct being
; returned; that address is also handed back in rax.

; shuffle1: four 32-bit copies through eax, no SSE registers involved.
shuffle1:
mov eax,dword ptr [rdx+0Ch]          ; eax = arr[3]
mov dword ptr [rcx],eax              ; r.x = arr[3]
mov eax,dword ptr [rdx+8]            ; eax = arr[2]
mov dword ptr [rcx+4],eax            ; r.y = arr[2]
mov eax,dword ptr [rdx]              ; eax = arr[0]
mov dword ptr [rcx+8],eax            ; r.z = arr[0]
mov eax,dword ptr [rdx+4]            ; eax = arr[1]
mov dword ptr [rcx+0Ch],eax          ; r.w = arr[1]
mov rax,rcx                          ; return the Struct address
ret

; shuffle2: one aligned 16-byte load, then a shuffle per element that is not
; already in the low lane.
shuffle2:
movaps xmm2,xmmword ptr [rdx]        ; xmm2 = arr[0..3] (aligned load)
mov rax,rcx                          ; return the Struct address
movaps xmm0,xmm2                     ; copy, because shufps overwrites its destination
shufps xmm0,xmm2,0FFh                ; every lane = arr[3]
movss dword ptr [rcx],xmm0           ; r.x = arr[3]
movaps xmm0,xmm2
shufps xmm0,xmm2,0AAh                ; every lane = arr[2]
movss dword ptr [rcx+4],xmm0         ; r.y = arr[2]
movss dword ptr [rcx+8],xmm2         ; r.z = arr[0] (low lane, no shuffle needed)
shufps xmm2,xmm2,55h                 ; every lane = arr[1]; xmm2 is no longer needed intact
movss dword ptr [rcx+0Ch],xmm2       ; r.w = arr[1]
ret
shuffle1
is always 30% faster than shuffle2
on my machine. I noticed that shuffle2
has two more instructions, and that shuffle1
uses eax
instead of xmm0
, so I thought that the result might be different if I added some arithmetic operations.
Therefore, I changed them as follows:
// Scalar version with arithmetic: loads the four elements individually and
// returns four three-term sums, each leaving out one of the inputs.
// arr must point to at least four floats.
__declspec(noinline) Struct shuffle1(float *arr) {
    float x0 = arr[3];
    float y0 = arr[2];
    float z0 = arr[0];
    float w0 = arr[1];
    // Each sum is evaluated left to right, e.g. x = (x0 + y0) + z0.
    float x = x0 + y0 + z0;
    float y = y0 + z0 + w0;
    float z = z0 + w0 + x0;
    float w = w0 + x0 + y0;
    return {x, y, z, w};
}

// shufps immediates that replicate element N of the source into every lane
// (two selector bits per lane, e.g. 0x55 = 01 01 01 01b selects element 1).
#define SS0 (0x00)
#define SS1 (0x55)
#define SS2 (0xAA)
#define SS3 (0xFF)

// SSE version with arithmetic: one packed load, one shuffle per element, then
// scalar (low-lane) adds that share the partial sums yz and wx.
// NOTE(review): x and z are associated differently from shuffle1
// (x0 + (y0 + z0) here versus (x0 + y0) + z0 there), so the two functions can
// differ in floating-point rounding.
// arr is dereferenced as a __m128, so it must point to four floats with the
// alignment that type requires.
__declspec(noinline) Struct shuffle2(float *arr) {
    Struct r;
    __m128 packed = *reinterpret_cast<__m128 *>(arr);
    __m128 x0 = _mm_shuffle_ps(packed, packed, SS3);  // arr[3] in every lane
    __m128 y0 = _mm_shuffle_ps(packed, packed, SS2);  // arr[2] in every lane
    __m128 z0 = _mm_shuffle_ps(packed, packed, SS0);  // arr[0] in every lane
    __m128 w0 = _mm_shuffle_ps(packed, packed, SS1);  // arr[1] in every lane
    // _mm_add_ss adds only the low lanes; the upper lanes are never stored.
    __m128 yz = _mm_add_ss(y0, z0);
    __m128 x = _mm_add_ss(x0, yz);
    __m128 y = _mm_add_ss(w0, yz);
    __m128 wx = _mm_add_ss(w0, x0);
    __m128 z = _mm_add_ss(z0, wx);
    __m128 w = _mm_add_ss(y0, wx);
    // Write the low lane of each result into the corresponding field.
    _mm_store_ss(&r.x, x);
    _mm_store_ss(&r.y, y);
    _mm_store_ss(&r.z, z);
    _mm_store_ss(&r.w, w);
    return r;
}
and now the assembly looks a little more fair, since they have the same number of instructions, and both must use xmm registers.
; Optimized code for the versions with arithmetic.
; In both, rdx holds arr and rcx holds the address of the Struct being
; returned; that address is also handed back in rax.
; Names in the comments (x0, y0, z0, w0, yz, wx) are the C++ locals.

; shuffle1: four scalar loads, eight scalar adds, four scalar stores.
shuffle1:
movss xmm5,dword ptr [rdx+8]         ; xmm5 = arr[2] (y0)
mov rax,rcx                          ; return the Struct address
movss xmm3,dword ptr [rdx+0Ch]       ; xmm3 = arr[3] (x0)
movaps xmm0,xmm5                     ; xmm0 = y0
movss xmm2,dword ptr [rdx]           ; xmm2 = arr[0] (z0)
addss xmm0,xmm3                      ; xmm0 = y0 + x0
movss xmm4,dword ptr [rdx+4]         ; xmm4 = arr[1] (w0)
movaps xmm1,xmm2                     ; xmm1 = z0
addss xmm1,xmm5                      ; xmm1 = z0 + y0
addss xmm0,xmm2                      ; xmm0 = x0 + y0 + z0 = x
addss xmm1,xmm4                      ; xmm1 = y0 + z0 + w0 = y
movss dword ptr [rcx],xmm0           ; r.x = x
movaps xmm0,xmm4                     ; xmm0 = w0
addss xmm0,xmm2                      ; xmm0 = w0 + z0
addss xmm4,xmm3                      ; xmm4 = w0 + x0
movss dword ptr [rcx+4],xmm1         ; r.y = y
addss xmm0,xmm3                      ; xmm0 = z0 + w0 + x0 = z
addss xmm4,xmm5                      ; xmm4 = w0 + x0 + y0 = w
movss dword ptr [rcx+8],xmm0         ; r.z = z
movss dword ptr [rcx+0Ch],xmm4       ; r.w = w
ret

; shuffle2: one aligned packed load, three shuffles, six scalar adds (the
; partial sums yz and wx are shared), four scalar stores.
shuffle2:
movaps xmm4,xmmword ptr [rdx]        ; xmm4 = arr[0..3] (aligned load)
mov rax,rcx                          ; return the Struct address
movaps xmm3,xmm4                     ; xmm3 low lane = arr[0] (z0), no shuffle needed
movaps xmm5,xmm4
shufps xmm5,xmm4,0AAh                ; xmm5 = arr[2] (y0) in every lane
movaps xmm2,xmm4
shufps xmm2,xmm4,0FFh                ; xmm2 = arr[3] (x0) in every lane
movaps xmm0,xmm5                     ; xmm0 = y0
addss xmm0,xmm3                      ; xmm0 = y0 + z0 = yz
shufps xmm4,xmm4,55h                 ; xmm4 = arr[1] (w0) in every lane
movaps xmm1,xmm4                     ; xmm1 = w0
addss xmm1,xmm2                      ; xmm1 = w0 + x0 = wx
addss xmm2,xmm0                      ; xmm2 = x0 + yz = x
addss xmm4,xmm0                      ; xmm4 = w0 + yz = y
addss xmm3,xmm1                      ; xmm3 = z0 + wx = z
addss xmm5,xmm1                      ; xmm5 = y0 + wx = w
movss dword ptr [rcx],xmm2           ; r.x = x
movss dword ptr [rcx+4],xmm4         ; r.y = y
movss dword ptr [rcx+8],xmm3         ; r.z = z
movss dword ptr [rcx+0Ch],xmm5       ; r.w = w
ret
But it makes no difference: shuffle1
is still 30% faster!