Is shufps slower than memory access?

The title may seem nonsensical, but let me explain. I was studying a program the other day when I came across the following assembly code:

    movaps xmm3, xmmword ptr [rbp-30h]
    lea    rdx, [rdi+1320h]
    movaps xmm5, xmm3
    movaps xmm6, xmm3
    movaps xmm0, xmm3
    movss  dword ptr [rdx], xmm3
    shufps xmm5, xmm3, 55h
    shufps xmm6, xmm3, 0AAh
    shufps xmm0, xmm3, 0FFh
    movaps xmm4, xmm3
    movss  dword ptr [rdx+4], xmm5
    movss  dword ptr [rdx+8], xmm6
    movss  dword ptr [rdx+0Ch], xmm0
    mulss  xmm4, xmm3

and it looks like it is just copying four floats from [rbp-30h] to [rdx]. The shufps instructions are only used to select one of the four floats in xmm3 (for example, shufps xmm5, xmm3, 55h selects the second float and places it in xmm5).
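
(To make the imm8 encoding concrete, here is a minimal sketch using the intrinsic: each 2-bit field of the immediate picks one source lane, so 55h replicates element 1 into every position.)

    #include <xmmintrin.h>
    #include <cstdio>

    // Sketch: how the shufps imm8 works. 0x55 = 0b01'01'01'01, so every
    // 2-bit field selects lane 1 of the source.
    int main() {
        alignas(16) float src[4] = {1.0f, 2.0f, 3.0f, 4.0f};
        __m128 v = _mm_load_ps(src);
        __m128 e1 = _mm_shuffle_ps(v, v, 0x55); // same imm as shufps xmm5, xmm3, 55h
        float out;
        _mm_store_ss(&out, e1);
        std::printf("%f\n", out); // prints 2.000000 (the second float)
    }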

This makes me wonder whether the compiler did this because shufps is actually faster than memory access (something like movss xmm0, dword ptr [rbp-30h] followed by movss dword ptr [rdx], xmm0).
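
Written out with intrinsics, the alternative I have in mind would be a plain element-by-element copy, roughly this sketch (the function name is mine):

    #include <xmmintrin.h>

    // Sketch of the shuffle-free alternative: four scalar movss loads and
    // four scalar movss stores, one per element.
    void copy4(const float *src, float *dst) {
        _mm_store_ss(&dst[0], _mm_load_ss(&src[0]));
        _mm_store_ss(&dst[1], _mm_load_ss(&src[1]));
        _mm_store_ss(&dst[2], _mm_load_ss(&src[2]));
        _mm_store_ss(&dst[3], _mm_load_ss(&src[3]));
    }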

So I wrote several tests to compare the two approaches and found the shufps version always slower than a few memory accesses. Now I suspect that the use of shufps has nothing to do with performance. It might just be there to obfuscate the code, so that decompilers cannot easily produce clean output (I tried with IDA Pro, and the decompiled result really was overly complicated).

Although I will probably never use shufps explicitly (via _mm_shuffle_ps, for example) in any practical program, since the compiler is most likely smarter than me, I still want to know why the compiler that built this program generates such code. It is neither faster nor smaller. It makes no sense.

In any case, here are the tests I wrote.

    #include <Windows.h>
    #include <xmmintrin.h> // for __m128, _mm_shuffle_ps, _mm_store_ss
    #include <iostream>
    using namespace std;

    __declspec(noinline) DWORD profile_routine(void (*routine)(void *), void *arg, int iterations = 1)
    {
        DWORD startTime = GetTickCount();
        while (iterations--) {
            routine(arg);
        }
        DWORD timeElapsed = GetTickCount() - startTime;
        return timeElapsed;
    }

    struct Struct {
        float x, y, z, w;
    };

    __declspec(noinline) Struct shuffle1(float *arr)
    {
        float x = arr[3];
        float y = arr[2];
        float z = arr[0];
        float w = arr[1];
        return {x, y, z, w};
    }

    #define SS0 (0x00)
    #define SS1 (0x55)
    #define SS2 (0xAA)
    #define SS3 (0xFF)

    __declspec(noinline) Struct shuffle2(float *arr)
    {
        Struct r;
        __m128 packed = *reinterpret_cast<__m128 *>(arr);
        __m128 x = _mm_shuffle_ps(packed, packed, SS3);
        __m128 y = _mm_shuffle_ps(packed, packed, SS2);
        __m128 z = _mm_shuffle_ps(packed, packed, SS0);
        __m128 w = _mm_shuffle_ps(packed, packed, SS1);
        _mm_store_ss(&r.x, x);
        _mm_store_ss(&r.y, y);
        _mm_store_ss(&r.z, z);
        _mm_store_ss(&r.w, w);
        return r;
    }

    void profile_shuffle_r1(void *arg)
    {
        float *arr = static_cast<float *>(arg);
        Struct q = shuffle1(arr);
        arr[0] += q.w;
        arr[1] += q.z;
        arr[2] += q.y;
        arr[3] += q.x;
    }

    void profile_shuffle_r2(void *arg)
    {
        float *arr = static_cast<float *>(arg);
        Struct q = shuffle2(arr);
        arr[0] += q.w;
        arr[1] += q.z;
        arr[2] += q.y;
        arr[3] += q.x;
    }

    int main(int argc, char **argv)
    {
        int n = argc + 3;
        alignas(16) float arr1[4], arr2[4]; // 16-byte alignment so the movaps load in shuffle2 is safe
        for (int i = 0; i < 4; i++) {
            arr1[i] = static_cast<float>(n + i);
            arr2[i] = static_cast<float>(n + i);
        }

        int iterations = 20000000;
        DWORD time1 = profile_routine(profile_shuffle_r1, arr1, iterations);
        cout << "time1 = " << time1 << endl;

        DWORD time2 = profile_routine(profile_shuffle_r2, arr2, iterations);
        cout << "time2 = " << time2 << endl;
        return 0;
    }

In the test above, the two shuffle functions shuffle1 and shuffle2 do the same thing. When compiled with MSVC /O2, the following code is produced:

    shuffle1:
        mov eax,dword ptr [rdx+0Ch]
        mov dword ptr [rcx],eax
        mov eax,dword ptr [rdx+8]
        mov dword ptr [rcx+4],eax
        mov eax,dword ptr [rdx]
        mov dword ptr [rcx+8],eax
        mov eax,dword ptr [rdx+4]
        mov dword ptr [rcx+0Ch],eax
        mov rax,rcx
        ret

    shuffle2:
        movaps xmm2,xmmword ptr [rdx]
        mov rax,rcx
        movaps xmm0,xmm2
        shufps xmm0,xmm2,0FFh
        movss dword ptr [rcx],xmm0
        movaps xmm0,xmm2
        shufps xmm0,xmm2,0AAh
        movss dword ptr [rcx+4],xmm0
        movss dword ptr [rcx+8],xmm2
        shufps xmm2,xmm2,55h
        movss dword ptr [rcx+0Ch],xmm2
        ret

shuffle1 is consistently 30% faster than shuffle2 on my machine. I noticed that shuffle2 has two more instructions and that shuffle1 uses eax instead of xmm0, so I figured that if I added some arithmetic operations, the result might be different.

Therefore, I changed them as follows:

    __declspec(noinline) Struct shuffle1(float *arr)
    {
        float x0 = arr[3];
        float y0 = arr[2];
        float z0 = arr[0];
        float w0 = arr[1];
        float x = x0 + y0 + z0;
        float y = y0 + z0 + w0;
        float z = z0 + w0 + x0;
        float w = w0 + x0 + y0;
        return {x, y, z, w};
    }

    #define SS0 (0x00)
    #define SS1 (0x55)
    #define SS2 (0xAA)
    #define SS3 (0xFF)

    __declspec(noinline) Struct shuffle2(float *arr)
    {
        Struct r;
        __m128 packed = *reinterpret_cast<__m128 *>(arr);
        __m128 x0 = _mm_shuffle_ps(packed, packed, SS3);
        __m128 y0 = _mm_shuffle_ps(packed, packed, SS2);
        __m128 z0 = _mm_shuffle_ps(packed, packed, SS0);
        __m128 w0 = _mm_shuffle_ps(packed, packed, SS1);
        __m128 yz = _mm_add_ss(y0, z0);
        __m128 x = _mm_add_ss(x0, yz);
        __m128 y = _mm_add_ss(w0, yz);
        __m128 wx = _mm_add_ss(w0, x0);
        __m128 z = _mm_add_ss(z0, wx);
        __m128 w = _mm_add_ss(y0, wx);
        _mm_store_ss(&r.x, x);
        _mm_store_ss(&r.y, y);
        _mm_store_ss(&r.z, z);
        _mm_store_ss(&r.w, w);
        return r;
    }

and now the comparison looks a little fairer, since both functions have the same number of instructions and both have to use xmm registers.

    shuffle1:
        movss xmm5,dword ptr [rdx+8]
        mov rax,rcx
        movss xmm3,dword ptr [rdx+0Ch]
        movaps xmm0,xmm5
        movss xmm2,dword ptr [rdx]
        addss xmm0,xmm3
        movss xmm4,dword ptr [rdx+4]
        movaps xmm1,xmm2
        addss xmm1,xmm5
        addss xmm0,xmm2
        addss xmm1,xmm4
        movss dword ptr [rcx],xmm0
        movaps xmm0,xmm4
        addss xmm0,xmm2
        addss xmm4,xmm3
        movss dword ptr [rcx+4],xmm1
        addss xmm0,xmm3
        addss xmm4,xmm5
        movss dword ptr [rcx+8],xmm0
        movss dword ptr [rcx+0Ch],xmm4
        ret

    shuffle2:
        movaps xmm4,xmmword ptr [rdx]
        mov rax,rcx
        movaps xmm3,xmm4
        movaps xmm5,xmm4
        shufps xmm5,xmm4,0AAh
        movaps xmm2,xmm4
        shufps xmm2,xmm4,0FFh
        movaps xmm0,xmm5
        addss xmm0,xmm3
        shufps xmm4,xmm4,55h
        movaps xmm1,xmm4
        addss xmm1,xmm2
        addss xmm2,xmm0
        addss xmm4,xmm0
        addss xmm3,xmm1
        addss xmm5,xmm1
        movss dword ptr [rcx],xmm2
        movss dword ptr [rcx+4],xmm4
        movss dword ptr [rcx+8],xmm3
        movss dword ptr [rcx+0Ch],xmm5
        ret

But it does not matter: shuffle1 is still 30% faster!

2 answers

Without broader context it is hard to say for sure, but... when optimizing for newer processors, you have to consider the usage of the different execution ports. See Agner Fog's instruction tables: http://www.agner.org/optimize/instruction_tables.pdf

In this case, unlikely as it may seem, a few possibilities jump out at me if we assume that this assembly really is optimized.

  • This might appear in a stretch of code where the out-of-order scheduler happens to have more of port 5 available (on Haswell, for example) than of ports 2 and 3 (again, using Haswell as an example).
  • Similar to #1, but the same effect can occur with hyperthreading. This code may be designed to avoid stealing load ports from the sibling hyperthread.
  • Finally, this kind of optimization is specifically where I have used something similar myself. Say you have a branch that is close to 100% predictable at run time, but not at compile time. Suppose that right after the branch there is a load that is often a cache miss. You want that load to start as early as possible. The out-of-order scheduler will speculate past the branch and begin executing the load early if the load ports are free. This could make the shufps instructions essentially "free" to execute. Here is an example:

    MOV ecx, [some computed, mostly constant at run-time global]
    loop:
        ADD rdi, 16
        ADD rbp, 16
        CALL shuffle
        SUB ecx, 1
        JNE loop
    MOV rax, [rdi]   ; do a read that could be "predicted" properly
    MOV rbx, [rax]

Honestly though, this looks like poorly written assembly or poorly generated machine code, so I would not put much thought into it. The example I gave is a fairly unlikely one.


You do not show whether the later code actually uses the result of broadcasting each element to all four positions of a vector (e.g. 0x55 is _MM_SHUFFLE(1,1,1,1)). If you already need that for a ...ps instruction later, then you need those shuffles anyway, so there would be no reason to also do scalar loads.
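
For example, that is exactly the pattern in a 4x4 matrix * vector product, where each broadcast feeds a full-width mulps. A sketch (hypothetical function, column-major layout assumed):

    #include <xmmintrin.h>

    // Sketch: column-major 4x4 matrix * vector. Each shufps broadcast
    // (imm 0x00/0x55/0xAA/0xFF) feeds a full-width mulps, so the shuffles
    // are genuinely needed here, unlike in a plain copy.
    __m128 mat4_mul_vec4(const __m128 cols[4], __m128 v) {
        __m128 x = _mm_shuffle_ps(v, v, 0x00); // broadcast element 0
        __m128 y = _mm_shuffle_ps(v, v, 0x55); // broadcast element 1
        __m128 z = _mm_shuffle_ps(v, v, 0xAA); // broadcast element 2
        __m128 w = _mm_shuffle_ps(v, v, 0xFF); // broadcast element 3
        __m128 r = _mm_mul_ps(x, cols[0]);
        r = _mm_add_ps(r, _mm_mul_ps(y, cols[1]));
        r = _mm_add_ps(r, _mm_mul_ps(z, cols[2]));
        r = _mm_add_ps(r, _mm_mul_ps(w, cols[3]));
        return r;
    }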

If you do not, and the only visible side effect is the stores to memory, then this is simply a missed optimization, either by a human programmer using intrinsics and/or by a compiler, as in your MSVC output for your test functions.

Keep in mind that some compilers (such as ICC and MSVC) do not really optimize intrinsics, so if you write 3x _mm_shuffle_ps you get 3x shufps; in that case the bad decision was made by the person using the intrinsics, not by the compiler.


Clang, on the other hand, does aggressively optimize shuffle intrinsics: clang compiles both of your shuffle functions to one movaps load, one shufps (or pshufd), and one movups store. This is optimal for most CPUs, getting the work done in the fewest instructions and uops.

(gcc auto-vectorizes shuffle1 but not shuffle2. MSVC fails at everything, using scalar code even for shuffle1.)

(If you also need each scalar float at the bottom of an xmm register for ...ss instructions, you can use the shuffle that creates your store vector as one of them, because it has a different low element than the input. You would movaps-copy first, though, or use pshufd, to avoid destroying the register holding the original low element.)

If you were tuning specifically for CPUs where unaligned movups stores are slow (e.g. Intel pre-Nehalem) and the result is not known to be aligned, you would still use one shufps but store the result with movlps and movhps. This is what gcc does if you compile with -mtune=core2.
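
In intrinsics that split store looks roughly like this sketch (gcc/clang; _mm_storel_pi and _mm_storeh_pi compile to movlps/movhps):

    #include <xmmintrin.h>

    // Sketch: store a 16-byte vector as two 8-byte halves, the way
    // gcc -mtune=core2 does when unaligned movups stores would be slow.
    void store_split(float *dst, __m128 v) {
        _mm_storel_pi(reinterpret_cast<__m64 *>(dst), v);     // movlps: low 8 bytes
        _mm_storeh_pi(reinterpret_cast<__m64 *>(dst + 2), v); // movhps: high 8 bytes
    }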

You apparently know that your input vector is aligned, so it still makes sense to load it with movaps. K8 will split movaps into two 8-byte load uops, but most other x86-64 CPUs can do aligned 16-byte loads as a single uop. (Pentium M / Core 1 were the last mainstream Intel CPUs to split 128-bit vector operations that way, and they did not support 64-bit mode.)

vbroadcastss requires AVX, so without AVX, if you want a dword from memory broadcast into an XMM register, you have to use a shuffle instruction, which needs a port-5 ALU uop. (vbroadcastss xmm0, [rsi+4] decodes to a pure load uop on Intel CPUs, with no ALU uop needed, so it has a throughput of 2 per clock instead of 1.)
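
A sketch of both cases side by side (the function name is hypothetical):

    #include <immintrin.h>

    // Sketch: broadcasting one float from memory. With AVX, _mm_broadcast_ss
    // compiles to vbroadcastss (a pure load uop on Intel); without AVX you
    // need a scalar load plus a port-5 shuffle uop.
    __m128 broadcast_from_mem(const float *p) {
    #ifdef __AVX__
        return _mm_broadcast_ss(p);        // vbroadcastss xmm0, [p]
    #else
        __m128 v = _mm_load_ss(p);         // movss
        return _mm_shuffle_ps(v, v, 0x00); // shufps: port-5 ALU uop
    #endif
    }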

Only on older CPUs such as Merom and K8, whose shuffle units are only 64 bits wide, is shufps really slow, because it is a full 128-bit shuffle with granularity finer than 64 bits. On those you might consider 2x movsd or movq loads to feed pshuflw, which is fast because it only shuffles the low 64 bits. But only if you are tuning specifically for old CPUs. A sketch of that idea follows.
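
    #include <emmintrin.h>

    // Sketch (hypothetical name, only relevant when tuning for Merom/K8):
    // swap the two floats in the low 64 bits via pshuflw, which is fast on
    // those CPUs because it only touches 64 bits.
    __m128 swap_low_pair(const float *p) {
        __m128i lo = _mm_loadl_epi64(reinterpret_cast<const __m128i *>(p)); // movq load of 2 floats
        // imm 0x4E = words (2,3,0,1): swaps the two 32-bit floats in the low half
        return _mm_castsi128_ps(_mm_shufflelo_epi16(lo, 0x4E));
    }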


    // for gcc, I used __attribute__((ms_abi)) to target the Windows x64 calling convention
    Struct shuffle3(float *arr)
    {
        Struct r;
        __m128 packed = _mm_load_ps(arr);
        __m128 xyzw = _mm_shuffle_ps(packed, packed, _MM_SHUFFLE(1, 0, 2, 3));
        _mm_storeu_ps(&r.x, xyzw);
        return r;
    }

shuffle1 and shuffle3 compile to identical code with gcc and clang (on the Godbolt compiler explorer), because they auto-vectorize the scalar assignments. The only difference is that shuffle1 gets a movups load, because nothing guarantees 16-byte alignment there. (If we promised the compiler an aligned pointer for the pure-C scalar version, the output would be exactly identical.)

    # MSVC compiles shuffle3 like this as well
    # from gcc9.1 -O3 (default baseline x86-64, tune=generic)
    shuffle3(float*):
        movaps  xmm0, XMMWORD PTR [rdx]   # MSVC still uses movups even for _mm_load_ps
        mov     rax, rcx                  # return the retval pointer
        shufps  xmm0, xmm0, 75
        movups  XMMWORD PTR [rcx], xmm0   # store to the hidden retval pointer
        ret
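
For reference, with gcc/clang that alignment promise can be written with __builtin_assume_aligned; a sketch (hypothetical function name):

    // Sketch: promising 16-byte alignment for the pure-C scalar version,
    // so the auto-vectorizer can use an aligned load instead of movups.
    Struct shuffle1_aligned(float *arr)
    {
        float *a = static_cast<float *>(__builtin_assume_aligned(arr, 16));
        return {a[3], a[2], a[0], a[1]};
    }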

With -mtune=core2, gcc still auto-vectorizes shuffle1. It uses split unaligned loads because we did not promise the compiler that the memory is aligned.

For shuffle3 it uses movaps but still splits _mm_storeu_ps into movlps + movhps. (This is one of the interesting effects tuning options can have: they do not let the compiler use new instructions, they just change its selection among existing ones.)

    # gcc9.1 -O3 -mtune=core2   # auto-vectorizing shuffle1
    shuffle1(float*):
        movq    xmm0, QWORD PTR [rdx]
        mov     rax, rcx
        movhps  xmm0, QWORD PTR [rdx+8]
        shufps  xmm0, xmm0, 75
        movlps  QWORD PTR [rcx], xmm0     # store in 2 halves
        movhps  QWORD PTR [rcx+8], xmm0
        ret

MSVC has no tuning options and does not auto-vectorize shuffle1 at all.


Source: https://habr.com/ru/post/1014897/

