Reading and writing long memory runs

I have a reorder.cc source file that looks like this:

void reorder(float *output, float *input) {
  output[56] = input[0];
  output[57] = input[1];
  output[58] = input[2];
  output[59] = input[3];
  output[60] = input[4];
  ...
  output[75] = input[19];
  output[76] = input[20];
  output[77] = input[21];
  output[78] = input[22];
  output[79] = input[23];
  output[80] = input[24];
  ...
  output[98] = 0;
  output[99] = 0;
  output[100] = 0;
  output[101] = 0;
  output[102] = 0;
  output[103] = 0;
  output[104] = 0;
  output[105] = input[1];
  output[106] = input[2];
  output[107] = input[3];
  output[108] = input[4];
  output[109] = input[5];
  output[110] = input[6];
  output[111] = 0; 
  ...
}

The reorder function is a very long list of operations that move memory from the input buffer to the output buffer. The correspondence between input and output indices is complex, but there are usually fairly long runs — at least 10 floats — that are guaranteed to be contiguous. Each run is interrupted by the start of a new run, which either begins at an arbitrary input index or consists of the constant value 0.

The associated assembly (.S) file, generated with `g++-6 -march=native -Ofast -S reorder.cc`, looks like this:

 .file "reorder.cc"
  .text
  .p2align 4,,15
  .globl  _Z9optimizedPfS_
  .type _Z9optimizedPfS_, @function
_Z9optimizedPfS_:
.LFB0:
  .cfi_startproc
  movss (%rsi), %xmm0
  movss %xmm0, 32(%rdi)
  movss 4(%rsi), %xmm0
  movss %xmm0, 36(%rdi)
  movss 8(%rsi), %xmm0
  movss %xmm0, 40(%rdi)
  movss 12(%rsi), %xmm0
  movss %xmm0, 44(%rdi)
  movss 16(%rsi), %xmm0
  movss %xmm0, 48(%rdi)
  movss 20(%rsi), %xmm0
  movss %xmm0, 52(%rdi)
  movss 28(%rsi), %xmm0
  movss %xmm0, 60(%rdi)
  movss 32(%rsi), %xmm0
  movss %xmm0, 64(%rdi)
  movss 36(%rsi), %xmm0
  ...

As you can see, the compiler copies the floats (fp32) one at a time. Is there a way to make it use wider moves for the contiguous runs, e.g. MOVDQU (Move Unaligned Double Quadword, 128-bit)?

I could write the movdqu code by hand, but I would much rather the compiler did it for me.

Does it make sense to do this at all? Would it actually be faster?

The real reorder.cc contains on the order of 100,000 such lines, so any per-operation saving adds up.

A related question: is there any way to speed up compilation of a 100K+ line function? Compiling it with g++-6 -Ofast takes a very long time on a MacBook Pro i7.

+4
2

Comment: are you sure the output[] stores can never overlap ("alias") the input[] array? If the compiler can't prove they don't, it must assume a store might change data a later load reads, which blocks merging. OTOH, if they genuinely never overlap, you can tell the compiler so.


Have you tried __restrict__ on both pointer parameters? That is the standard way to promise the compiler the buffers don't alias.

restrict is a C99 keyword and is not part of ISO C++, but C++ compilers accept __restrict__ or __restrict as an extension. A `#define __restrict__ __restrict` in the preprocessor keeps it working with MSVC as well, so portability isn't really a problem.


Even with restrict, gcc still doesn't merge the loads/stores — clang does.

For gcc this looks like a long-standing missed optimization — failing to coalesce adjacent loads/stores (there appear to be bugzilla entries about it going back to around gcc 4.0). Filing a report (or adding to an existing one) is probably the only way it eventually gets fixed.

With __restrict__, clang merges the loads/stores into xmm and ymm vectors. It even uses vpextrd! Nice. Source + asm on Godbolt, clang++ 3.8 with -O3 -march=haswell.

g++ 6.1 still doesn't merge them (that's the newest gcc on Godbolt). It only emits SIMD if you spell the copy as memcpy, even with -march=haswell. :/


Answer: the first thing to worry about is that with ~100K operations, reorder() compiles to an enormous amount of straight-line code. Every scalar load/store is a uop and several bytes of code that have to stream through the front end from L1I — and a function this big will be fetching from L2 or beyond. If the runs really are at least 8 floats long, merging them into vector moves is a big win for code density alone: one instruction (opcode + ModRM + disp32) instead of eight load/store pairs. Note also that for the zero stores, gcc could use a plain 32-bit mov-immediate store (a single instruction per element) instead of going through an xmm register with movss.

Either way, a function of this size will not come close to fitting in the 32 kiB L1 instruction cache (whether it's built from mov or movss makes little difference to that). Use performance counters to check whether you really are bottlenecked on I-cache misses before optimizing further. See the x86 tag wiki and Agner Fog's optimization guides for details.


Second, since part of the output is just zeros, consider obtaining the buffer from calloc: freshly calloc'ed memory is backed by already-zeroed pages (lazily, via a shared copy-on-write zero page), so you might be able to skip storing the zeros entirely — at the cost of soft page faults / TLB pressure on first touch. Whether that wins depends on how the buffer is allocated and reused, so measure.


Third, emit calls to memcpy and memset instead of element-by-element assignments. That is, have whatever generates reorder.cc (a perl script, presumably) produce one memcpy per contiguous run and one memset per run of zeros.

For long runs (roughly 128 bytes and up), inlined `rep movsd` (or `rep movsq`) asm is quite efficient on recent Intel CPUs. gcc inlines constant-size memcpy either as SIMD moves or as rep movs (or calls library memcpy) depending on the size, and you can steer that choice (SIMD vs. rep movs) with -mstringop-strategy.

Since all the copy sizes here are compile-time constants, gcc should already be making reasonable choices; keep in mind that rep movs has significant startup overhead, so it only pays off for the longer runs. (On Intel, IvyBridge and later have the "fast rep movsb" / ERMSB feature, much improved over the P6-era implementation.)

If neither clang nor gcc produces the asm you want, you can always write the long runs in inline asm yourself: set up rsi, rdi and ecx and issue `rep movsd`.


It's hard to predict in advance which of these approaches wins — it depends on the surrounding code and on whether the data is hot in cache, so benchmark on the real workload. OTOH, the memcpy/memset version is the easiest to generate, so it's the natural thing to try first.

With intrinsics you could also try NT (non-temporal) stores for the full-cache-line parts of the output (64B-aligned, whole 64B runs): they bypass the cache and avoid reading the destination lines in first. If the output buffer is large and isn't read again soon, that can be a win — but do you actually benefit from NT here?

NT loads, on the other hand, only help on WC memory, and IDK whether NT prefetch would be useful here — NT stores are the relevant tool. They hurt badly if the destination is read again while still hot (see the comments).


Finally, consider stepping away from the giant generated function entirely: encode the mapping as data — a compact table of (destination, source, length) runs — and interpret it with a small loop that calls memcpy/memset. That shrinks the code to almost nothing, solves the compile-time problem, and, given the I-cache argument above, may well be faster too.

+5

Answer: as noted in the comments, the compiler is perfectly capable of emitting movups / movdqu for the contiguous runs — but only when it knows the two buffers cannot overlap.

That is exactly what __restrict__ tells it.

For example, even gcc vectorizes this version:

// #define __restrict__ 
void reorder(float * __restrict__ output, float * __restrict__ input) {
  // Run 1: output[56..60] <- input[0..4]
  for (int k = 0; k < 5; ++k)
    output[56 + k] = input[k];

  // Run 2: output[75..80] <- input[19..24]
  for (int k = 0; k < 6; ++k)
    output[75 + k] = input[19 + k];

  // Run 3: zero-fill output[98..104]
  for (int k = 0; k < 7; ++k)
    output[98 + k] = 0;

  // Run 4: output[105..110] <- input[1..6]
  for (int k = 0; k < 6; ++k)
    output[105 + k] = input[1 + k];

  // Trailing single zero element.
  output[111] = 0;
}

Compiled with -O2 -ftree-vectorize, it produces:

reorder(float*, float*):
        movups  xmm0, XMMWORD PTR [rsi]
        movups  XMMWORD PTR [rdi+224], xmm0
        movss   xmm0, DWORD PTR [rsi+16]
        movss   DWORD PTR [rdi+240], xmm0
        movups  xmm0, XMMWORD PTR [rsi+76]
        movups  XMMWORD PTR [rdi+300], xmm0
        movss   xmm0, DWORD PTR [rsi+92]
        movss   DWORD PTR [rdi+316], xmm0
        movss   xmm0, DWORD PTR [rsi+96]
        movss   DWORD PTR [rdi+320], xmm0
        pxor    xmm0, xmm0
        movups  xmm1, XMMWORD PTR [rsi+4]
        movups  XMMWORD PTR [rdi+392], xmm0
        pxor    xmm0, xmm0
        movups  XMMWORD PTR [rdi+420], xmm1
        movss   DWORD PTR [rdi+408], xmm0
        movss   DWORD PTR [rdi+412], xmm0
        movss   xmm1, DWORD PTR [rsi+20]
        movss   DWORD PTR [rdi+436], xmm1
        movss   xmm1, DWORD PTR [rsi+24]
        movss   DWORD PTR [rdi+416], xmm0
        movss   DWORD PTR [rdi+440], xmm1
        movss   DWORD PTR [rdi+444], xmm0
        ret

That is, noticeably fewer instructions than the one-movss-per-element version.

https://godbolt.org/g/9aSmB1

+6

Source: https://habr.com/ru/post/1652401/


All Articles