I have a simple function that computes the element-wise product of two arrays of doubles:
#include <stdlib.h>
#include <emmintrin.h>
/* Bundle of three array pointers used by f(): x and y are read, z is written. */
struct S {
    double *x, *y, *z;
};
/*
 * Element-wise product of two double arrays using SSE2: z[i] = x[i] * y[i].
 *
 * s - holds the input pointers x and y and the output pointer z; the members
 *     are accessed through the reference inside the loop.
 * n - number of elements to process.
 *
 * Two doubles are handled per iteration with aligned loads/stores, so every
 * array must be 16-byte aligned and n is expected to be even (an odd n would
 * touch one element past index n - 1).
 */
void f(S& s, size_t n) {
    /* The index is a size_t so that it matches the type of n: this avoids the
       signed/unsigned comparison and the overflow an int counter would hit
       once n exceeds INT_MAX. */
    for (size_t i = 0; i < n; i += 2) {
        __m128d xs = _mm_load_pd(&s.x[i]);
        __m128d ys = _mm_load_pd(&s.y[i]);
        _mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
    }
}
/*
 * Driver: allocates three 16-byte-aligned arrays of `size` doubles, fills the
 * two inputs, runs f() once and releases the memory.
 *
 * Returns 0 on success, 1 if an allocation fails.
 */
int main(void) {
    const size_t size = 4; /* even: f() consumes two doubles per iteration */
    void *buf[3] = {NULL, NULL, NULL};

    /* Allocate through void* temporaries instead of casting &s.x to void**,
       and check every result: posix_memalign reports failure via its return
       value, not via errno. */
    for (size_t k = 0; k < 3; ++k) {
        if (posix_memalign(&buf[k], 16, sizeof(double) * size) != 0) {
            while (k > 0) {
                free(buf[--k]);
            }
            return 1;
        }
    }

    S s;
    s.x = (double *)buf[0];
    s.y = (double *)buf[1];
    s.z = (double *)buf[2];

    /* Give the inputs defined values so f() does not read uninitialised memory. */
    for (size_t i = 0; i < size; ++i) {
        s.x[i] = (double)i;
        s.y[i] = 2.0;
    }

    f(s, size);

    free(s.x);
    free(s.y);
    free(s.z);
    return 0;
}
Note that the first argument to f is passed by reference. Let's look at the resulting assembly for f() (I removed some irrelevant parts, inserted comments, and added some labels):
$ g++ -O3 -S asmtest.cpp
.globl _Z1fR1Sm
_Z1fR1Sm:                       # f(S&, unsigned long): %rdi = &s, %rsi = n
xorl %eax, %eax                 # i = 0
testq %rsi, %rsi
je .L1                          # n == 0: skip the loop
.L5:
movq (%rdi), %r8                # (1) reload s.x (offset 0) on every iteration
movq 8(%rdi), %rcx              # (2) reload s.y (offset 8) on every iteration
movq 16(%rdi), %rdx             # (3) reload s.z (offset 16) on every iteration
movapd (%r8,%rax,8), %xmm0      # xmm0 = x[i], x[i+1]
mulpd (%rcx,%rax,8), %xmm0      # xmm0 *= y[i], y[i+1]
movaps %xmm0, (%rdx,%rax,8)     # z[i], z[i+1] = xmm0
addq $2, %rax                   # i += 2
cmpq %rax, %rsi
ja .L5                          # loop while n > i (unsigned)
Note that the addresses of the arrays x, y, z are loaded into general-purpose registers on every iteration: see the three movq instructions at the top of .L5, referred to as statements (1), (2), (3). Why doesn't gcc move these instructions outside the loop?
Now create a local copy (not a deep copy) of the structure:
/*
 * Element-wise product of two double arrays using SSE2: z[i] = x[i] * y[i].
 * Variant that first takes a shallow, local copy of the pointer bundle and
 * uses only that copy inside the loop.
 *
 * args - holds the input pointers x and y and the output pointer z.
 * n    - number of elements to process.
 *
 * Two doubles are handled per iteration with aligned loads/stores, so every
 * array must be 16-byte aligned and n is expected to be even (an odd n would
 * touch one element past index n - 1).
 */
void __attribute__((noinline)) f(S& args, size_t n) {
    S s = args; /* copies the three pointers only, not the arrays they point to */

    /* The index is a size_t so that it matches the type of n: this avoids the
       signed/unsigned comparison and the overflow an int counter would hit
       once n exceeds INT_MAX. */
    for (size_t i = 0; i < n; i += 2) {
        __m128d xs = _mm_load_pd(&s.x[i]);
        __m128d ys = _mm_load_pd(&s.y[i]);
        _mm_store_pd(&s.z[i], _mm_mul_pd(xs, ys));
    }
}
Assembly:
_Z1fR1Sm:                       # f(S&, unsigned long): %rdi = &args, %rsi = n
.LFB525:
.cfi_startproc
xorl %eax, %eax                 # i = 0
testq %rsi, %rsi
movq (%rdi), %r8                # (1) load x once, before the loop
movq 8(%rdi), %rcx              # (2) load y once, before the loop
movq 16(%rdi), %rdx             # (3) load z once, before the loop
je .L1                          # n == 0: skip the loop (movq leaves the flags from testq intact)
.L5:
movapd (%r8,%rax,8), %xmm0      # xmm0 = x[i], x[i+1]
mulpd (%rcx,%rax,8), %xmm0      # xmm0 *= y[i], y[i+1]
movaps %xmm0, (%rdx,%rax,8)     # z[i], z[i+1] = xmm0
addq $2, %rax                   # i += 2
cmpq %rax, %rsi
ja .L5                          # loop while n > i (unsigned)
.L1:
rep ret
Please note that, unlike in the previous code, the three pointer loads (1), (2), (3) — the movq instructions — are now outside the loop.
I would appreciate an explanation of why these two assembly listings differ. Is memory aliasing relevant here? Thanks.
$gcc --version
gcc (Debian 5.2.1-21) 5.2.1 20151003