I was playing around with intrinsics, because I needed an O(1) function similar to memcmp() for a fixed input size. I ended up writing this:
#include <stdint.h>
#include <emmintrin.h>
/*
 * Branch-free equality check of two 4-element int64_t arrays (32 bytes each).
 *
 * a, b: the two arrays to compare; each is read as two 128-bit halves.
 * Returns 0 when every bit of a matches b, and a nonzero value otherwise.
 * The result is an OR of XOR differences, so unlike memcmp() its sign and
 * magnitude say nothing about ordering.
 *
 * NOTE(review): x[i] / y[i] dereference a and b as __m128i; the generated
 * code shown after this function uses movdqa for those loads, which needs
 * 16-byte-aligned addresses. int64_t[4] only promises 8 -- confirm callers
 * align their buffers.
 */
int64_t f (int64_t a[4], int64_t b[4]) {
/* View both inputs as pairs of 128-bit vectors; r holds the per-half XORs, t the combined result. */
__m128i *x = (void *) a, *y = (void *) b, r[2], t;
/* ret aliases t so its two 64-bit halves can be read as scalars. */
int64_t *ret = (void *) &t;
/* XOR leaves a set bit wherever the two inputs differ. */
r[0] = _mm_xor_si128(x[0], y[0]);
r[1] = _mm_xor_si128(x[1], y[1]);
/* Fold the two difference vectors into one. */
t = _mm_or_si128(r[0], r[1]);
/* Fold the two 64-bit halves; zero only if no bit differed anywhere. */
return (ret[0] | ret[1]);
}
which when compiled turns into this:
# Compiler output for f(a, b), intrinsics version. Intel operand order (dst, src).
# In:    rdi = a, rsi = b (matches SysV AMD64 argument order -- NOTE(review): confirm target ABI)
# Out:   rax = 0 if the two 32-byte inputs are identical, nonzero otherwise
# Clobb: rdx, xmm0, xmm1, flags
# movdqa is the aligned 128-bit load, so both pointers must be 16-byte aligned.
f:
movdqa xmm0, XMMWORD PTR [rdi]    # xmm0 = a[0..1]
movdqa xmm1, XMMWORD PTR [rdi+16]    # xmm1 = a[2..3]
pxor xmm0, XMMWORD PTR [rsi]    # xmm0 = a[0..1] ^ b[0..1]
pxor xmm1, XMMWORD PTR [rsi+16]    # xmm1 = a[2..3] ^ b[2..3]
por xmm0, xmm1    # xmm0 = OR of both difference vectors
movq rdx, xmm0    # rdx = low 64 bits of the combined vector
pextrq rax, xmm0, 1    # rax = high 64 bits of the combined vector
or rax, rdx    # rax = low | high; zero only if nothing differed
ret
http://goo.gl/EtovJa (Godbolt Compiler Explorer)
After that, I became curious whether I really needed to use the intrinsic functions, or whether I only needed the vector types and could just use the regular operators. So I modified the code above (only the three SSE lines), which resulted in this:
#include <stdint.h>
#include <emmintrin.h>
/*
 * Same branch-free 32-byte equality check, but written with the plain
 * ^ and | operators applied directly to __m128i values instead of the
 * _mm_xor_si128 / _mm_or_si128 intrinsics.
 *
 * a, b: the two arrays to compare; each is read as two 128-bit halves.
 * Returns 0 when every bit of a matches b, and a nonzero value otherwise
 * (no ordering information, unlike memcmp()).
 *
 * NOTE(review): applying ^ and | to __m128i operands relies on the compiler
 * supporting operators on vector types -- confirm for any compiler other
 * than the one used here.
 * NOTE(review): x[i] / y[i] dereference a and b as __m128i; the generated
 * code shown after this function uses movdqa, which needs 16-byte-aligned
 * addresses -- confirm callers align their buffers.
 */
int64_t f (int64_t a[4], int64_t b[4]) {
/* View both inputs as pairs of 128-bit vectors; r holds the per-half XORs, t the combined result. */
__m128i *x = (void *) a, *y = (void *) b, r[2], t;
/* ret aliases t so its two 64-bit halves can be read as scalars. */
int64_t *ret = (void *) &t;
/* XOR leaves a set bit wherever the two inputs differ. */
r[0] = x[0] ^ y[0];
r[1] = x[1] ^ y[1];
/* Fold the two difference vectors into one. */
t = r[0] | r[1];
/* Fold the two 64-bit halves; zero only if no bit differed anywhere. */
return (ret[0] | ret[1]);
}
which instead compiles to:
# Compiler output for f(a, b), plain-operator version. Intel operand order (dst, src).
# In:    rdi = a, rsi = b (matches SysV AMD64 argument order -- NOTE(review): confirm target ABI)
# Out:   rax = 0 if the two 32-byte inputs are identical, nonzero otherwise
# Clobb: rdx, xmm0, xmm1, flags
# Same instruction sequence as the intrinsics version; only the order of the
# two loads (and hence which half lands in xmm0 vs xmm1) is swapped.
f:
movdqa xmm0, XMMWORD PTR [rdi+16]    # xmm0 = a[2..3]
movdqa xmm1, XMMWORD PTR [rdi]    # xmm1 = a[0..1]
pxor xmm0, XMMWORD PTR [rsi+16]    # xmm0 = a[2..3] ^ b[2..3]
pxor xmm1, XMMWORD PTR [rsi]    # xmm1 = a[0..1] ^ b[0..1]
por xmm0, xmm1    # xmm0 = OR of both difference vectors
movq rdx, xmm0    # rdx = low 64 bits of the combined vector
pextrq rax, xmm0, 1    # rax = high 64 bits of the combined vector
or rax, rdx    # rax = low | high; zero only if nothing differed
ret
http://goo.gl/oDHF3z (Godbolt Compiler Explorer)
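As far as I can tell (AFAICT), the two versions compile to the same instructions; only the order of the two loads differs. So, is there any downside to using the plain operators on the vector types instead of the intrinsics?
Both listings above were produced with GCC.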