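So your source data is contiguous? Then load it with _mm_load_si128 instead of assembling the vector from scalar elements.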
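The real problem is packing an array of bool (1 byte per element in the ABI used by g++ on x86) into bits. Do this with SIMD, rather than with scalar code that sets 1 bit or byte at a time.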
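pmovmskb (_mm_movemask_epi8) is ideal for extracting one bit from each byte of the input. You just need to arrange for the bit you want to be the high bit of each byte.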
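The obvious choice would be a shift, but vector shifts compete with pmovmskb for the same execution port on Haswell (port 0). (See http://agner.org/optimize/). Instead, adding 0x7F produces 0x80 (high bit set) for an input of 1, but 0x7F (high bit clear) for an input of 0. (And a bool in the x86-64 System V ABI must be stored in memory as an integer 0 or 1, not merely 0 versus any non-zero value.)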
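Why not pcmpeqb against _mm_set1_epi8(1)? Skylake runs pcmpeqb on ports 0/1, but paddb on all 3 vector ALU ports (0/1/5). That said, it is very common to use pmovmskb on the result of pcmpeqb/w/d/q.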
#include <immintrin.h>
#include <stdint.h>
/*
 * Pack an array of bool (one byte per element, each holding exactly 0 or 1)
 * into a bitmask, 16 bools per 16-bit output word, using SSE2.
 *
 *   dst  receives n words; bit k of dst[i] corresponds to src[i*16 + k].
 *   src  must have at least n*16 readable elements; no alignment is required
 *        (an unaligned load is used).
 *   n    number of 16-bool groups, i.e. the number of words written to dst.
 *        It is NOT the number of bools; any tail shorter than 16 elements
 *        has to be handled by the caller.
 *
 * The add-0x7F trick relies on every source byte being exactly 0 or 1.
 * Bytes holding other values give results that are not a simple
 * zero/non-zero test (e.g. 0x81 + 0x7F wraps to 0x00, high bit clear).
 */
void pack_bools(uint16_t *dst, const bool *src, size_t n)
{
    // 1 + 0x7F = 0x80 (high bit set); 0 + 0x7F = 0x7F (high bit clear).
    const __m128i carry_to_highbit = _mm_set1_epi8(0x7F);

    for (size_t i = 0; i < n; ++i) {
        // Keep the const qualifier on the source pointer: the load only reads.
        const __m128i boolvec =
            _mm_loadu_si128((const __m128i *)(src + i * 16));

        // Move each bool's value into the high bit of its byte.
        const __m128i highbits = _mm_add_epi8(boolvec, carry_to_highbit);

        // movemask gathers the 16 high bits into bits 0..15 of an int;
        // the upper bits are zero, so the narrowing cast loses nothing.
        dst[i] = (uint16_t)_mm_movemask_epi8(highbits);
    }
}
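Since each 128-bit vector yields 16 mask bits, dst is an array of uint16_t. With AVX2 you would want uint32_t. (Or combine two pmovmskb results, e.g. combine = tmp1 << 16 | tmp. It probably doesn't matter much, though.)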
This compiles to an asm loop like the following (gcc7.3 -O3, from the Godbolt compiler explorer):
# Inner loop only (GAS Intel syntax); setup code before the loop is not shown.
# Register roles as used by the instructions below:
#   rsi  = read pointer, advanced 16 bytes per iteration
#   rdi  = write pointer, advanced 2 bytes per iteration
#   rdx  = value rsi is compared against to end the loop
#          (presumably the end-of-source pointer computed before the loop)
#   xmm1 = loop-invariant byte-wise addend
#          (presumably the 0x7F broadcast from the C source)
.L3:
movdqu xmm0, XMMWORD PTR [rsi]   # unaligned 16-byte load of the next source bytes
add rsi, 16                      # advance the read pointer past them
add rdi, 2                       # advance the write pointer early; the store below compensates with -2
paddb xmm0, xmm1                 # byte-wise add of the invariant constant
pmovmskb eax, xmm0               # collect the high bit of each byte into the low 16 bits of eax
mov WORD PTR [rdi-2], ax         # store the 16-bit mask into the slot just stepped over
cmp rdx, rsi                     # has the read pointer reached the bound?
jne .L3                          # no: process the next 16 bytes
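So it's not wonderful (7 fused-domain uops → a front-end bottleneck at 16 bools per ~1.75 clock cycles). Clang unrolls by 2, and should manage 16 bools per 1.5 cycles.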
Using a shift (pslld xmm0, 7) would only run at one iteration per 2 cycles on Haswell, bottlenecked on port 0.