(Update: new answer from 2019-01-31)
Three alternatives:
Great answer by Peter Cordes. Fast. This solution is not branchless, which should not be a problem unless the input is frequently zero with an irregular pattern of occurrences.
My previous answer, which is now in the edit history of this answer. Less efficient than Peter Cordes' answer, but branchless.
This answer. Very fast if the data from the two tiny lookup tables is in the L1 cache. The L1 cache footprint is 128 bytes. Branchless. It may suffer from cache misses when it is not called frequently.
In this answer, the epi64 elements of the input vector are compared with zero, which produces a mask. This mask is converted to a 4-bit index i_mask (by _mm256_movemask_pd). With index i_mask, two values are read from the two tiny lookup tables: 1. the index of the first (from the left) non-zero 64-bit element, and 2. the number of leading zeros contributed by the preceding (from left to right) zero elements. Finally, the _lzcnt_u64 of the first non-zero 64-bit element is computed and added to the lookup table value. The function mm256_lzcnt_si256 implements this method:
#include <stdio.h>
#include <stdint.h>
#include <x86intrin.h>
#include <stdalign.h>
/* gcc -Wall -m64 -O3 -march=haswell clz_avx256_upd.c */

int mm256_lzcnt_si256(__m256i input)
{
    /* Version with lookup tables and scratch array included in the function.                                                */

    /* Two tiny lookup tables (64 bytes each, less space is possible with uint8_t or uint16_t arrays instead of uint32_t):   */
    /* i_mask  (input==0)  0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111                   */
    /* ~i_mask (input!=0)  1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000                   */
    static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
    static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};

    alignas(32) uint64_t tmp[4] = { 0, 0, 0, 0};                             /* tmp is a scratch array of 32 bytes, preferably 32 byte aligned   */

    _mm256_storeu_si256((__m256i*)&tmp[0], input);                           /* Store input in the scratch array                                 */
    __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64-bit elements are zero                             */
    uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
    uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64-bit element input_i   */
    int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
    int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64-bit elements */
    return lz;
}


int mm256_lzcnt_si256_v2(__m256i input, uint64_t* restrict tmp, const uint32_t* indx, const uint32_t* lz_msk)
{
    /* Version that compiles to nice assembly, although, after inlining there won't be any difference between the different versions. */
    _mm256_storeu_si256((__m256i*)&tmp[0], input);                           /* Store input in the scratch array                                 */
    __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64-bit elements are zero                             */
    uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
    uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64-bit element input_i   */
    int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
    int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64-bit elements */
    return lz;
}


__m256i bit_mask_avx2_lsb(unsigned int n)
{
    __m256i ones       = _mm256_set1_epi32(-1);
    __m256i cnst32_256 = _mm256_set_epi32(256,224,192,160, 128,96,64,32);
    __m256i shift      = _mm256_set1_epi32(n);
            shift      = _mm256_subs_epu16(cnst32_256, shift);
    return _mm256_srlv_epi32(ones, shift);
}


int print_avx2_hex(__m256i ymm)
{
    long unsigned int x[4];
    _mm256_storeu_si256((__m256i*)x, ymm);
    printf("%016lX %016lX %016lX %016lX ", x[3], x[2], x[1], x[0]);
    return 0;
}


int main()
{
    unsigned int i;
    __m256i x;

    printf("mm256_lzcnt_si256\n");
    for (i = 0; i < 257; i++){
        printf("x=");
        x = bit_mask_avx2_lsb(i);
        print_avx2_hex(x);
        printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    }
    printf("\n");

    x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
    x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));

    /* Set arrays for mm256_lzcnt_si256_v2:                                  */
    alignas(32) static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
    alignas(32) static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};
    alignas(32) uint64_t tmp[4] = { 0, 0, 0, 0};

    printf("\nmm256_lzcnt_si256_v2\n");
    for (i = 0; i < 257; i++){
        printf("x=");
        x = bit_mask_avx2_lsb(i);
        print_avx2_hex(x);
        printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    }
    printf("\n");

    x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
    x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
    printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));

    return 0;
}
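The two tables follow directly from the meaning of i_mask (bit e is set when 64-bit element e is zero; element 3 is the leftmost). As a small stand-alone sketch, added here for illustration and not part of the answer's tested code above, the same 16 entries can be derived like this:

#include <stdio.h>

/* Derive indx[] and lz_msk[] from the 4-bit i_mask, for all 16 mask values.      */
int main(void)
{
    for (int m = 0; m < 16; m++){
        int idx = 0, lz = 192;                 /* defaults cover the all-zero case (m == 15)       */
        for (int e = 3; e >= 0; e--){          /* scan elements from most to least significant     */
            if (!(m & (1 << e))){              /* bit e clear: element e is the first non-zero one */
                idx = e;
                lz  = (3 - e) * 64;            /* every skipped zero element adds 64 leading zeros */
                break;
            }
        }
        printf("i_mask=%2d  indx=%d  lz_msk=%3d\n", m, idx, lz);
    }
    return 0;
}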
The output suggests that the code is correct:
$ ./a.out
mm256_lzcnt_si256
x=0000000000000000 0000000000000000 0000000000000000 0000000000000000 lzcnt(x)=256
x=0000000000000000 0000000000000000 0000000000000000 0000000000000001 lzcnt(x)=255
...
x=0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF lzcnt(x)=129
x=0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF lzcnt(x)=128
x=0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF lzcnt(x)=127
...
x=7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF lzcnt(x)=1
x=FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF lzcnt(x)=0

x=0000000000000000 0000000000000000 000000000000000F 0000000100000000 lzcnt(x)=188
x=0000000000000000 0000000000000008 0000000000000000 0000000000000100 lzcnt(x)=124
x=0000000000000100 0000000000000008 00000000000000C0 0000000000000000 lzcnt(x)=55
x=FFFFFFFF00000100 0000000000000008 0000000000000000 0000002000000000 lzcnt(x)=0
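To trace one of these results by hand (my own walkthrough, not part of the original output): for x=0000000000000100 0000000000000008 00000000000000C0 0000000000000000, only the least significant 64-bit element is zero, so i_mask = 0b0001, which selects indx[1] = 3 and lz_msk[1] = 0. The selected element is 0x100, whose _lzcnt_u64 is 55, so the total is 0 + 55 = 55, matching the printed lzcnt(x)=55.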
Function mm256_lzcnt_si256_v2 is an alternative version of the same function, but now the pointers to the lookup tables and the scratch array are passed with the function call. This leads to clean assembly code (no stack operations) and gives an impression of which instructions are needed after inlining mm256_lzcnt_si256 in a loop.
With gcc 8.2 and the options -m64 -O3 -march=skylake:
mm256_lzcnt_si256_v2:
        vpxor           xmm1, xmm1, xmm1
        vmovdqu         YMMWORD PTR [rdi], ymm0
        vpcmpeqq        ymm0, ymm0, ymm1
        vmovmskpd       ecx, ymm0
        mov             eax, DWORD PTR [rsi+rcx*4]
        lzcnt           rax, QWORD PTR [rdi+rax*8]
        add             eax, DWORD PTR [rdx+rcx*4]
        vzeroupper
        ret
In a loop context, and with inlining, the vpxor is likely hoisted outside the loop.
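As a rough sketch of that point (my addition, assuming a hypothetical array-scanning use case; lzcnt_array is not part of the original answer), inlining at -O3 should let the compiler set up the zero register, the lookup tables and the scratch buffer once, outside the loop:

#include <stdint.h>
#include <x86intrin.h>

int mm256_lzcnt_si256(__m256i input);                   /* as defined above                     */

/* Hypothetical usage: leading-zero count of every 256-bit vector in an array.                  */
/* After inlining, only the store, compare, movemask, two loads, lzcnt and add remain in the    */
/* loop body; the vpxor and the table addresses are loop-invariant.                             */
void lzcnt_array(const __m256i* v, int32_t* lz_out, int n)
{
    for (int i = 0; i < n; i++){
        __m256i x = _mm256_loadu_si256(&v[i]);          /* unaligned load, to be safe           */
        lz_out[i] = mm256_lzcnt_si256(x);               /* expected to inline at -O3            */
    }
}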