Count leading zeros in __m256i word

I'm working with AVX-2 intrinsics, and I'm looking for a fast way to count the number of leading zero bits in a __m256i word (which has 256 bits).

So far I have figured out the following way:

    // Computes the number of leading zero bits.
    // Here, avx_word is of type _m256i.
    if (!_mm256_testz_si256(avx_word, avx_word)) {
      uint64_t word = _mm256_extract_epi64(avx_word, 0);
      if (word > 0)
        return (__builtin_clzll(word));

      word = _mm256_extract_epi64(avx_word, 1);
      if (word > 0)
        return (__builtin_clzll(word) + 64);

      word = _mm256_extract_epi64(avx_word, 2);
      if (word > 0)
        return (__builtin_clzll(word) + 128);

      word = _mm256_extract_epi64(avx_word, 3);
      return (__builtin_clzll(word) + 192);
    } else
      return 256; // word is entirely zero

However, I find it rather clumsy to figure out the exact non-zero 64-bit word within the 256-bit register.

Does anyone know if there is a more elegant (or faster) way to do this?

Some additional information: I actually want to compute the index of the first set bit for arbitrarily long vectors created by logical ANDs, and I am comparing the performance of standard 64-bit operations with SSE and AVX-2 code. Here is my entire test code:

    #include <stdio.h>
    #include <stdlib.h>
    #include <immintrin.h>
    #include <stdint.h>
    #include <assert.h>
    #include <time.h>
    #include <sys/time.h>
    #include <stdalign.h>

    #define ALL  0xFFFFFFFF
    #define NONE 0x0

    #define BV_SHIFTBITS ((size_t)    6)
    #define BV_MOD_WORD  ((size_t)   63)
    #define BV_ONE       ((uint64_t)  1)
    #define BV_ZERO      ((uint64_t)  0)
    #define BV_WORDSIZE  ((uint64_t) 64)


    uint64_t* Vector_new(size_t num_bits) {
      assert((num_bits % 256) == 0);
      size_t num_words = num_bits >> BV_SHIFTBITS;
      size_t mod = num_bits & BV_MOD_WORD;
      if (mod > 0)
        assert(0);
      uint64_t* words;
      posix_memalign((void**) &(words), 32, sizeof(uint64_t) * num_words);
      for (size_t i = 0; i < num_words; ++i)
        words[i] = 0;
      return words;
    }


    void Vector_set(uint64_t* vector, size_t pos) {
      const size_t word_index = pos >> BV_SHIFTBITS;
      const size_t offset     = pos & BV_MOD_WORD;
      vector[word_index] |= (BV_ONE << (BV_MOD_WORD - offset));
    }


    size_t Vector_and_first_bit(uint64_t** vectors,
                                const size_t num_vectors,
                                const size_t num_words) {
      for (size_t i = 0; i < num_words; ++i) {
        uint64_t word = vectors[0][i];
        for (size_t j = 1; j < num_vectors; ++j)
          word &= vectors[j][i];
        if (word > 0)
          return (1 + i * BV_WORDSIZE + __builtin_clzll(word));
      }
      return 0;
    }


    size_t Vector_and_first_bit_256(uint64_t** vectors,
                                    const size_t num_vectors,
                                    const size_t num_avx_words) {
      for (size_t i = 0; i < num_avx_words; ++i) {
        const size_t addr_offset = i << 2;
        __m256i avx_word = _mm256_load_si256(
            (__m256i const*) (vectors[0] + addr_offset));

        // AND the AVX words
        for (size_t j = 1; j < num_vectors; ++j) {
          avx_word = _mm256_and_si256(
            avx_word,
            _mm256_load_si256((__m256i const*) (vectors[j] + addr_offset))
          );
        }

        // test whether resulting AVX word is not zero
        if (!_mm256_testz_si256(avx_word, avx_word)) {
          uint64_t word = _mm256_extract_epi64(avx_word, 0);
          const size_t shift = i << 8;
          if (word > 0)
            return (1 + shift + __builtin_clzll(word));

          word = _mm256_extract_epi64(avx_word, 1);
          if (word > 0)
            return (1 + shift + __builtin_clzll(word) + 64);

          word = _mm256_extract_epi64(avx_word, 2);
          if (word > 0)
            return (1 + shift + __builtin_clzll(word) + 128);

          word = _mm256_extract_epi64(avx_word, 3);
          return (1 + shift + __builtin_clzll(word) + 192);
        }
      }
      return 0;
    }


    size_t Vector_and_first_bit_128(uint64_t** vectors,
                                    const size_t num_vectors,
                                    const size_t num_avx_words) {
      for (size_t i = 0; i < num_avx_words; ++i) {
        const size_t addr_offset = i << 1;
        __m128i avx_word = _mm_load_si128(
            (__m128i const*) (vectors[0] + addr_offset));

        // AND the AVX words
        for (size_t j = 1; j < num_vectors; ++j) {
          avx_word = _mm_and_si128(
            avx_word,
            _mm_load_si128((__m128i const*) (vectors[j] + addr_offset))
          );
        }

        // test whether resulting AVX word is not zero
        if (!_mm_test_all_zeros(avx_word, avx_word)) {
          uint64_t word = _mm_extract_epi64(avx_word, 0);
          if (word > 0)
            return (1 + (i << 7) + __builtin_clzll(word));

          word = _mm_extract_epi64(avx_word, 1);
          return (1 + (i << 7) + __builtin_clzll(word) + 64);
        }
      }
      return 0;
    }


    uint64_t* make_random_vector(const size_t num_bits, const size_t propability) {
      uint64_t* vector = Vector_new(num_bits);
      for (size_t i = 0; i < num_bits; ++i) {
        const int x = rand() % 10;
        if (x >= (int) propability)
          Vector_set(vector, i);
      }
      return vector;
    }


    size_t millis(const struct timeval* end, const struct timeval* start) {
      struct timeval e = *end;
      struct timeval s = *start;
      return (1000 * (e.tv_sec - s.tv_sec) + (e.tv_usec - s.tv_usec) / 1000);
    }


    int main(int argc, char** argv) {
      if (argc != 6)
        printf("fuck %s\n", argv[0]);

      srand(time(NULL));

      const size_t num_vectors    = atoi(argv[1]);
      const size_t size           = atoi(argv[2]);
      const size_t num_iterations = atoi(argv[3]);
      const size_t num_dimensions = atoi(argv[4]);
      const size_t propability    = atoi(argv[5]);

      const size_t num_words      = size / 64;
      const size_t num_sse_words  = num_words / 2;
      const size_t num_avx_words  = num_words / 4;

      assert(num_vectors > 0);
      assert(size > 0);
      assert(num_iterations > 0);
      assert(num_dimensions > 0);

      struct timeval t1;
      gettimeofday(&t1, NULL);

      uint64_t*** vectors = (uint64_t***) malloc(sizeof(uint64_t**) * num_vectors);
      for (size_t j = 0; j < num_vectors; ++j) {
        vectors[j] = (uint64_t**) malloc(sizeof(uint64_t*) * num_dimensions);
        for (size_t i = 0; i < num_dimensions; ++i)
          vectors[j][i] = make_random_vector(size, propability);
      }

      struct timeval t2;
      gettimeofday(&t2, NULL);
      printf("Creation: %zu ms\n", millis(&t2, &t1));

      size_t* results_64  = (size_t*) malloc(sizeof(size_t) * num_vectors);
      size_t* results_128 = (size_t*) malloc(sizeof(size_t) * num_vectors);
      size_t* results_256 = (size_t*) malloc(sizeof(size_t) * num_vectors);

      gettimeofday(&t1, NULL);
      for (size_t j = 0; j < num_iterations; ++j)
        for (size_t i = 0; i < num_vectors; ++i)
          results_64[i] = Vector_and_first_bit(vectors[i], num_dimensions, num_words);
      gettimeofday(&t2, NULL);
      const size_t millis_64 = millis(&t2, &t1);
      printf("64  : %zu ms\n", millis_64);

      gettimeofday(&t1, NULL);
      for (size_t j = 0; j < num_iterations; ++j)
        for (size_t i = 0; i < num_vectors; ++i)
          results_128[i] = Vector_and_first_bit_128(vectors[i], num_dimensions, num_sse_words);
      gettimeofday(&t2, NULL);
      const size_t millis_128 = millis(&t2, &t1);
      const double factor_128 = (double) millis_64 / (double) millis_128;
      printf("128 : %zu ms (factor: %.2f)\n", millis_128, factor_128);

      gettimeofday(&t1, NULL);
      for (size_t j = 0; j < num_iterations; ++j)
        for (size_t i = 0; i < num_vectors; ++i)
          results_256[i] = Vector_and_first_bit_256(vectors[i], num_dimensions, num_avx_words);
      gettimeofday(&t2, NULL);
      const size_t millis_256 = millis(&t2, &t1);
      const double factor_256 = (double) millis_64 / (double) millis_256;
      printf("256 : %zu ms (factor: %.2f)\n", millis_256, factor_256);

      for (size_t i = 0; i < num_vectors; ++i) {
        if (results_64[i] != results_256[i])
          printf("ERROR: %zu (64) != %zu (256) with i = %zu\n",
                 results_64[i], results_256[i], i);
        if (results_64[i] != results_128[i])
          printf("ERROR: %zu (64) != %zu (128) with i = %zu\n",
                 results_64[i], results_128[i], i);
      }

      free(results_64);
      free(results_128);
      free(results_256);

      for (size_t j = 0; j < num_vectors; ++j) {
        for (size_t i = 0; i < num_dimensions; ++i)
          free(vectors[j][i]);
        free(vectors[j]);
      }
      free(vectors);
      return 0;
    }

Compile:

 gcc -o main main.c -O3 -Wall -Wextra -pedantic-errors -Werror -march=native -std=c99 -fno-tree-vectorize 

For execution:

 ./main 1000 8192 50000 5 9 

The parameters mean: 1000 test cases, vectors of length 8192 bits, and 50,000 test repetitions (the last two parameters are minor tweaks).

Sample output for the above call on my machine:

    Creation: 363 ms
    64  : 15000 ms
    128 : 10070 ms (factor: 1.49)
    256 : 6784 ms (factor: 2.21)
4 answers

If your input values are uniformly distributed, almost all of the time the highest set bit will be in the top 64 bits of the vector (1 in 2^64 that it isn't). A branch on that condition will predict very well. @Nejc's answer is good for that case.

But many problems where lzcnt is part of the solution have a uniformly distributed output (or similar), so a branchless version has an advantage. Not strictly uniform, but anything where it is common for the highest set bit to be somewhere other than the highest 64 bits.


Wim's idea of lzcnt on a compare bitmap to find the right element is a very good approach.

However, runtime-variable indexing of the vector with a store/reload is probably better than a shuffle. Store-forwarding latency is low (maybe 5 to 7 cycles on Skylake), and that latency is in parallel with the index generation (compare / movemask / lzcnt). The movd/vpermd/movd lane-crossing shuffle strategy takes 5 cycles after the index is known to get the right element into an integer register. (See http://agner.org/optimize/ )
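
For illustration, this is roughly what that shuffle-based extraction could look like (a sketch of the alternative being compared, not the code recommended below; the helper name is made up):

    #include <stdint.h>
    #include <immintrin.h>

    // Illustration only: extract dword number `idx` (0..7) from a __m256i with a
    // lane-crossing shuffle instead of a store/reload.  This is the
    // movd/vpermd/movd pattern discussed above (broadcasting a variable index
    // may additionally cost a vpbroadcastd).
    static inline uint32_t extract_dword_vpermd(__m256i vec, unsigned idx)
    {
        __m256i idx_vec  = _mm256_set1_epi32((int)idx);                // broadcast the lane index
        __m256i shuffled = _mm256_permutevar8x32_epi32(vec, idx_vec);  // vpermd: lane-crossing shuffle
        return (uint32_t)_mm256_cvtsi256_si32(shuffled);               // vmovd: low dword to integer
    }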

I think the store/reload version below should have better latency on Haswell/Skylake (and Ryzen), and also better throughput. (vpermd is quite slow on Ryzen, so it should be very good there.) The address calculation for the load should have similar latency to store-forwarding, so it is a toss-up which one is actually the critical path.

Aligning the stack by 32 to avoid cache-line splits on the 32-byte store takes extra instructions, so this is best if it can inline into a function that uses it multiple times, or if it already needs that much alignment for some other __m256i.

    #include <stdint.h>
    #include <immintrin.h>

    #ifndef _MSC_VER
    #include <stdalign.h>  //MSVC is missing this?
    #else
    #include <intrin.h>
    #pragma intrinsic(_BitScanReverse)  // https://msdn.microsoft.com/en-us/library/fbxyd7zd.aspx suggests this
    #endif

    // undefined result for mask=0, like BSR
    uint32_t bsr_nonzero(uint32_t mask)
    {
        // on Intel, bsr has a minor advantage for the first step
        // for AMD, BSR is slow so you should use 31-LZCNT.

        //return 31 - _lzcnt_u32(mask);
        // Intel docs say there should be a _bit_scan_reverse(x), maybe try that with ICC

        #ifdef _MSC_VER
          unsigned long tmp;
          _BitScanReverse(&tmp, mask);
          return tmp;
        #else
          return 31 - __builtin_clz(mask);
        #endif
    }

And the interesting part:

    int mm256_lzcnt_si256(__m256i vec)
    {
        __m256i  nonzero_elem = _mm256_cmpeq_epi8(vec, _mm256_setzero_si256());
        unsigned mask = ~_mm256_movemask_epi8(nonzero_elem);

        if (mask == 0)
            return 256;  // if this is rare, branching is probably good.

        alignas(32)  // gcc chooses to align elems anyway, with its clunky code
        uint8_t elems[32];
        _mm256_storeu_si256((__m256i*)elems, vec);

    //  unsigned lz_msk = _lzcnt_u32(mask);
    //  unsigned idx    = 31 - lz_msk;    // can use bsr to get the 31-x, because mask is known to be non-zero.
    //  This takes the 31-x latency off the critical path, in parallel with final lzcnt
        unsigned idx    = bsr_nonzero(mask);
        unsigned lz_msk = 31 - idx;
        unsigned highest_nonzero_byte = elems[idx];
        return lz_msk * 8 + _lzcnt_u32(highest_nonzero_byte) - 24;
               // lzcnt(byte)-24, because we don't want to count the leading 24 bits of padding.
    }

On Godbolt with gcc 7.3 -O3 -march=haswell, we get asm like this to count ymm1 into esi.

        vpxor   xmm0, xmm0, xmm0
        mov     esi, 256
        vpcmpeqd        ymm0, ymm1, ymm0
        vpmovmskb       eax, ymm0
        xor     eax, -1                      # ~mask and set flags, unlike NOT
        je      .L35
        bsr     eax, eax
        vmovdqa YMMWORD PTR [rbp-48], ymm1   # note no dependency on anything earlier; OoO exec can run it early
        mov     ecx, 31
        mov     edx, eax                     # this is redundant, gcc should just use rax later. But it's zero-latency on HSW/SKL and Ryzen.
        sub     ecx, eax
        movzx   edx, BYTE PTR [rbp-48+rdx]   # has to wait for the index in edx
        lzcnt   edx, edx
        lea     esi, [rdx-24+rcx*8]          # lzcnt(byte) + lzcnt(vectormask) * 8
    .L35:

To find the highest non-zero element (the 31 - lzcnt(~movemask)), we use bsr to get the bit (and thus byte) index directly, and take a subtraction off the critical path. This is safe as long as we branch on the mask being zero. (A branchless version would need to initialize the register to avoid an out-of-bounds index.)
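
As a toy illustration of that index arithmetic (not part of the function above): for a non-zero 32-bit mask, bsr(mask) equals 31 - lzcnt(mask), so taking the BSR result directly as the byte index moves the 31 - x subtraction off the path that feeds the load:

    #include <stdint.h>
    #include <assert.h>

    int main(void)
    {
        uint32_t mask   = (1u << 17) | (1u << 3);    // pretend bytes 3 and 17 of the vector are non-zero
        unsigned idx    = 31 - __builtin_clz(mask);  // == bsr(mask) == 17: index of the highest non-zero byte
        unsigned lz_msk = 31 - idx;                  // 14 all-zero bytes above byte 17 -> 14*8 leading zero bits
        assert(idx == 17 && lz_msk == 14);           // final result would be lz_msk*8 + lzcnt(elems[idx]) - 24
        return 0;
    }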

On AMD CPUs, bsr is significantly slower than lzcnt. On Intel CPUs, they have the same performance, except for minor variations in output-dependency details.

bsr with an input of zero leaves the destination register unmodified, but GCC provides no way to take advantage of that. (Intel only documents the output as undefined, but AMD documents the actual behaviour of Intel / AMD CPUs as producing the old value in the destination register.)

bsr sets ZF if the input was zero, rather than based on the output like most instructions. (This and the output dependency may be why it is slow on AMD.) Branching on the BSR flags is not particularly better than branching on ZF as set by xor eax,-1 to invert the mask, which is what gcc does. Anyway, Intel does document a _BitScanReverse(&idx, mask) intrinsic that returns a bool, but gcc does not support it (not even with x86intrin.h). The GNU C builtin does not return a boolean that would let you use the flag result, but perhaps gcc would make smart asm using the flag output of bsr if you check that the input C variable is non-zero.


Using a dword ( uint32_t ) array and vmovmskps would let the second lzcnt use a memory source operand instead of needing a movzx to zero-extend a single byte. But lzcnt has a false dependency on Intel CPUs before Skylake, so compilers might tend to load separately and use lzcnt same,same as a workaround anyway. (I did not check.)
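
For reference, a sketch of what that dword variant could look like (an untested illustration of the idea, with a made-up name; the byte version above is the one actually shown on Godbolt):

    #include <stdint.h>
    #include <stdalign.h>
    #include <immintrin.h>

    // Same idea as mm256_lzcnt_si256 above, but with 32-bit elements and an
    // 8-bit mask from vmovmskps, so the final lzcnt works on a full dword
    // (no -24 fixup needed).
    int mm256_lzcnt_si256_dword(__m256i vec)
    {
        __m256i  zero_elem = _mm256_cmpeq_epi32(vec, _mm256_setzero_si256());
        unsigned mask = ~(unsigned)_mm256_movemask_ps(_mm256_castsi256_ps(zero_elem)) & 0xFFu;
        if (mask == 0)
            return 256;

        alignas(32) uint32_t elems[8];
        _mm256_storeu_si256((__m256i*)elems, vec);

        unsigned idx    = 31 - __builtin_clz(mask);   // highest non-zero dword (mask is known non-zero)
        unsigned lz_msk = 7 - idx;                    // number of all-zero dwords above it
        return lz_msk * 32 + _lzcnt_u32(elems[idx]);  // 32-bit lzcnt of the selected element
    }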

Wim's version needs lz_msk-24 because the high 24 bits are always zero with an 8-bit mask. But a 32-bit mask fills a 32-bit register.

This version, with 8-bit elements and a 32-bit mask, is the reverse: we need to lzcnt the selected byte, not including the 24 leading zero bits in the register. So our -24 moves to a different spot, not part of the critical path for indexing the array.

gcc chooses to do it as part of a single 3-component LEA ( reg + reg*scale - const ), which is great for throughput but puts it on the critical path after the final lzcnt. (It is not free, because a 3-component LEA has extra latency vs. reg + reg*scale on Intel CPUs. See Agner Fog's instruction tables.)

A multiply by 8 can be done as part of an lea, but a multiply by 32 needs a shift (or has to be folded into two separate LEAs).


Intel's optimization manual says (Table 2-24) that even Sandybridge can forward from a 256-bit store to single-byte loads without a problem, so I think it is fine on AVX2 CPUs, the same as forwarding to 32-bit loads from 4-byte-aligned chunks of the store.


(Update: new answer from 2019-01-31)

Three alternatives:

  • Peter Cordes' excellent answer. Fast. This solution is not branchless, which should not be a problem unless the input is frequently zero with an irregular pattern of occurrences.

  • My previous answer, which is now in the edit history of this answer. Less efficient than Peter Cordes' answer, but branchless.

  • This answer. Very fast if the data from the 2 tiny lookup tables is in the L1 cache. The L1 cache footprint is 128 bytes. Branchless. It may suffer from cache misses when it is not called frequently.

In this answer, the epi64 elements of the input vector are compared with zero, which produces a mask. This mask is converted to a 4-bit index i_mask (by _mm256_movemask_pd ). With index i_mask, two values are read from two tiny lookup tables: 1. the index of the first (from the left) non-zero 64-bit element, and 2. the number of leading zero bits contributed by the preceding all-zero elements. Finally, the _lzcnt_u64 of the first non-zero 64-bit element is computed and added to the lookup table value. The function mm256_lzcnt_si256 implements this method:

    #include <stdio.h>
    #include <stdint.h>
    #include <x86intrin.h>
    #include <stdalign.h>
    /* gcc -Wall -m64 -O3 -march=haswell clz_avx256_upd.c */


    int mm256_lzcnt_si256(__m256i input)
    {
        /* Version with lookup tables and scratch array included in the function                                  */

        /* Two tiny lookup tables (64 bytes each, less space is possible with uint8_t or uint16_t arrays instead of uint32_t): */
        /* i_mask  (input==0)  0000 0001 0010 0011 0100 0101 0110 0111 1000 1001 1010 1011 1100 1101 1110 1111     */
        /* ~i_mask (input!=0)  1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000     */
        static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
        static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};

        alignas(32) uint64_t tmp[4] = { 0, 0, 0, 0 };   /* tmp is a scratch array of 32 bytes, preferably 32 byte aligned */

        _mm256_storeu_si256((__m256i*)&tmp[0], input);                           /* Store input in the scratch array                                 */
        __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64 bits elements are zero                            */
        uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
        uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64 bit element input_i   */
        int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
        int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64 bit elements */
        return   lz;
    }


    int mm256_lzcnt_si256_v2(__m256i input, uint64_t* restrict tmp, const uint32_t* indx, const uint32_t* lz_msk)
    {
        /* Version that compiles to nice assembly, although, after inlining there won't be any difference between the different versions. */

        _mm256_storeu_si256((__m256i*)&tmp[0], input);                           /* Store input in the scratch array                                 */
        __m256i  mask       = _mm256_cmpeq_epi64(input, _mm256_setzero_si256()); /* Check which 64 bits elements are zero                            */
        uint32_t i_mask     = _mm256_movemask_pd(_mm256_castsi256_pd(mask));     /* Move vector mask to integer mask                                 */
        uint64_t input_i    = tmp[indx[i_mask]];                                 /* Load the first (from the left) non-zero 64 bit element input_i   */
        int32_t  lz_input_i = _lzcnt_u64(input_i);                               /* Count the number of leading zeros in input_i                     */
        int32_t  lz         = lz_msk[i_mask] + lz_input_i;                       /* Add the number of leading zeros of the preceding 64 bit elements */
        return   lz;
    }


    __m256i bit_mask_avx2_lsb(unsigned int n)
    {
        __m256i ones       = _mm256_set1_epi32(-1);
        __m256i cnst32_256 = _mm256_set_epi32(256,224,192,160, 128,96,64,32);
        __m256i shift      = _mm256_set1_epi32(n);
                shift      = _mm256_subs_epu16(cnst32_256,shift);
                      return _mm256_srlv_epi32(ones,shift);
    }


    int print_avx2_hex(__m256i ymm)
    {
        long unsigned int x[4];
        _mm256_storeu_si256((__m256i*)x,ymm);
        printf("%016lX %016lX %016lX %016lX  ", x[3],x[2],x[1],x[0]);
        return 0;
    }


    int main()
    {
        unsigned int i;
        __m256i x;

        printf("mm256_lzcnt_si256\n");
        for (i = 0; i < 257; i++){
            printf("x=");
            x = bit_mask_avx2_lsb(i);
            print_avx2_hex(x);
            printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
        }
        printf("\n");

        x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
        x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
        x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));
        x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256(x));

        /* Set arrays for mm256_lzcnt_si256_v2: */
        alignas(32) static const uint32_t indx[16]   = {   3,   3,   3,   3,   3,   3,   3,   3,   2,   2,   2,   2,   1,   1,   0,   0};
        alignas(32) static const uint32_t lz_msk[16] = {   0,   0,   0,   0,   0,   0,   0,   0,  64,  64,  64,  64, 128, 128, 192, 192};
        alignas(32) uint64_t tmp[4] = { 0, 0, 0, 0 };

        printf("\nmm256_lzcnt_si256_v2\n");
        for (i = 0; i < 257; i++){
            printf("x=");
            x = bit_mask_avx2_lsb(i);
            print_avx2_hex(x);
            printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
        }
        printf("\n");

        x = _mm256_set_epi32(0,0,0,0, 0,15,1,0);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
        x = _mm256_set_epi32(0,0,0,8, 0,0,0,256);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
        x = _mm256_set_epi32(0,0x100,0,8, 0,192,0,0);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));
        x = _mm256_set_epi32(-1,0x100,0,8, 0,0,32,0);
        printf("x=");print_avx2_hex(x);printf("lzcnt(x)=%i \n", mm256_lzcnt_si256_v2(x, tmp, indx, lz_msk));

        return 0;
    }

The output suggests that the code is correct:

    $ ./a.out
    mm256_lzcnt_si256
    x=0000000000000000 0000000000000000 0000000000000000 0000000000000000  lzcnt(x)=256
    x=0000000000000000 0000000000000000 0000000000000000 0000000000000001  lzcnt(x)=255
    ...
    x=0000000000000000 0000000000000000 7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=129
    x=0000000000000000 0000000000000000 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=128
    x=0000000000000000 0000000000000001 FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=127
    ...
    x=7FFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=1
    x=FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF FFFFFFFFFFFFFFFF  lzcnt(x)=0

    x=0000000000000000 0000000000000000 000000000000000F 0000000100000000  lzcnt(x)=188
    x=0000000000000000 0000000000000008 0000000000000000 0000000000000100  lzcnt(x)=124
    x=0000000000000100 0000000000000008 00000000000000C0 0000000000000000  lzcnt(x)=55
    x=FFFFFFFF00000100 0000000000000008 0000000000000000 0000002000000000  lzcnt(x)=0

The function mm256_lzcnt_si256_v2 is an alternative version of the same function, but now the pointers to the lookup tables and the scratch array are passed with the function call. This leads to clean assembly code (no stack operations) and gives an impression of which instructions are needed after inlining mm256_lzcnt_si256 in a loop.

With gcc 8.2 and options -m64 -O3 -march=skylake:

    mm256_lzcnt_si256_v2:
            vpxor   xmm1, xmm1, xmm1
            vmovdqu YMMWORD PTR [rdi], ymm0
            vpcmpeqq        ymm0, ymm0, ymm1
            vmovmskpd       ecx, ymm0
            mov     eax, DWORD PTR [rsi+rcx*4]
            lzcnt   rax, QWORD PTR [rdi+rax*8]
            add     eax, DWORD PTR [rdx+rcx*4]
            vzeroupper
            ret

In a loop context, and with inlining, the vpxor is likely hoisted outside the loop.
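
For illustration, a sketch of how mm256_lzcnt_si256 might be used once inlined in a scan loop (this is not part of the benchmarked code; note that the __m256i is treated here as one 256-bit integer with element 3 as its most significant part, which is the opposite of the word ordering in the question's own benchmark, so the indexing there would need adjusting):

    // Sketch: position of the first set bit over an array of 32-byte-aligned
    // blocks, where bit 0 is the most significant bit of blocks[0].
    // Assumes <stdint.h>/<stddef.h> and mm256_lzcnt_si256 from the listing above.
    size_t first_set_bit_256(const __m256i* blocks, size_t num_blocks)
    {
        for (size_t i = 0; i < num_blocks; ++i) {
            int lz = mm256_lzcnt_si256(_mm256_load_si256(&blocks[i]));
            if (lz < 256)
                return i * 256 + (size_t)lz;
        }
        return num_blocks * 256;   // everything was zero
    }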


Since you are also asking for a more elegant (i.e., simpler) way to do this: on my computer, your code runs as fast as the one below. In both cases, it took 45 milliseconds to calculate the result for 10 million 256-bit words.

Since I filled the AVX registers with (four) randomly generated, uniformly distributed 64-bit integers (rather than uniformly distributed 256-bit integers), the order of iteration over the array had no impact on the results of my benchmark. Also, even though this almost goes without saying, the compiler was smart enough to unroll the loop.

    uint32_t countLeadZeros(__m256i const& reg)
    {
      alignas(32) uint64_t v[4];
      _mm256_store_si256((__m256i*)&v[0], reg);

      for (int i = 3; i >= 0; --i)
        if (v[i]) return _lzcnt_u64(v[i]) + (3 - i)*64;

      return 256;
    }

EDIT: as can be seen in the discussion below my answer and in my edit history, I initially took an approach similar to @PeterCorbes' (but he provided a better-optimized solution). I changed my approach once I started doing benchmarks, because I had completely overlooked the fact that practically all of my inputs had the most significant bit located within the top 64 bits of the AVX word.

After I realized the mistake I had made, I decided to do the benchmarks more properly. I present two results below. I searched through the edit history of my post and from there copied the function I had submitted (but later edited out) before I changed my approach and went for the branched version. That function is presented below. I compared the performance of my "branched" function, my "branchless" function, and the branchless function that was independently developed by @PeterCorbes. His version is superior to mine in terms of performance - see his excellently written post, which contains lots of useful details.

    int countLeadZeros(__m256i const& reg){

      __m256i zero = _mm256_setzero_si256();
      __m256i cmp = _mm256_cmpeq_epi64(reg, zero);

      int mask = _mm256_movemask_epi8(cmp);

      if (mask == 0xffffffff) return 256;

      int first_nonzero_idx = 3 - (_lzcnt_u32(~mask) >> 3);

      alignas(32) uint64_t stored[4]; // edit: added alignas(32)
      _mm256_store_si256((__m256i*)stored, reg);

      int lead_zero_count = _lzcnt_u64(stored[first_nonzero_idx]);

      return (3 - first_nonzero_idx) * 64 + lead_zero_count;
    }

Benchmark number 1

I will present the test code in pseudocode to keep this short. I actually used an AVX implementation of a random number generator, which generates the random numbers blazingly fast. First, let us do the test on inputs that make branch prediction really hard:

    tick()
    for(int i = 0; i < N; ++i)
    {
       // "xoroshiro128+"-based random generator was actually used
       __m256i in = _mm256_set_epi64x(rand()%2, rand()%2, rand()%2, rand()%2);

       res = countLeadZeros(in);
    }
    tock();

For 10 million repetitions, the function from the top of my post takes 200 ms. The implementation that I initially developed needs only 65 ms to do the same job. But the function provided by @PeterCorbes takes the cake by needing only 60 ms.

Benchmark number 2

Now let us turn to the test that I originally used. Again, pseudocode:

    tick()
    for(int i = 0; i < N; ++i)
    {
       // "rand()" represents a random 64-bit int; xoroshiro128+ was actually used here
       __m256i in = _mm256_set_epi64x(rand(), rand(), rand(), rand());

       res = countLeadZeros(in);
    }
    tock();

In this case, the version with branches is faster; 45 ms are needed to compute 10 million results. The function by @PeterCorbes takes 50 ms to complete, and my "branchless" implementation needs 55 ms to do the same job.

I do not dare to draw any general conclusions from this. It seems to me that the branchless approach is better because it offers a more stable computation time, but whether you need that stability or not probably depends on the use case.

EDIT: the random generator.

This is an extended reply to a comment by @PeterCorbes. As I stated above, the benchmark code is just pseudocode. If anyone is interested in how I actually generated the numbers, here is a quick description.

I used the xoroshiro128+ algorithm, which has been released into the public domain. It is quite simple to rewrite the algorithm with AVX instructions so that four numbers are generated in parallel, with each 64-bit lane holding the state of one of four independent generators. The range of the produced numbers can be controlled with _mm256_srli_epi64; I used a shift of 63 for the first test and no shift for the second one.
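
For illustration, a sketch of such a parallel step function (not my actual class; seeding and jump-ahead are omitted, and the rotation constants 55, 14, 36 are those of the original published xoroshiro128+ reference):

    #include <immintrin.h>

    // Four xoroshiro128+ streams stepped in parallel; s0/s1 hold the per-lane state.
    static inline __m256i rotl64_avx2(__m256i x, int k)
    {
        return _mm256_or_si256(_mm256_slli_epi64(x, k), _mm256_srli_epi64(x, 64 - k));
    }

    static inline __m256i xoroshiro128plus_next4(__m256i* s0, __m256i* s1)
    {
        __m256i a = *s0, b = *s1;
        __m256i result = _mm256_add_epi64(a, b);            // output: s0 + s1 per lane

        b   = _mm256_xor_si256(b, a);
        *s0 = _mm256_xor_si256(_mm256_xor_si256(rotl64_avx2(a, 55), b),
                               _mm256_slli_epi64(b, 14));   // rotl(s0,55) ^ s1 ^ (s1 << 14)
        *s1 = rotl64_avx2(b, 36);                           // rotl(s1,36)
        return result;
        // e.g. _mm256_srli_epi64(result, 63) restricts each lane to {0,1} for benchmark 1
    }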


, "", (Apple LLVM 9.0.0 (clang-900.0.39.2)):

    #define NOT_ZERO(x) (!!(x))

    #ifdef UNIFORM_DISTRIBUTION
    #define LIKELY(x)           __builtin_expect(NOT_ZERO(x), 1)
    #define UNLIKELY(x)         __builtin_expect(NOT_ZERO(x), 0)
    #else
    #define LIKELY(x)           (x)
    #define UNLIKELY(x)         (x)
    #endif


    inline unsigned int clz_u128(uint64_t a, uint64_t b, int not_a, int not_b)
    {
        if(UNLIKELY(not_a)) {
            if(UNLIKELY(not_b)) {
                return 128;
            } else {
                return (__builtin_clzll(b)) + 64;
            }
        } else {
            return (__builtin_clzll(a));
        }
    }

    unsigned int clz_u256(__m256i packed)
    {
        const uint64_t a_0 = (uint64_t)_mm256_extract_epi64(packed, 0);
        const uint64_t a_1 = (uint64_t)_mm256_extract_epi64(packed, 1);
        const uint64_t b_0 = (uint64_t)_mm256_extract_epi64(packed, 2);
        const uint64_t b_1 = (uint64_t)_mm256_extract_epi64(packed, 3);

        const int not_a_0 = !a_0;
        const int not_a_1 = !a_1;

        if(UNLIKELY(not_a_0 & not_a_1)) {
            return clz_u128(b_0, b_1, !b_0, !b_1) + 128;
        } else {
            return clz_u128(a_0, a_1, not_a_0, not_a_1);
        }
    }

It breaks the bigger problem down into smaller ones and exploits the fact that it is vastly more likely for the higher bits to be non-zero than the lower bits if the vector distribution is uniform.

Just add #define UNIFORM_DISTRIBUTION if a uniform distribution is expected, for extra performance.
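
A quick usage sketch (my own toy example with an illustrative value; note that, like the code in the question, this answer treats element 0 as the most significant 64-bit word of the __m256i):

    #include <stdio.h>
    #include <immintrin.h>
    // assumes clz_u128 / clz_u256 from above are in scope

    int main(void)
    {
        __m256i v = _mm256_set_epi64x(0, 0, 0x10, 0);   // element 1 = 0x10, everything else zero
        printf("%u\n", clz_u256(v));                    // prints 123: 64 zero bits from element 0 + clzll(0x10) = 59
        return 0;
    }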
