BMI (Intel Haswell , AMD Excavator ), ,
AVX2. , BMI2 AVX2.
, Intel Skylake, AVX2
(do/while) , a b .
32 (. )
. OR-ed .
a, (. : )
: (IPC) , IPC
AVX2. . ,
.
, . :
1. a contains 3.5 nonzero bits on average. 2. a contains 16.0 nonzero bits on average (50%).
Intel Skylake i5-6500.
Time in sec.
3.5 nonz 16 nonz
mult_shft_Orient() 0.81 1.51
mult_shft_Harold() 0.84 1.51
mult_shft_Harold_unroll2() 0.64 1.58
mult_shft_Harold_unroll4() 0.48 1.34
mult_shft_AVX2() 0.44 0.40
, a b.
AVX2 vpmuludq:
2 Skylake.
:
#include <stdint.h>
#include <stdio.h>
#include <x86intrin.h>
uint64_t mult_shft_AVX2(uint32_t a_32, uint32_t b_32) {
__m256i a = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(a_32));
__m256i b = _mm256_broadcastq_epi64(_mm_cvtsi32_si128(b_32));
__m256i b_0 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000000000008, 0x0000000000000004, 0x0000000000000002, 0x0000000000000001));
__m256i b_1 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000000000080, 0x0000000000000040, 0x0000000000000020, 0x0000000000000010));
__m256i b_2 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000000000800, 0x0000000000000400, 0x0000000000000200, 0x0000000000000100));
__m256i b_3 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000000008000, 0x0000000000004000, 0x0000000000002000, 0x0000000000001000));
__m256i b_4 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000000080000, 0x0000000000040000, 0x0000000000020000, 0x0000000000010000));
__m256i b_5 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000000800000, 0x0000000000400000, 0x0000000000200000, 0x0000000000100000));
__m256i b_6 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000008000000, 0x0000000004000000, 0x0000000002000000, 0x0000000001000000));
__m256i b_7 = _mm256_and_si256(b,_mm256_set_epi64x(0x0000000080000000, 0x0000000040000000, 0x0000000020000000, 0x0000000010000000));
__m256i m_0 = _mm256_mul_epu32(a, b_0);
__m256i m_1 = _mm256_mul_epu32(a, b_1);
__m256i m_2 = _mm256_mul_epu32(a, b_2);
__m256i m_3 = _mm256_mul_epu32(a, b_3);
__m256i m_4 = _mm256_mul_epu32(a, b_4);
__m256i m_5 = _mm256_mul_epu32(a, b_5);
__m256i m_6 = _mm256_mul_epu32(a, b_6);
__m256i m_7 = _mm256_mul_epu32(a, b_7);
m_0 = _mm256_or_si256(m_0, m_1);
m_2 = _mm256_or_si256(m_2, m_3);
m_4 = _mm256_or_si256(m_4, m_5);
m_6 = _mm256_or_si256(m_6, m_7);
m_0 = _mm256_or_si256(m_0, m_2);
m_4 = _mm256_or_si256(m_4, m_6);
m_0 = _mm256_or_si256(m_0, m_4);
__m128i m_0_lo = _mm256_castsi256_si128(m_0);
__m128i m_0_hi = _mm256_extracti128_si256(m_0, 1);
__m128i e = _mm_or_si128(m_0_lo, m_0_hi);
__m128i e_hi = _mm_unpackhi_epi64(e, e);
e = _mm_or_si128(e, e_hi);
uint64_t c = _mm_cvtsi128_si64x(e);
return c;
}
uint64_t mult_shft_Orient(uint32_t a, uint32_t b) {
uint64_t c = 0;
do {
c |= ((uint64_t)b) << (_bit_scan_forward(a) );
} while ((a = a & (a - 1)) != 0);
return c;
}
uint64_t mult_shft_Harold(uint32_t a_32, uint32_t b_32) {
uint64_t c = 0;
uint64_t a = a_32;
uint64_t b = b_32;
while (a) {
c |= b * (a & -a);
a &= a - 1;
}
return c;
}
uint64_t mult_shft_Harold_unroll2(uint32_t a_32, uint32_t b_32) {
uint64_t c = 0;
uint64_t a = a_32;
uint64_t b = b_32;
while (a) {
c |= b * (a & -a);
a &= a - 1;
c |= b * (a & -a);
a &= a - 1;
}
return c;
}
uint64_t mult_shft_Harold_unroll4(uint32_t a_32, uint32_t b_32) {
uint64_t c = 0;
uint64_t a = a_32;
uint64_t b = b_32;
while (a) {
c |= b * (a & -a);
a &= a - 1;
c |= b * (a & -a);
a &= a - 1;
c |= b * (a & -a);
a &= a - 1;
c |= b * (a & -a);
a &= a - 1;
}
return c;
}
int main(){
uint32_t a,b;
uint32_t rnd = 0xA0036011;
uint32_t rnd_old;
uint64_t c;
uint64_t sum = 0;
double popcntsum =0.0;
int i;
for (i=0;i<100000000;i++){
rnd_old = rnd;
rnd = _mm_crc32_u32(rnd, i);
b = rnd;
a = rnd;
#if 1 == 1
a = rnd & rnd_old;
a = (a & 0b00110000101001000011100010010000) | 1;
#endif
c = mult_shft_AVX2(a, b );
sum = sum + c;
}
printf("sum = %016lX \n\n", sum);
printf("average density = %f bits per uint32_t\n\n", popcntsum/100000000);
return 0;
}
mult_shft_AVX2() ! .
, 1 0.
2, .