I tried several different ways to implement this and compared them to several different compilers on the early Core i7 @ 2.67 GHz and the recent Haswell @ 3.6 GHz:
// // mask_shift_0 // // use PSHUFB (note: SSSE3 required) // inline __m128i mask_shift_0(uint32_t n) { const __m128i vmask = _mm_set1_epi8(255); const __m128i vperm = _mm_set_epi8(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); __m128i vp = _mm_add_epi8(vperm, _mm_set1_epi8(n)); return _mm_shuffle_epi8(vmask, vp); } // // mask_shift_1 // // use 16 element LUT // inline __m128i mask_shift_1(uint32_t n) { static const int8_t mask_lut[16][16] __attribute__ ((aligned(16))) = { { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1 } }; return _mm_load_si128((__m128i *)&mask_lut[n]); } // // mask_shift_2 // // use misaligned load from 2 vector LUT // inline __m128i mask_shift_2(uint32_t n) { static const int8_t mask_lut[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; return _mm_loadu_si128((__m128i *)(mask_lut + 16 - n)); } // // mask_shift_3 // // use compare and single vector LUT // inline __m128i mask_shift_3(uint32_t n) { const __m128i vm = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); __m128i vn = _mm_set1_epi8(n); return _mm_cmpgt_epi8(vm, vn); } // // mask_shift_4 // // use jump table and immediate shifts // inline __m128i mask_shift_4(uint32_t n) { const __m128i vmask = _mm_set1_epi8(-1); switch (n) { case 0: return vmask; case 1: return _mm_slli_si128(vmask, 1); case 2: return _mm_slli_si128(vmask, 2); case 3: return _mm_slli_si128(vmask, 3); case 4: return _mm_slli_si128(vmask, 4); case 5: return _mm_slli_si128(vmask, 5); case 6: return _mm_slli_si128(vmask, 6); case 7: return _mm_slli_si128(vmask, 7); case 8: return _mm_slli_si128(vmask, 8); case 9: return _mm_slli_si128(vmask, 9); case 10: return _mm_slli_si128(vmask, 10); case 11: return _mm_slli_si128(vmask, 11); case 12: return _mm_slli_si128(vmask, 12); case 13: return _mm_slli_si128(vmask, 13); case 14: return _mm_slli_si128(vmask, 14); case 15: return _mm_slli_si128(vmask, 15); } } // // lsb_mask_0 // // Contributed by by @Leeor/@dtb // // uses _mm_set_epi64x // inline __m128i lsb_mask_0(int n) { if (n >= 8) return _mm_set_epi64x(~(-1LL << (n - 8) * 8), -1); else return _mm_set_epi64x(0, ~(-1LL << (n - 0) * 8)); } // // lsb_mask_1 // // Contributed by by @Leeor/@dtb // // same as lsb_mask_0 but uses conditional operator instead of if/else // inline __m128i lsb_mask_1(int n) { return _mm_set_epi64x(n >= 8 ? ~(-1LL << (n - 8) * 8) : 0, n >= 8 ? -1 : ~(-1LL << (n - 0) * 8)); }
The results were interesting:
Core i7 @ 2.67 GHz, Apple LLVM gcc 4.2.1 (gcc-O3)
mask_shift_0: 2.23377 ns mask_shift_1: 2.14724 ns mask_shift_2: 2.14270 ns mask_shift_3: 2.15063 ns mask_shift_4: 2.98304 ns lsb_mask_0: 2.15782 ns lsb_mask_1: 2.96628 ns
Core i7 @ 2.67 GHz, Apple clang 4.2 (clang -Os)
mask_shift_0: 1.35014 ns mask_shift_1: 1.12789 ns mask_shift_2: 1.04329 ns mask_shift_3: 1.09258 ns mask_shift_4: 2.01478 ns lsb_mask_0: 1.70573 ns lsb_mask_1: 1.84337 ns
Haswell E3-1285 @ 3.6 GHz, gcc 4.7.2 (gcc-O2)
mask_shift_0: 0.851416 ns mask_shift_1: 0.575245 ns mask_shift_2: 0.577746 ns mask_shift_3: 0.850086 ns mask_shift_4: 1.398270 ns lsb_mask_0: 1.359660 ns lsb_mask_1: 1.709720 ns
So mask_shift_4
(switch / case) seems to be the slowest method in all cases, while others are pretty similar. LUT-based methods seem to be consistently the fastest.
NB: I get some suspiciously fast numbers with clang -O3
and gcc -O3
(gcc 4.7.2 only). I need to look at the generated assembly for these cases, to see what the compiler does, and make sure that it does not do anything โsmartโ, for example, it optimizes some part of the test harness.
If anyone has any further ideas on this, or if there is another implementation of mask_shift that they would like to try, I would be happy to add it to the test suite and update the results.