Clear top bytes of __m128i

Question

Clear top bytes of __m128i

How to clear upper bytes 16 - i from __m128i ?

I have tried this; it works, but I wonder if there is a better (shorter, faster) way:

 int i = ... // 0 < i < 16 __m128i x = ... __m128i mask = _mm_set_epi8( 0, (i > 14) ? -1 : 0, (i > 13) ? -1 : 0, (i > 12) ? -1 : 0, (i > 11) ? -1 : 0, (i > 10) ? -1 : 0, (i > 9) ? -1 : 0, (i > 8) ? -1 : 0, (i > 7) ? -1 : 0, (i > 6) ? -1 : 0, (i > 5) ? -1 : 0, (i > 4) ? -1 : 0, (i > 3) ? -1 : 0, (i > 2) ? -1 : 0, (i > 1) ? -1 : 0, -1); x = _mm_and_si128(x, mask);

+4

c ++ c sse simd visual-studio-2012

dtb Sep 7 '13 at 14:08

source share

2 answers

If these were regular 64-bit values, I would use something like -

  mask = (1 << (i * 8)) - 1;

But be careful when generalizing this value to 128, internal shift operators do not necessarily work in these ranges.

For 128b, you can either simply create upper and lower masks, for example,

  __m128i mask = _mm_set_epi64x( i > 7 ? 0xffffffff : (1 << ((i) * 8)) - 1 i > 7 ? (1 << ((i-8) * 8)) - 1 : 0 );

(assuming I have not changed the order, check me out on this, I'm not very familiar with these internals) Alternatively, you can do this on a 2-level uint64 array and load the 128b mask directly from memory using its address.

However, both of these methods do not seem natural, like the original ones, they simply expand the elements from 1 to 8 bytes, but are still partial. It would be preferable to make the right shift with a single 128b variable.

I just stumbled upon this topic regarding the 128b shift -

Looks for a 128 bit sse shift operation for an immediate shift value

It looks as possible, but I never used it. You can try the above single line with the corresponding SSE intrinsitc from there. I would drop that shot -

  mask = _mm_slli_si128(1, i); //emmintrin.h shows the second argument is in bytes already

And then just subtract one using your preferred method (I would be surprised if this type supports the plain old statement)

+2

Leeor Sep 7 '13 at 16:59

source share

Paul r · Accepted Answer · 2013-09-08T17:05:36+0000

I tried several different ways to implement this and compared them to several different compilers on the early Core i7 @ 2.67 GHz and the recent Haswell @ 3.6 GHz:

 // // mask_shift_0 // // use PSHUFB (note: SSSE3 required) // inline __m128i mask_shift_0(uint32_t n) { const __m128i vmask = _mm_set1_epi8(255); const __m128i vperm = _mm_set_epi8(112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127); __m128i vp = _mm_add_epi8(vperm, _mm_set1_epi8(n)); return _mm_shuffle_epi8(vmask, vp); } // // mask_shift_1 // // use 16 element LUT // inline __m128i mask_shift_1(uint32_t n) { static const int8_t mask_lut[16][16] __attribute__ ((aligned(16))) = { { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1 }, { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1 } }; return _mm_load_si128((__m128i *)&mask_lut[n]); } // // mask_shift_2 // // use misaligned load from 2 vector LUT // inline __m128i mask_shift_2(uint32_t n) { static const int8_t mask_lut[32] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; return _mm_loadu_si128((__m128i *)(mask_lut + 16 - n)); } // // mask_shift_3 // // use compare and single vector LUT // inline __m128i mask_shift_3(uint32_t n) { const __m128i vm = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); __m128i vn = _mm_set1_epi8(n); return _mm_cmpgt_epi8(vm, vn); } // // mask_shift_4 // // use jump table and immediate shifts // inline __m128i mask_shift_4(uint32_t n) { const __m128i vmask = _mm_set1_epi8(-1); switch (n) { case 0: return vmask; case 1: return _mm_slli_si128(vmask, 1); case 2: return _mm_slli_si128(vmask, 2); case 3: return _mm_slli_si128(vmask, 3); case 4: return _mm_slli_si128(vmask, 4); case 5: return _mm_slli_si128(vmask, 5); case 6: return _mm_slli_si128(vmask, 6); case 7: return _mm_slli_si128(vmask, 7); case 8: return _mm_slli_si128(vmask, 8); case 9: return _mm_slli_si128(vmask, 9); case 10: return _mm_slli_si128(vmask, 10); case 11: return _mm_slli_si128(vmask, 11); case 12: return _mm_slli_si128(vmask, 12); case 13: return _mm_slli_si128(vmask, 13); case 14: return _mm_slli_si128(vmask, 14); case 15: return _mm_slli_si128(vmask, 15); } } // // lsb_mask_0 // // Contributed by by @Leeor/@dtb // // uses _mm_set_epi64x // inline __m128i lsb_mask_0(int n) { if (n >= 8) return _mm_set_epi64x(~(-1LL << (n - 8) * 8), -1); else return _mm_set_epi64x(0, ~(-1LL << (n - 0) * 8)); } // // lsb_mask_1 // // Contributed by by @Leeor/@dtb // // same as lsb_mask_0 but uses conditional operator instead of if/else // inline __m128i lsb_mask_1(int n) { return _mm_set_epi64x(n >= 8 ? ~(-1LL << (n - 8) * 8) : 0, n >= 8 ? -1 : ~(-1LL << (n - 0) * 8)); }

The results were interesting:

Core i7 @ 2.67 GHz, Apple LLVM gcc 4.2.1 (gcc-O3)

 mask_shift_0: 2.23377 ns mask_shift_1: 2.14724 ns mask_shift_2: 2.14270 ns mask_shift_3: 2.15063 ns mask_shift_4: 2.98304 ns lsb_mask_0: 2.15782 ns lsb_mask_1: 2.96628 ns

Core i7 @ 2.67 GHz, Apple clang 4.2 (clang -Os)

 mask_shift_0: 1.35014 ns mask_shift_1: 1.12789 ns mask_shift_2: 1.04329 ns mask_shift_3: 1.09258 ns mask_shift_4: 2.01478 ns lsb_mask_0: 1.70573 ns lsb_mask_1: 1.84337 ns

Haswell E3-1285 @ 3.6 GHz, gcc 4.7.2 (gcc-O2)

 mask_shift_0: 0.851416 ns mask_shift_1: 0.575245 ns mask_shift_2: 0.577746 ns mask_shift_3: 0.850086 ns mask_shift_4: 1.398270 ns lsb_mask_0: 1.359660 ns lsb_mask_1: 1.709720 ns

So mask_shift_4 (switch / case) seems to be the slowest method in all cases, while others are pretty similar. LUT-based methods seem to be consistently the fastest.

NB: I get some suspiciously fast numbers with clang -O3 and gcc -O3 (gcc 4.7.2 only). I need to look at the generated assembly for these cases, to see what the compiler does, and make sure that it does not do anything “smart”, for example, it optimizes some part of the test harness.

If anyone has any further ideas on this, or if there is another implementation of mask_shift that they would like to try, I would be happy to add it to the test suite and update the results.

Clear top bytes of __m128i

More articles: