How to implement _mm256_maskstore_epi8() in C/C++?

Problem

Suppose I have a vector of 27 (not 32!) int8_t:

x = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26}

I want to first rotate it right by n (n is not a constant); for example, if n = 1:

x2 = {26,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25}

This vector is then used to perform a very complex calculation, but for simplicity, suppose that the next step is simply to rotate it left by n and store it in memory. I should then end up with a new vector of 27 int8_t:

y = {0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26}

There are thousands of such vectors, so performance is very important here. The processor we use supports AVX2, so we want to use it to speed things up.

My current solution

To get x2 , I use two _mm256_loadu_si256() with _mm256_blendv_epi8() :

 int8_t x[31+27+31]; for(int i=0; i<27; i++){ x[31+i] = i; } __m256i mask = _mm256_set_epi32 (0x0, 0x00800000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); __m256i x_second_part = _mm256_loadu_si256((__m256i*)(x+31+1)); //{1,2,...,26} __m256i x_first_part = _mm256_loadu_si256((__m256i*)(x+31-26)); //{0} __m256i x2 = _mm256_blendv_epi8(x_second_part, x_first_part, mask); //{1,2,...,26, 0} int8_t y[31+27+31]; _mm256_storeu_si256((__m256i*)(y+31-26), x2); _mm256_storeu_si256((__m256i*)(y+31+1), x2); 

The reason x and y are declared with size [31+27+31] is that, with this padding, _mm256_loadu_si256() and _mm256_storeu_si256() cannot cause a segfault.

And I can get the y value:

 for(int i=0; i<27; i++){ cout << (int)y[31+i] << ' '; } 

New problem

Unfortunately, all vectors must be contiguous in memory; for example, if there are only two vectors to process:

 x = {[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26]; [27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53]}; 

Then I can’t just use _mm256_storeu_si256() to write the y values back to memory, because storing the second vector overwrites some values of the first vector:

 int8_t x[31+27+27+31]; int8_t y[31+27+27+31]; for(int i=0; i<27*2; i++){ x[31+i] = i; } for(int i=0; i<2; i++){ __m256i mask = _mm256_set_epi32 (0x0, 0x00800000, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); __m256i x_second_part = _mm256_loadu_si256((__m256i*)(x+31+27*i+1)); //{1,2,...,26} __m256i x_first_part = _mm256_loadu_si256((__m256i*)(x+31+27*i-26)); //{0} __m256i x2 = _mm256_blendv_epi8(x_second_part, x_first_part, mask); //{1,2,...,26, 0} _mm256_storeu_si256((__m256i*)(y+31+27*i-26), x2); _mm256_storeu_si256((__m256i*)(y+31+27*i+1), x2); } for(int i=0; i<27; i++){ cout << (int)y[31+i] << ' '; }cout << endl; for(int i=0; i<27; i++){ cout << (int)y[31+27+i] << ' '; }cout << endl; 

This prints

 0 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 

instead of

 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 

So, I was thinking about using a mask. But in the Intel Intrinsic Guide, I could not find _mm256_maskstore_epi8 . This brings me back to the topic:

How to implement _mm256_maskstore_epi8() in C/C++?

+6
source share
2 answers

There is another implementation of the cyclic shift within a 27-byte vector using AVX2:

 #include <cstddef>
 #include <cstdint>
 #include <iostream>
 #include <immintrin.h>

 // _mm256_shuffle_epi8 selects bytes only within each 128-bit lane and zeroes
 // an output byte when bit 7 of its control byte is set. K0 and K1 are biases
 // added (with 8-bit wrap-around) to a 0..31 source index to turn it into
 // such a control byte:
 //   +0x70 keeps indices 0..15 valid (0x70..0x7F) and pushes 16..31 to 0x80+ (zeroed);
 //   +0xF0 pushes indices 0..15 to 0xF0+ (zeroed) and wraps 16..31 to 0x00..0x0F.
 // K0 keeps the indices that refer to the lane the output byte lives in.
 const __m256i K0 = _mm256_setr_epi8(
     0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70,
     0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0);

 // K1 keeps the indices that refer to the opposite lane; it is applied to the
 // lane-swapped copy of the input.
 const __m256i K1 = _mm256_setr_epi8(
     0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0, 0xF0,
     0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70, 0x70);

 // Full 32-byte permutation: output byte i is value[shuffle[i]], where every
 // byte of `shuffle` is an index in 0..31.
 // Same-lane picks come from `value` directly, cross-lane picks from the copy
 // with its 128-bit halves swapped; each output byte is non-zero in at most
 // one of the two partial results, so OR merges them.
 inline const __m256i Shuffle(const __m256i & value, const __m256i & shuffle)
 {
     const __m256i sameLane = _mm256_shuffle_epi8(value, _mm256_add_epi8(shuffle, K0));
     const __m256i swapped = _mm256_permute4x64_epi64(value, 0x4E);
     const __m256i crossLane = _mm256_shuffle_epi8(swapped, _mm256_add_epi8(shuffle, K1));
     return _mm256_or_si256(sameLane, crossLane);
 }

 // shuffles[s] is the index vector that rotates the first 27 bytes right by s.
 // Filled in by Init().
 __m256i shuffles[27];

 // Builds the 27 index vectors. Must be called once before CyclicShift27().
 void Init()
 {
     uint8_t * p = (uint8_t *)shuffles;
     for (int s = 0; s < 27; ++s)
     {
         for (int i = 0; i < 32; ++i)
         {
             // Bytes 0..26 take the element s positions earlier (mod 27);
             // bytes 27..31 map to themselves.
             p[s*32 + i] = i < 27 ? (27 + i - s)%27 : i;
         }
     }
 }

 // Rotates the 27-byte vector at src right by `shift` positions and writes
 // the result to dst.
 // Loads 32 bytes from src and stores 32 bytes to dst: bytes 27..31 of dst
 // receive copies of bytes 27..31 of src.
 // `shift` is reduced modulo 27, so any value is accepted.
 void CyclicShift27(const uint8_t * src, size_t shift, uint8_t * dst)
 {
     // A rotation by 27 is the identity; reducing the shift also keeps the
     // index inside the 27-entry table.
     const __m256i control = shuffles[shift % 27];
     const __m256i source = _mm256_loadu_si256((const __m256i*)src);
     _mm256_storeu_si256((__m256i*)dst, Shuffle(source, control));
 }

 int main()
 {
     Init();
     uint8_t src[32] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 }, dst[32];
     for (int j = 0; j < 27; ++j)
     {
         CyclicShift27(src, j, dst);
         std::cout << "\t";
         for (int i = 0; i < 32; i++)
             std::cout << (int)dst[i] << ' ';
         std::cout << std::endl;
     }
     return 0;
 }

Output:

  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 27 28 29 30 31 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 27 28 29 30 31 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 27 28 29 30 31 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 27 28 29 30 31 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 27 28 29 30 31 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 27 28 29 30 31 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 27 28 29 30 31 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 27 28 29 30 31 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 27 28 29 30 31 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 27 28 29 30 31 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 27 28 29 30 31 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 27 28 29 30 31 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 27 28 29 30 31 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 27 28 29 30 31 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 27 28 29 30 31 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 27 28 29 30 31 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 27 28 29 30 31 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 27 28 29 30 31 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 27 28 29 30 31 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 27 28 29 30 31 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 27 28 29 30 31 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 27 28 29 30 31 4 5 6 7 8 9 10 11 
12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 27 28 29 30 31 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 27 28 29 30 31 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 27 28 29 30 31 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 27 28 29 30 31 

This seems simpler than my previous answer.

+3
source

I made a circular shift implementation inside a 27-byte vector using SSSE3:

 #include <cstddef>
 #include <cstdint>
 #include <iostream>
 #include <tmmintrin.h>

 // Four 16-byte _mm_shuffle_epi8 control masks for one rotation amount,
 // addressable as bytes (s) while being built and as vectors (v) when used.
 // NOTE(review): v is read after s was written, which relies on the compiler
 // supporting type punning through a union.
 union Shuffle
 {
     uint8_t s[64];
     __m128i v[4];
 };

 // shuffles[s] holds the control masks that rotate right by s. Filled in by Init().
 Shuffle shuffles[27];

 // Returns `value` when it is a valid in-register byte index (0..15),
 // otherwise -1. Stored into a uint8_t, -1 becomes 0xFF, whose set top bit
 // makes _mm_shuffle_epi8 write zero to that output byte.
 int Shift(int value)
 {
     return (value >= 0 && value < 16) ? value : -1;
 }

 // Builds the control masks for all 27 rotation amounts. Must be called once
 // before CyclicShift27().
 // The source is split into srcLo = bytes 0..15 and srcHi = bytes 11..26, and
 // the destination likewise into dstLo and dstHi. For each rotation s:
 //   s[ 0..15] (v[0]): srcLo -> dstLo
 //   s[16..31] (v[1]): srcHi -> dstLo
 //   s[32..47] (v[2]): srcLo -> dstHi
 //   s[48..63] (v[3]): srcHi -> dstHi
 void Init()
 {
     for (int s = 0; s < 27; ++s)
     {
         for (int i = 0; i < 16; ++i)
         {
             shuffles[s].s[0 + i] = s < 16 ? Shift(i - s) : Shift(i - s + 27);
             shuffles[s].s[16 + i] = Shift(16 + i - s);
             shuffles[s].s[32 + i] = Shift(11 + i - s);
             shuffles[s].s[48 + i] = s < 11 ? Shift(i - s) : Shift(i - s + 27);
         }
     }
 }

 // Rotates the 27-byte vector at src right by `shift` positions and writes
 // the result to dst.
 // Uses two overlapping 16-byte accesses (offsets 0 and 11) on each side, so
 // only bytes 0..26 of src are read and only bytes 0..26 of dst are written.
 // `shift` is reduced modulo 27, so any value is accepted.
 void CyclicShift27(const uint8_t * src, size_t shift, uint8_t * dst)
 {
     // A rotation by 27 is the identity; reducing the shift also keeps the
     // index inside the 27-entry table.
     const Shuffle & table = shuffles[shift % 27];

     const __m128i srcLo = _mm_loadu_si128((const __m128i*)(src + 0));
     const __m128i srcHi = _mm_loadu_si128((const __m128i*)(src + 11));

     // Each output half is assembled from both input halves.
     const __m128i dstLo = _mm_or_si128(
         _mm_shuffle_epi8(srcLo, table.v[0]),
         _mm_shuffle_epi8(srcHi, table.v[1]));
     const __m128i dstHi = _mm_or_si128(
         _mm_shuffle_epi8(srcLo, table.v[2]),
         _mm_shuffle_epi8(srcHi, table.v[3]));

     _mm_storeu_si128((__m128i*)(dst + 0), dstLo);
     _mm_storeu_si128((__m128i*)(dst + 11), dstHi);
 }

 int main()
 {
     Init();
     uint8_t src[27] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26 }, dst[27];
     for (int j = 0; j < 27; ++j)
     {
         CyclicShift27(src, j, dst);
         for (int i = 0; i < 27; i++)
             std::cout << (int)dst[i] << ' ';
         std::cout << std::endl;
     }
     return 0;
 }

Output:

  0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 13 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 12 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 11 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 10 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 9 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 8 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 7 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 6 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 5 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 4 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 3 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 2 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 0 

Hope this will be helpful.

0
source

Source: https://habr.com/ru/post/1011764/


All Articles