How to implement the intersection of the logical bit shift / rotation intersection band (left and right) in AVX2

In accordance with this answer, I created the following test program:

#include <iso646.h>
#include <immintrin.h>

#include <stdio.h>

#define SHIFT_LEFT( N ) \ 
\
    inline __m256i shift_left_##N ( __m256i A  ) { \
\
    if ( N == 0 ) return A; \
    else if ( N <  16 ) return _mm256_alignr_epi8 ( A, _mm256_permute2x128_si256 ( A, A, _MM_SHUFFLE ( 0, 0, 2, 0 ) ), ( uint8_t ) ( 16 - N ) ); \
    else if ( N == 16 ) return _mm256_permute2x128_si256 ( A, A, _MM_SHUFFLE ( 0, 0, 2, 0 ) ); \
    else return _mm256_slli_si256 ( _mm256_permute2x128_si256 ( A, A, _MM_SHUFFLE ( 0, 0, 2, 0 ) ), ( uint8_t ) ( N - 16 ) ); \
}

void print ( const size_t n ) {

    size_t i = 0x8000000000000000;

    while ( i ) {

        putchar ( ( int ) ( n & i ) + ( int ) ( 48 ) );
        i >>= 1;
        putchar ( ( int ) ( n & i ) + ( int ) ( 48 ) );
        i >>= 1;

        putchar ( ' ' );
    }
}

SHIFT_LEFT ( 2 );

int main ( ) {

    __m256i a = _mm256_set_epi64x ( 0x00, 0x00, 0x00, 0x03 );
    __m256i b = shift_left_2 ( a );

    size_t * c = ( size_t * ) &b;

    print ( c [ 3 ] ); print ( c [ 2 ] ); print ( c [ 1 ] ); print ( c [ 0 ] ); putchar ( '\n' );

    return 0;
}

The above program does not give the expected (for me) output, as far as I can see. I do not understand how these functions work together (read the descriptions). Am I doing something wrong, or is shift_left () implementation incorrect?

EDIT1: I realized (and confirmed in the comments) that this code only intends to shift to a maximum of 32 (and will be bytes), so it does not satisfy my purpose. Because of what the question arises: "How to implement a cross logical bit-shift (left and right) in AVX2."

EDIT2: : , , . ( ) .

+4
2

/ ( ) AVX2:

// Prototypes...

__m256i _mm256_sli_si256 ( __m256i, int );
__m256i _mm256_sri_si256 ( __m256i, int );
__m256i _mm256_rli_si256 ( __m256i, int );
__m256i _mm256_rri_si256 ( __m256i, int );


// Implementations...

__m256i left_shift_000_063 ( __m256i a, int n ) { // 6

    return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 0 ) ), _MM_SHUFFLE ( 3, 3, 3, 0 ) ) );
}

__m256i left_shift_064_127 ( __m256i a, int n ) { // 7

    __m256i b = _mm256_slli_epi64 ( a, n );
    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 2, 1, 0, 0 ) );

    __m256i c = _mm256_srli_epi64 ( a, 64 - n );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 0, 0 ) );

    __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 3, 3, 3, 0 ) );
    __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 3, 3, 0, 0 ) ); // 6

    return _mm256_or_si256 ( f, g );
}

__m256i left_shift_128_191 ( __m256i a, int n ) { // 7

    __m256i b = _mm256_slli_epi64 ( a, n );
    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 0, 0 ) );

    __m256i c = _mm256_srli_epi64 ( a, 64 - n );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 0, 0 ) );

    __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 3, 3, 0, 0 ) );
    __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 3, 0, 0, 0 ) );

    return _mm256_or_si256 ( f, g );
}

__m256i left_shift_192_255 ( __m256i a, int n ) { // 5

    return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_slli_epi64 ( _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 0 ) ), n ), _MM_SHUFFLE ( 3, 0, 0, 0 ) );
}

__m256i _mm256_sli_si256 ( __m256i a, int n ) {

    if ( n < 128 ) return n <  64 ? left_shift_000_063 ( a, n ) : left_shift_064_127 ( a, n % 64 );
    else           return n < 192 ? left_shift_128_191 ( a, n % 64 ) : left_shift_192_255 ( a, n % 64 );
}


__m256i right_shift_000_063 ( __m256i a, int n ) { // 6

    return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ), _MM_SHUFFLE ( 0, 3, 3, 3 ) ) );
}

__m256i right_shift_064_127 ( __m256i a, int n ) { // 7

    __m256i b = _mm256_srli_epi64 ( a, n );
    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 3, 3, 2, 1 ) );

    __m256i c = _mm256_slli_epi64 ( a, 64 - n );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 3, 3, 3, 2 ) );

    __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 0, 3, 3, 3 ) );
    __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 0, 0, 3, 3 ) );

    return _mm256_or_si256 ( f, g );
}

__m256i right_shift_128_191 ( __m256i a, int n ) { // 7

    __m256i b = _mm256_srli_epi64 ( a, n );
    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 3, 2, 3, 2 ) );

    __m256i c = _mm256_slli_epi64 ( a, 64 - n );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 3, 2, 1, 3 ) );

    __m256i f = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), d, _MM_SHUFFLE ( 0, 0, 3, 3 ) );
    __m256i g = _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), e, _MM_SHUFFLE ( 0, 0, 0, 3 ) );

    return _mm256_or_si256 ( f, g );
}

__m256i right_shift_192_255 ( __m256i a, int n ) { // 5

    return _mm256_blend_epi32 ( _mm256_setzero_si256 ( ), _mm256_srli_epi64 ( _mm256_permute4x64_epi64 ( a, _MM_SHUFFLE ( 0, 0, 0, 3 ) ), n ), _MM_SHUFFLE ( 0, 0, 0, 3 ) );
}

__m256i _mm256_sri_si256 ( __m256i a, int n ) {

    if ( n < 128 ) return n <  64 ? right_shift_000_063 ( a, n ) : right_shift_064_127 ( a, n % 64 );
    else           return n < 192 ? right_shift_128_191 ( a, n % 64 ) : right_shift_192_255 ( a, n % 64 );
}


__m256i left_rotate_000_063 ( __m256i a, int n ) { // 5

    return _mm256_or_si256 ( _mm256_slli_epi64 ( a, n ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) ) );
}

__m256i left_rotate_064_127 ( __m256i a, int n ) { // 6

    __m256i b = _mm256_slli_epi64 ( a, n );
    __m256i c = _mm256_srli_epi64 ( a, 64 - n );

    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 2, 1, 0, 3 ) );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 3, 2 ) );

    return _mm256_or_si256 ( d, e );
}

__m256i left_rotate_128_191 ( __m256i a, int n ) { // 6

    __m256i b = _mm256_slli_epi64 ( a, n );
    __m256i c = _mm256_srli_epi64 ( a, 64 - n );

    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 0, 3, 2, 1 ) );

    return _mm256_or_si256 ( d, e );
}

__m256i left_rotate_192_255 ( __m256i a, int n ) { // 5

    return _mm256_or_si256 ( _mm256_srli_epi64 ( a, 64 - n ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ) );
}

__m256i _mm256_rli_si256 ( __m256i a, int n ) {

    if ( n < 128 ) return n <  64 ? left_rotate_000_063 ( a, n ) : left_rotate_064_127 ( a, n % 64 );
    else           return n < 192 ? left_rotate_128_191 ( a, n % 64 ) : left_rotate_192_255 ( a, n % 64 );
}


__m256i right_rotate_000_063 ( __m256i a, int n ) { // 5

    return _mm256_or_si256 ( _mm256_srli_epi64 ( a, n ), _mm256_permute4x64_epi64 ( _mm256_slli_epi64 ( a, 64 - n ), _MM_SHUFFLE ( 0, 3, 2, 1 ) ) );
}

__m256i right_rotate_064_127 ( __m256i a, int n ) { // 6

    __m256i b = _mm256_srli_epi64 ( a, n );
    __m256i c = _mm256_slli_epi64 ( a, 64 - n );

    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 0, 3, 2, 1 ) );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 1, 0, 3, 2 ) );

    return _mm256_or_si256 ( d, e );
}

__m256i right_rotate_128_191 ( __m256i a, int n ) { // 6

    __m256i b = _mm256_srli_epi64 ( a, n );
    __m256i c = _mm256_slli_epi64 ( a, 64 - n );

    __m256i d = _mm256_permute4x64_epi64 ( b, _MM_SHUFFLE ( 1, 0, 3, 2 ) );
    __m256i e = _mm256_permute4x64_epi64 ( c, _MM_SHUFFLE ( 2, 1, 0, 3 ) );

    return _mm256_or_si256 ( d, e );
}
__m256i right_rotate_192_255 ( __m256i a, int n ) { // 5

    return _mm256_or_si256 ( _mm256_slli_epi64 ( a, 64 - n ), _mm256_permute4x64_epi64 ( _mm256_srli_epi64 ( a, n ), _MM_SHUFFLE ( 2, 1, 0, 3 ) ) );
}

__m256i _mm256_rri_si256 ( __m256i a, int n ) {

    if ( n < 128 ) return n <  64 ? right_rotate_000_063 ( a, n      ) : right_rotate_064_127 ( a, n % 64 );
    else           return n < 192 ? right_rotate_128_191 ( a, n % 64 ) : right_rotate_192_255 ( a, n % 64 );
}

_mm256_permute4x64_epi64 ( ) , .

/ , , , . , / .

, , , / f.e. .

+1

, , . , .

:

  • : ~ 12 - 14
  • : 5
  • : 6

-, . /.

, , .

, , . . , .

#include <stdint.h>
#include <immintrin.h>

class LeftShifter_AVX2{
public:
    LeftShifter_AVX2(uint32_t bits){
        //  Precompute all the necessary values.
        permL = _mm256_sub_epi32(
            _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7),
            _mm256_set1_epi32(bits / 32)
        );
        permR = _mm256_sub_epi32(permL, _mm256_set1_epi32(1));

        bits %= 32;
        shiftL = _mm_cvtsi32_si128(bits);
        shiftR = _mm_cvtsi32_si128(32 - bits);
        __m256i maskL = _mm256_cmpgt_epi32(_mm256_setzero_si256(), permL);
        __m256i maskR = _mm256_cmpgt_epi32(_mm256_setzero_si256(), permR);
        mask = _mm256_or_si256(maskL, _mm256_srl_epi32(maskR, shiftR));
    }

    __m256i rotate(__m256i x) const{
        __m256i L = _mm256_permutevar8x32_epi32(x, permL);
        __m256i R = _mm256_permutevar8x32_epi32(x, permR);
        L = _mm256_sll_epi32(L, shiftL);
        R = _mm256_srl_epi32(R, shiftR);
        return _mm256_or_si256(L, R);
    }
    __m256i shift(__m256i x) const{
        return _mm256_andnot_si256(mask, rotate(x));
    }

private:
    __m256i permL;
    __m256i permR;
    __m128i shiftL;
    __m128i shiftR;
    __m256i mask;
};

:

#include <iostream>
using namespace std;

void print_u8(__m256i x){
    union{
        __m256i v;
        uint8_t s[32];
    };
    v = x;
    for (int c = 0; c < 32; c++){
        cout << (int)s[c] << " ";
    }
    cout << endl;
}

int main(){
    union{
        __m256i x;
        char buffer[32];
    };
    for (int c = 0; c < 32; c++){
        buffer[c] = (char)c;
    }
    print_u8(x);
    print_u8(LeftShifter_AVX2(0).shift(x));
    print_u8(LeftShifter_AVX2(8).shift(x));
    print_u8(LeftShifter_AVX2(32).shift(x));
    print_u8(LeftShifter_AVX2(40).shift(x));
}

:

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 
0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 
0 0 0 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 
0 0 0 0 0 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26

. .

+6

Source: https://habr.com/ru/post/1695068/


All Articles