Setting an individual bit in an AVX register (__m256i): a "random access" operator is needed

So, I want to set an individual bit of a __m256i register.

Say my __m256i contains: [ 1 0 1 0 | 1 0 1 0 | ... | 1 0 1 0 ]. How do I set and unset the n-th bit?

+3
source share
4 answers

There is another implementation:

#include <immintrin.h>
#include <assert.h>

/**
 * Applies a bit mask to a 256-bit vector in place.
 *
 * The template argument picks the operation at compile time:
 *   - SetMask<true>  sets every bit of `vector` that is set in `mask`;
 *   - SetMask<false> clears every bit of `vector` that is set in `mask`.
 * Only the two explicit specializations below are defined.
 */
template <bool value>
void SetMask(const __m256i & mask, __m256i & vector);

// value == true: vector = mask | vector
template <>
inline void SetMask<true>(const __m256i & mask, __m256i & vector)
{
    const __m256i merged = _mm256_or_si256(mask, vector);
    vector = merged;
}

// value == false: vector = ~mask & vector (andnot inverts its FIRST operand)
template <>
inline void SetMask<false>(const __m256i & mask, __m256i & vector)
{
    const __m256i cleared = _mm256_andnot_si256(mask, vector);
    vector = cleared;
}

/**
 * Sets (value == true) or clears (value == false) one bit of a 256-bit
 * vector in place. The bit index is a compile-time constant, so the
 * single-bit mask is built entirely from immediates.
 *
 * @tparam position  bit index, 0..255 (bit 0 is the lowest bit of byte 0)
 * @tparam value     true to set the bit, false to clear it
 * @param  vector    vector to modify
 */
template <int position, bool value> void SetBit(__m256i & vector)
{
    // The index is split into bit-in-byte / byte-in-lane / lane fields below;
    // anything outside 0..255 does not fit those fields, so reject it here
    // (same contract as the run-time SetBit, which asserts position <= 255).
    assert(position >= 0 && position <= 255);

    // Bit within its byte (low 3 bits of the index).
    const uint8_t mask8 = 1 << (position & 7);
    // Byte within its 128-bit lane (next 4 bits of the index).
    const __m128i mask128 = _mm_insert_epi8(_mm_setzero_si128(), mask8, (position >> 3)&15);
    // 128-bit lane within the 256-bit vector (top bit of the index).
    const __m256i mask256 = _mm256_inserti128_si256(_mm256_setzero_si256(), mask128, position >> 7);
    SetMask<value>(mask256, vector);
}

// Demo driver for the compile-time SetBit: clears bit 50 of an all-ones
// vector, then sets bit 50 of an all-zeros vector. The results are not
// inspected; argc/argv are unused.
int main(int argc, char* argv[])
{
    // Every bit set -> clear bit 50.
    __m256i all_ones = _mm256_set1_epi8(-1);
    SetBit<50, false>(all_ones);

    // Every bit clear -> set bit 50.
    __m256i all_zeros = _mm256_set1_epi8(0);
    SetBit<50, true>(all_zeros);

    return 0;
}
+4
source

This is an implementation of a function that can set a single bit inside a vector:

#include <immintrin.h>
#include <assert.h>

void SetBit(__m256i & vector, size_t position, bool value)
{
    assert(position <= 255);
    uint8_t lut[32] = { 0 };
    lut[position >> 3] = 1 << (position & 7);
    __m256i mask = _mm256_loadu_si256((__m256i*)lut);
    if (value)
        vector = _mm256_or_si256(mask, vector);
    else
        vector = _mm256_andnot_si256(mask, vector);
}

// Demo driver for the run-time SetBit: clears bit 54 of an all-ones vector,
// then sets bit 54 of an all-zeros vector. The results are not inspected;
// argc/argv are unused.
int main(int argc, char* argv[])
{
    // Every bit set -> clear bit 54.
    __m256i all_ones = _mm256_set1_epi8(-1);
    SetBit(all_ones, 54, false);

    // Every bit clear -> set bit 54.
    __m256i all_zeros = _mm256_set1_epi8(0);
    SetBit(all_zeros, 54, true);

    return 0;
}
+3
source

To avoid a LUT and/or a store-and-reload through memory, you can set the k-th bit of an AVX 256-bit register like this:

/*
 * Returns x with bit k set; no memory lookup table is used.
 *
 * The vector is treated as eight 32-bit elements whose first bit indices are
 * 0, 32, ..., 224. Subtracting those from k gives, for 0 <= k <= 255, exactly
 * one element with a shift count in 0..31; every other element gets a count
 * outside that range, which the variable shift turns into zero. For k outside
 * 0..255 no element qualifies and x is returned unchanged.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 */
static inline __m256i setbit_256(__m256i x,int k){
// constants that will (hopefully) be hoisted out of a loop after inlining
  const __m256i indices  = _mm256_set_epi32(224,192,160,128,96,64,32,0);
  const __m256i all_ones = _mm256_set1_epi32(-1);
  const __m256i one      = _mm256_srli_epi32(all_ones, 31);    // set1(0x1)

  const __m256i kvec = _mm256_set1_epi32(k);
// if 0<=k<=255 then kvec-indices has exactly one element with a value between 0 and 31
  const __m256i shiftcounts = _mm256_sub_epi32(kvec, indices);
// shift counts outside 0..31 shift the bit out of the element:
// kth bit set, all 255 other bits zero.
  const __m256i kbit = _mm256_sllv_epi32(one, shiftcounts);

  return _mm256_or_si256(kbit, x);   // use _mm256_andnot_si256 to unset the k-th bit
}



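For comparison, here is a longer version that selects the target 32-bit element with a compare instead: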

#include <immintrin.h>

/*
 * Returns x with bit k set; no memory lookup table is used.
 *
 * The vector is treated as eight 32-bit elements. 1 << (k % 32) is computed
 * in every element, then a compare of k >> 5 against the element numbers
 * 0..7 keeps it only in the element that owns bit k. For k outside 0..255 the
 * compare matches no element and x is returned unchanged.
 *
 * Declared `static inline`: a plain `inline` definition in C provides no
 * external definition, so a call that is not inlined would fail to link.
 */
static inline __m256i setbit_256(__m256i x,int k){
  // constants that will (hopefully) be hoisted out of a loop after inlining
  const __m256i lane_index = _mm256_set_epi32(7,6,5,4,3,2,1,0);
  const __m256i all_ones   = _mm256_set1_epi32(-1);
  const __m256i low5_mask  = _mm256_srli_epi32(all_ones,27);   // set1(0x1f) mask for the shift within elements
  const __m256i one        = _mm256_srli_epi32(all_ones,31);   // set1(0x1)

  // create a vector with the kth bit set
  const __m256i kvec  = _mm256_set1_epi32(k);
  const __m256i count = _mm256_and_si256(low5_mask,kvec);      // shift count % 32: distance within each elem
  const __m256i bit   = _mm256_sllv_epi32(one,count);          // set1( 1<<(k%32) )

  const __m256i lane  = _mm256_srli_epi32(kvec,5);             // set1( k>>5 )
  const __m256i pick  = _mm256_cmpeq_epi32(lane,lane_index);   // all-ones in the selected element
  const __m256i kbit  = _mm256_and_si256(bit,pick);            // kth bit set, all 255 other bits zero.

  return _mm256_or_si256(kbit,x);   /* use _mm256_andnot_si256 to unset the k-th bit */
}

, , , .

The asm that clang and gcc generate for these functions can be inspected on the Godbolt compiler explorer.

+3

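If you wish to avoid LUTs, you can use BTS to set an individual bit (or BTR to clear it). There appear to be no intrinsics for these instructions (at least in GCC), so inline assembly is required (which makes this x86-only).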

0F AB /r --- BTS r/m32, r32 --- Store selected bit in CF flag and set.

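They are slow with a memory operand, but these bit-string instructions accept bit offsets that reach outside the dword selected by the addressing mode. The manual explains: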

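Some assemblers support immediate bit offsets larger than 31 by using the immediate bit offset field in combination with the displacement field of the memory operand. In this case, the low-order 3 or 5 bits (3 for 16-bit operands, 5 for 32-bit operands) of the immediate bit offset are stored in the immediate bit offset field, and the high-order bits are shifted and combined with the byte displacement in the addressing mode by the assembler. The processor will ignore the high-order bits if they are not zero.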

When accessing a bit in memory, the processor can access 4 bytes, starting with the memory address for the 32-bit operand size, using the following relationship:

Effective Address + (4 * (Bit Offset DIV 32))

In pure assembler (Intel-MASM syntax), it will look like this:

.data
  .align 16
  save db 32 dup(0)    ; 256 bits = 32 bytes of scratch space for one YMM/__m256i value
  bitNumber dd 254     ; index of the bit to set, as a UINT (here the second-to-last bit)
.code
  mov eax, bitNumber
  ...
  lea edx, save
  ; Spill and reload all 256 bits: a 128-bit xmm move would leave bits 128..255
  ; (including bit 254) out of the round trip. The unaligned form is used
  ; because the buffer is only 16-byte aligned.
  vmovdqu ymmword ptr [edx], ymm0   ; save the __m256i to memory
  bts dword ptr [edx], eax          ; set bit 254 (the 255th bit); a register offset may reach past the first dword
  vmovdqu ymm0, ymmword ptr [edx]   ; read the __m256i back into the register
  ...

If the variable is already in memory, it will be even easier.


Using inline assembly, this results in the following functions:

/*
 * Sets bit `bit` of the 256-bit value stored at *value, using BTS on memory.
 *
 * `bit` should be 0..255: with a register bit offset the instruction indexes
 * memory relative to `value`, so a larger index addresses bytes past the end
 * of *value.
 */
static inline
void set_m256i_bit(__m256i * value, uint32_t bit)
{
    // doesn't need to be volatile: we only want to run this for its effect on *value.
    //
    // The bit offset must be in a register ("r", not "ri"): with an immediate
    // the processor only uses the low 5 bits of the offset, so a constant
    // index >= 32 (e.g. after inlining) would set the wrong bit in the first
    // dword. BTS also writes CF, hence the "cc" clobber.
    __asm__ ("btsl %[bit], %[memval]\n\t"
             : [memval] "+m" (*value)
             : [bit] "r" (bit)
             : "cc");
}

/*
 * Clears bit `bit` of the 256-bit value stored at *value, using BTR on memory.
 *
 * `bit` should be 0..255: with a register bit offset the instruction indexes
 * memory relative to `value`, so a larger index addresses bytes past the end
 * of *value.
 */
static inline
void clear_m256i_bit(__m256i * value, uint32_t bit)
{
    // The bit offset must be in a register ("r", not "ri"): with an immediate
    // the processor only uses the low 5 bits of the offset, so a constant
    // index >= 32 (e.g. after inlining) would clear the wrong bit in the
    // first dword. BTR also writes CF, hence the "cc" clobber.
    __asm__ ( "btrl %[bit], %[memval]\n\t"
              : [memval] "+m" (*value)
              : [bit] "r" (bit)
              : "cc");
}

They compile to what you would expect on the Godbolt compiler explorer.

And some test code similar to the assembler above:

/* Start from an all-zero vector, set bit 254 (the second-to-last bit), then clear it again. */
__m256i value = _mm256_setzero_si256();
set_m256i_bit(&value,254);
clear_m256i_bit(&value,254);
+1
source

Source: https://habr.com/ru/post/1694830/


All Articles