How to speed up bit testing

I am thinking about how to speed up bit testing in the following procedure:

void histSubtractFromBits(uint64* cursor, uint16* hist){
    // Snapshot the 256-bit string as four 64-bit bitsets before touching hist.
    const std::bitset<64> words[4] = {
        std::bitset<64>(cursor[0]),
        std::bitset<64>(cursor[1]),
        std::bitset<64>(cursor[2]),
        std::bitset<64>(cursor[3]),
    };
    // Word w covers histogram bins [64*w, 64*w + 63]; each set bit takes one
    // off the bin with the same overall bit index.
    for(int w = 0; w < 4; w++){
        uint16* bins = hist + 64*w;
        for(int pos = 0; pos < 64; pos++){
            bins[pos] -= words[w].test(pos);
        }
    }
}

The actual gcc implementation does a range check on the bit argument, then ANDs with a bitmask. I could do it without bitsets, using my own shifting/masking, but I am fairly sure that this will not lead to a significant speedup (tell me if I am wrong and why).

I am not very good at x86-64 assembly, but I know there are dedicated instructions for testing bits, and I know that it is theoretically possible to write inline assembly with gcc.

1) Do you think it's generally worth writing an inline-assembly analogue of the above code?

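2) If so, how would I go about it, i.e. could you show me some basic starter code/samples to point me in the right direction?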

+4
2

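As far as I can tell, you simply iterate over every bit. In that case, just shifting and masking off the LSB on each step should perform well. Something like: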

uint64_t a = *cursor;
for(int bit = 0; a != 0; bit++, a >>= 1) {
    hist[bit] -= (a & 1);
}

If you expect the bits to be sparsely set, gcc has a builtin for that, __builtin_ffsll:

// Jump straight from one set bit to the next instead of visiting every bit.
uint64_t a = *cursor;
int next;  // 1-based position of the lowest set bit still left in a, 0 when a is empty
for(int bit = 0; (next = __builtin_ffsll(a)) != 0; ) {
    bit += next;
    hist[bit - 1] -= 1;
    // next is 64 when only bit 63 is set, and shifting a 64-bit value by 64
    // is undefined behaviour. Dropping the found bit in two steps keeps each
    // shift count within 0..63.
    a >>= next - 1;
    a >>= 1;
}

, :)

: :

// Eight signed 16-bit lanes packed into one 16-byte vector (GCC vector extension).
typedef short v8hi __attribute__ ((vector_size (16)));

// table[b] holds bit j of the byte value b in lane j; filled by setup_table().
static v8hi table[256];

// Subtracts every bit of the 32 bytes at cursor from the matching 16-bit
// counter in hist, handling one input byte (eight counters) per iteration.
// Relies on setup_table() having been called beforehand.
// NOTE(review): hist is accessed through a v8hi pointer, so it presumably has
// to be 16-byte aligned -- confirm against callers.
void histSubtractFromBits(uint64_t* cursor, uint16_t* hist)
{
    uint8_t* bytes = reinterpret_cast<uint8_t*>(cursor);
    v8hi* lanes = reinterpret_cast<v8hi*>(hist);
    for(int k = 0; k != 32; ++k)
    {
        lanes[k] -= table[bytes[k]];
    }
}

// Fills table so that lane j of entry i is bit j of i (0 or 1).
void setup_table()
{
    for(int value = 0; value < 256; ++value)
    {
        int remaining = value;
        for(int lane = 0; lane < 8; ++lane)
        {
            table[value][lane] = remaining & 1;
            remaining >>= 1;
        }
    }
}

With SSE instructions this compiles to the following loop, which handles eight bits per iteration:

        # NOTE(review): presumably %rdi = cursor and %rsi = hist (first two
        # integer arguments in the SysV AMD64 calling convention) -- confirm.
        leaq    32(%rdi), %rdx          # rdx = rdi + 32: loop end, 32 bytes past the start
        .p2align 4,,10
        .p2align 3
.L2:
        movzbl  (%rdi), %eax            # load one input byte, zero-extended
        addq    $1, %rdi                # step the input pointer by one byte
        movdqa  (%rsi), %xmm0           # aligned 16-byte load from (%rsi)
        salq    $4, %rax                # byte value * 16 = byte offset into table
        psubw   table(%rax), %xmm0      # subtract the table entry as packed 16-bit words
        movdqa  %xmm0, (%rsi)           # aligned 16-byte store back to (%rsi)
        addq    $16, %rsi               # step the output pointer by 16 bytes
        cmpq    %rdx, %rdi              # loop until the input pointer reaches rdx
        jne     .L2

, , .

+6

Another option - walk all four words in a single loop, shifting each one right as you go:

// Assuming your processor has 64-bit words
//
// Subtracts each bit of the 256-bit string at cursor (four consecutive 64-bit
// words) from the matching counter in hist: bit i of word w is taken off
// hist[64*w + i]. All four words are consumed in the same loop, one bit of
// each per iteration, so the four updates are independent of one another.
void histSubtractFromBits(uint64_t const * cursor, uint16_t* hist)
{
    uint64_t a = cursor[0];
    uint64_t b = cursor[1];
    uint64_t c = cursor[2];
    uint64_t d = cursor[3];
    for (unsigned int i = 0; i < sizeof(*cursor) * CHAR_BIT; ++i)
    {
        // The low bit of each word is the bit for position i of that word.
        hist[i +   0] -= a & 1;
        hist[i +  64] -= b & 1;
        hist[i + 128] -= c & 1;
        hist[i + 192] -= d & 1;
        // Bring the next bit of every word down to the low position.
        a >>= 1;
        b >>= 1;
        c >>= 1;
        d >>= 1;
    }
}

, , :

    hist[i +   0] -= a & 1;  // take the current low bit of a off its bin
    a >>= 1;                 // bring the next bit down to the low position

.

- . , .

+2

Source: https://habr.com/ru/post/1538456/


All Articles