Visits all free slots in the bit field

I have a uint64 array and for all canceled bits (0s), I am doing some evaluations.

Estimates are not very expensive, but very few bits are not set. Profiling says that I spend a lot of time in the search-next-mess logic.

Is there a faster way (on Core2duo)?

My current code may skip many high 1s:

for(int y=0; y<height; y++) {
  uint64_t xbits = ~board[y];
  int x = 0;
  while(xbits) {
    if(xbits & 1) {
      ... with x and y
    }
    x++;
    xbits >>= 1;
  }
}

(And any discussion on how / if SIMD / CUDA-ise it will be an intriguing touch!)

+3
source share
12 answers

Here is a quick micro benchmark; run it if you can get statistics for your system and please add your own algorithms!

Command line:

g++ -o bit_twiddle_mirco_opt bit_twiddle_mirco_opt.cpp -O9 -fomit-frame-pointer -DNDEBUG -march=native

And the code:

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <stdint.h>

static unsigned long get_usecs() {
    struct timeval tv;
    gettimeofday(&tv,NULL);
    return tv.tv_sec*1000000+tv.tv_usec;
}

enum { MAX_HEIGHT = 64 };
uint64_t board[MAX_HEIGHT];
int xsum, ysum;

void evaluate(int x,int y) {
    xsum += x;
    ysum += y;
}

void alphaneo_unrolled_8(int height) {
    for(int y=0; y < height; y++) {
        uint64_t xbits = ~board[y];
        int x = 0;      
        while(xbits) {
            if(xbits & (1 << 0))
                evaluate(x,y);
            if(xbits & (1 << 1))
                evaluate(x+1,y);
            if(xbits & (1 << 2))
                evaluate(x+2,y);
            if(xbits & (1 << 3))
                evaluate(x+3,y);
            if(xbits & (1 << 4))
                evaluate(x+4,y);
            if(xbits & (1 << 5))
                evaluate(x+5,y);
            if(xbits & (1 << 6))
                evaluate(x+6,y);
            if(xbits & (1 << 7))
                evaluate(x+7,y);
            x+=8;
            xbits >>= 8;
        }
    }
}

void will_while(int height) {
    for(int y=0; y<height; y++) {
        uint64_t xbits = ~board[y];
        int x = 0;
        while(xbits) {
            if(xbits & 1)
                evaluate(x,y);
            xbits >>= 1;
            x++;
        }
    }
}

void will_ffs(int height) {
    for(int y=0; y<height; y++) {
        uint64_t xbits = ~board[y];
        int x = __builtin_ffsl(xbits);
        while(x) {
            evaluate(x-1,y);
            xbits >>= x;
            xbits <<= x;
            x = __builtin_ffsl(xbits);
        }
    }
}

void rnd_board(int dim) {
    for(int y=0; y<dim; y++) {
        board[y] = ~(((uint64_t)1 << dim)-1);
        for(int x=0; x<dim; x++)
            if(random() & 1)
                board[y] |= (uint64_t)1 << x;
    }
}

void test(const char* name,void(*func)(int)) {
    srandom(0);
    printf("testing %s... ",name);
    xsum = ysum = 0;
    const unsigned long start = get_usecs();
    for(int i=0; i<100000; i++) {
        const int dim = (random() % MAX_HEIGHT) + 1;
        rnd_board(dim);
        func(dim);
    }
    const unsigned long stop = get_usecs();
    printf("%lu usecs (check %d,%d)\n",stop-start,xsum,ysum);
}

int main() {
    test("will_while()",will_while);
    test("will_ffs()",will_ffs);
    test("alphaneo_unrolled_8()",alphaneo_unrolled_8);
    return 0;
}
+1

Hacker Delight . , , dwords/bytes/nibbles/etc. .

Phenom SSE4a ( , Core2 Duo), POPCNT . :

pop(x & (~x-1))

x & (~x-1) ; POPCNT.

:

    01101111 x
    10010000 ~x
    10001111 ~x-1
    00001111 x & ~x-1
pop(00001111) => 4
+6

, . , "x", ( 8 * byte-in-uint64, "x".

, 1--8 ( , ), , 4 0- ( 0 , , , ), 256 * 4 = 1k.

+3

Assemply, BSF (Bit Scan Forward) . 1 , . IIRC, XOR , 0, , BSF. x86 BSF 32- , . ( 32- , ).

+3

, Loop unwinding, -

for(int y=0; y < height; y++) {

    uint64_t xbits = ~board[y];
    int x = 0;

    while(xbits) {
        if(xbits & (1 << 0)) {
          ... with x and y
        }
        if(xbits & (1 << 1)) {
          ... with x and y
        }
        if(xbits & (1 << 2)) {
          ... with x and y
        }
        if(xbits & (1 << 3)) {
          ... with x and y
        }
        if(xbits & (1 << 4)) {
          ... with x and y
        }
        if(xbits & (1 << 5)) {
          ... with x and y
        }
        if(xbits & (1 << 6)) {
          ... with x and y
        }
        if(xbits & (1 << 7)) {
          ... with x and y
        }
        x+=8;
        xbits >>= 8;
    }
}

7 , 7 , 7 8 ...

, , 1, , ,

while (xbits) {

    if (xbits & 0xF) {

          // Process for the four bits !!!
    }

    xbits >>= 4;
} 

: , : - (

+2

- , . , .

template < int i, int x >
struct process_bit {
    inline static void apply ( int y ) { };
};

template < int x >
struct process_bit < 1, x > {
    inline static void apply ( int y ) {
        evaluate ( x, y );
    }
};

template < int x, int n >
inline void process_nibble_bits ( int y ) {
    process_bit < x & 1, n >::apply( y );
    process_bit < ( x >> 1 ) & 1, n + 1 > ::apply( y );
    process_bit < ( x >> 2 ) & 1, n + 2 > ::apply( y );
    process_bit < ( x >> 3 ) & 1, n + 3 > ::apply( y );
}


template < int n >
inline void process_nibble ( uint64_t xbits, int y ) {
    uint64_t nibble = ( xbits >> n ) & 0xf;
    if ( nibble ) {
        switch ( nibble ) {
            case 0:
            process_nibble_bits < 0, n > ( y );
            break;
            case 1:
            process_nibble_bits < 1, n > ( y );
            break;
            case 2:
            process_nibble_bits < 2, n > ( y );
            break;
            case 3:
            process_nibble_bits < 3, n > ( y );
            break;
            case 4:
            process_nibble_bits < 4, n > ( y );
            break;
            case 5:
            process_nibble_bits < 5, n > ( y );
            break;
            case 6:
            process_nibble_bits < 6, n > ( y );
            break;
            case 7:
            process_nibble_bits < 7, n > ( y );
            break;
            case 8:
            process_nibble_bits < 8, n > ( y );
            break;
            case 9:
            process_nibble_bits < 9, n > ( y );
            break;
            case 10:
            process_nibble_bits < 10, n > ( y );
            break;
            case 11:
            process_nibble_bits < 11, n > ( y );
            break;
            case 12:
            process_nibble_bits < 12, n > ( y );
            break;
            case 13:
            process_nibble_bits < 13, n > ( y );
            break;
            case 14:
            process_nibble_bits < 14, n > ( y );
            break;
            case 15:
            process_nibble_bits < 15, n > ( y );
            break;
        }
    }
}

template < int i, int n >
struct bit_tree {
    inline static void apply ( uint64_t xbits, int y ) {
        // each call to here represents scan of bits in [ n, n + 2i )
        bit_tree < i >> 1, n > ::apply(xbits, y);
        bit_tree < i >> 1, n + i > ::apply(xbits, y);
    };
};


template < int i, int n >
struct bit_tree_with_guard {
    inline static void apply ( uint64_t xbits, int y ) {
        // each call to here represents scan of bits in [ n, n + 2i )
        // so this branch to execute if any in [ n, n + i ) are set

        if ( xbits & ( ( ( ( ( uint64_t ) 1LL ) << i ) - 1 ) << n ) )
            bit_tree < i >> 1, n > ::apply(xbits, y);

        if ( xbits & ( ( ( ( ( uint64_t ) 1LL ) << i ) - 1 ) << ( n + i) ) )
            bit_tree < i >> 1, n + i > ::apply(xbits, y);
    };
};

// put guards on 8 and 16 bit blocks ( for some reason using inheritance is slower ) 
template < int n >
struct bit_tree < 8, n > {
    inline static void apply ( uint64_t xbits, int y ) {
        bit_tree_with_guard < 8, n > ::apply ( xbits, y );
    }
};
template < int n >
struct bit_tree < 16, n > {
    inline static void apply ( uint64_t xbits, int y ) {
        bit_tree_with_guard < 16, n > ::apply ( xbits, y );
    }
};


template < int n >
struct bit_tree < 2, n > {
    inline static void apply ( uint64_t xbits, int y ) {
        process_nibble < n > ( xbits, y );
    }
};


void template_nibbles(int height) {
    for (int y = 0; y < height; y++) {
        uint64_t xbits = ~board[y];
        bit_tree< 32, 0>::apply ( xbits, y );
    }
}

, ffs, , , , :

$ bin\bit_twiddle_micro_opt.exe                                               
testing will_while()... 3375000 usecs (check 1539404233,1539597930)           
testing will_ffs()... 2890625 usecs (check 675191567,1001386403)              
testing alphaneo_unrolled_8()... 3296875 usecs (check 1539404233,1539597930)  
testing template_nibbles()... 3046875 usecs (check 1539404233,1539597930)     

; , . - 16 , ++?

+2

. :

, , 1-:

int x = something;

int lsb = x ^ ((x-1) & x);

i.e. if   x = 100100
a = (x - 1) = 100011 // these two steps turn off the lsb
b = (a & x) = 100000
c = (x ^ b) = 000100 // this step detects the lsb
lsb = c

, , x ^= lsb .

lsb ( ) -, , , .

, ?

+2

- ( , ), , .

+1

, while, ~ board [y], y?

, , 64- - , ", .

, ?

+1

, , . , . . , - , , , .

+1

, , ,

if (xbits != ((uint64_t)-1))
{
   // regular code goes here
}

. , ( ) 64 .

0

Changing the lookup table version: Has a lookup table for the next inappropriate bit for 8 bits. Check the 8-bit blocks and AND for 0xFF, compare to see if the result is all 0xFF. If so, skip, look again at the table?

0
source

Source: https://habr.com/ru/post/1717615/


All Articles