What is the purpose of the loop "xorl% edx,% eax, shrl $ 1,% edx"?

I have the following x86 build code:

  movl   8(%ebp), %edx  //get an argument from the caller
  movl   $0, %eax
  testl  %edx, %edx
  je     .L1            
.L2:                   // what the purpose of this loop body?
  xorl   %edx, %eax
  shrl   $1, %edx
  jne    .L2
.L1:
  andl   $1, %eax

The corresponding C code, which the tutorial gives as follows

int f1(unsigned x)
{
    int y = 0;
    while(x != 0) {
        __________;
    }
    return __________;
 }

The book asks readers to fill in the gap and answer the question "What is he doing?"

I can’t combine the body of the loop in a single C statement. I can say what the loop body does, but I have no idea about its purpose. The tutorial also states that% eax stores the return value. So ... what is the purpose

andl  $1, %eax

I do not know either.

+4
source share
1 answer

, , XOR 32- arg. .

(and $1,%eax), , .

, xor %edx,%eax : xor %edx %eax. .

shr , x . 32 , , , , x 0. (- , XOR, XOR 0 , .)


, , C / C. , y ^= (x>>=1); , x, .

, C, - , ( , x ,). , y ^= x, x>>=1; .

, , ;.

int f1(unsigned x) {
    int y = 0;
    while(x != 0) {
        y ^= x;  x>>=1;      
    }
    return y & 1;
 }

, , gcc5.3 -O3 Godbolt. - xor-zeroing mov $0, %eax gcc ret. (, , gcc, .)


: :

O (n) ( n - x). O (log2 (n)) x86, .

, . ( xorw, 16- xor .)

#untested
parity:
    # no frame-pointer boilerplate

    xor       %eax,%eax        # zero eax (so the upper 24 bits of the int return value are zeroed).  And yes, this is more efficient than mov $0, %eax
                               # so when we set %al later, the whole of %eax will be good.

    movzwl    4(%esp), %edx      # load low 16 bits of `x`.  (zero-extend into the full %edx is for efficiency.  movw 4(%esp), %dx would work too.
    xorw      6(%esp), %dx       # xor the high 16 bits of `x`
    # Two loads instead of a load + copy + shift is probably a win, because cache is fast.
    xor       %dh, %dl           # xor the two 8 bit halves, setting PF according to the result
    setnp      %al               # get the inverse of the CPU parity flag.  Remember that the rest of %eax is already zero, so the result is already zero-extended to 32-bits (int return value)
    ret

, , x86 (PF), 8 " ", xor.

np, PF= 1 : xor = 0. 0 .

, SIMD, , , 32 8 .

Zeroing eax ( xor) , , , set-flags/ setp %al/movzbl %al, %eax, x86: xor, mov ?.


, @EOF, CPUID POPCNT, popcnt , , . ( : xor - add-without-carry, , , ).

GNU C __builtin_parity __builtin_popcnt, , , ( -march=... -mpopcnt), . Intel , , -mpopcnt.

, gcc pure-C . (, clang , , gcc) popcount POPCNT, .: (

godbolt.

int parity_gnuc(unsigned x) {
    return  __builtin_parity(x);
}
    # with -mpopcnt, compiles the same as below
    # without popcnt, compiles to the same upper/lower half XOR algorithm I used, and a setnp
    # using one load and mov/shift for the 32->16 step, and still %dh, %dl for the 16->8 step.

#ifdef __POPCNT__
#include <immintrin.h>
int parity_popcnt(unsigned x) {
    return  _mm_popcnt_u32(x) & 1;
}
#endif

    # gcc does compile this to the optimal code:
    popcnt    4(%esp), %eax
    and       $1, %eax
    ret

. wiki.

+6

Source: https://habr.com/ru/post/1650858/


All Articles