Saving Left Shifted Bits (SHL)

Consider the following Intel 8086 build program:

CX has a nonzero value.

L: ADD AX, AX
   ADC DX, 0
   LOOP L

I was asked to understand the code above and rewrite it to increase efficiency. As far as I understand:

  • Stores the value 2 ^ CX * AX in AX
  • Counts how many times the carry flag was set to 1 in the process and saves it in DX.

Assuming this is correct, I thought that the best code would be the SHL value in AX, CX times. SHL AX, CX However, I could not think of a way to sum the carry bits in the process. (or count the number of “1” bits in the most significant CX bits of the source AX.)

Any recommendations and help are welcome.

+4
4

, , , , . , , . , , , , .

, AX 55312 ( ). CX 4, , , DX .

  • 1: 55312 + 55312 16- , , AX 45088. , DX= 1. CX 3.
  • 2: 45088 + 45088 , , AX 24640.
    DX= 2; CX= 2.
  • 3: 24640 + 24640 , , AX 49280.
    DX= 2; CX= 1.
  • 4: 49280 + 4928 , , AX 33024.
    DX= 3; CX= 0.

, , DX 3. :

1101 1000  0001 0000
                  
bit 15             bit 0

: (1) 4 (CX) 3, DX.

, , - , , , , , .

, , , AX - , CX :

  • CX AX, .
  • AX.

Intel Nehalem, AMD Barcelona - SHR, POPCNT . :

; AX == input value
; CX == number of iterations

neg    cx
add    cx, 16     ; cx = 16 - cx

shr    ax, cl     ; ax = ax << cx

popcnt ax, ax     ; ax = # of 1 bits in ax

. /; 4 . , .

, , POPCNT ? , . /. Pentium - :

; AX == input value
; CX == number of iterations

neg  cx
add  cx, 16     ; cx = 16 - cx

shr  ax, cl     ; ax = ax << cx

; emulate popcnt
mov  dx, ax
shr  dx, 1
and  dx, 21845
sub  ax, dx
mov  cx, ax
and  ax, 13107
shr  cx, 2
and  cx, 13107
add  cx, ax
mov  dx, cx
shr  dx, 4
add  dx, cx
and  dx, 3855
mov  ax, dx
shr  ax, 8
add  ax, dx
and  ax, 63

16- , , , . , , , 8088/8086! , , , , , , , , . 8088/8086, , . 1- XLAT :

LUT DB   0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
; AX == input value
; CX == number of iterations

neg  cx
add  cx, 16            ; cx = 16 - cx

shr  ax, cl            ; ax = ax << cx

; emulate popcnt using LUT
mov  bx, OFFSET LUT
xlat                   ; equivalent to: mov al, [bx + al]
xchg al, ah
xlat                   ; equivalent to: mov al, [bx + al]
add  al, ah
xor  ah, ah

256 (LUT) , 8088/8086, . :

neg  cx               ; 3         cycles
add  cx, 16           ; 4         cycles
shr  ax, cl           ; 8+(4*CL)  cycles
mov  bx, OFFSET LUT   ; 4         cycles
xlat                  ; 11        cycles
xchg al, ah           ; 4         cycles
xlat                  ; 11        cycles
add  al, ah           ; 3         cycles
xor  ah, ah           ; 3         cycles
                      ;-----------------
                      ; 51+(4*CL) cycles

, - . 8 4 (.. , CL). , . , 50 , 120 .

:

   xor  dx, dx        ; 3 cycles
L: add  ax, ax        ; 3 cycles
   adc  dx, 0         ; 4 cycles
   loop L             ; taken: 17 cycles; not-taken: 5 cycles
                      ;---------------------------------------
                      ; 8+(24*CL) cycles

CX ( ), , . , 32 ; 400 .

, , 8086, . ( , CX ), , , LUT, , , , . :

Graph showing relative performance of two approaches (using estimated number of cycles)

CX , . CX - LOOP s. , LUT, ( , LUT ), CX. , .

. , CX 16. CX 16, "" , , , SHR . CX > 16, , CX -16. , -twiddling , . , "" , , . .

( - - CX 65535 - , LOOP ing. , , LOOP s.)

"" :

    LUT DB   0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
    ; AX == input value
    ; CX == number of iterations

    mov  bx, 16                          ; 4 cycles
    cmp  cx, bx        ; cx < 16?        ; 3 cycles
    jae  SkipShift                       ; 4 cycles (fall-through); 16 cycles (branch)
    sub  bx, cx                          ; 3 cycles
    mov  cx, bx        ; cx  -= 16       ; 2 cycles
    shr  ax, cl        ; ax <<= cx       ; 8+(4*CL) cycles
SkipShift:
    mov  bx, OFFSET LUT                  ; 4 cycles
    xlat                                 ; 11 cycles
    xchg al, ah                          ; 4 cycles
    xlat                                 ; 11 cycles
    add  al, ah                          ; 3 cycles
    xor  ah, ah                          ; 3 cycles

16 , JAE , , . JAE , , 4 . , 60 , - .

+2

cl :

   shld  dx,ax,cl
   shl   ax,cl
0

shld (, 80286 , x86), :

if CX < 16
    DX = AX SHR (16 - CX)
    AX = AX SHL 16
else
    if CX < 32
        DX = AX SHL (CX - 16)
    else
        DX = 0
    endif
    AX = 0
endif
0

- , , AX DX, popcnt ( 1 ) DX. , 32- . 8086, , 256 , 1 , :

        xor     bx,bx          
        mov     bl,dl                   ;bl = lower bits
        mov     dl,table[bx]            ;dl = lower # bits set
        mov     bl,dh                   ;bl = upper bits
        add     dl,table[bx]            ;dl = total # bits set
;       ...
        xor     dh,dh                   ;optional, clear upper bits dx

32- popcnt edi:

        mov     edx,edi                 ;edx = edi
        shr     edx,1                   ;mov upr 2 bit field bits to lwr
        and     edx,055555555h          ; and mask them
        sub     edi,edx                 ;edi = 2 bit field counts
                                        ; 0->0, 1->1, 2->1, 3->1
        mov     eax,edi
        shr     edi,02h                 ;mov upr 2 bit field counts to lwr
        and     eax,033333333h          ;eax = lwr 2 bit field counts
        and     edi,033333333h          ;edx = upr 2 bit field counts
        add     edi,eax                 ;edi = 4 bit field counts
        mov     eax,edi
        shr     eax,04h                 ;mov upr 4 bit field counts to lwr
        add     eax,edi                 ;eax = 8 bit field counts
        and     eax,00f0f0f0fh          ; after the and
        imul    eax,eax,01010101h       ;eax bit 24->28 = bit count
        shr     eax,018h                ;eax bit 0->4 = bit count
0

Source: https://habr.com/ru/post/1679662/


All Articles