, , , , . , , . , , , , .
, AX 55312 ( ). CX 4, , , DX .
- 1:
55312 + 55312 16- , , AX 45088. , DX= 1. CX 3. - 2:
45088 + 45088 , , AX 24640.
DX= 2; CX= 2. - 3:
24640 + 24640 , , AX 49280.
DX= 2; CX= 1. - 4:
49280 + 4928 , , AX 33024.
DX= 3; CX= 0.
, , DX 3. :
1101 1000 0001 0000
↑ ↑
bit 15 bit 0
: (1) 4 (CX) 3, DX.
, , - , , , , , .
, , , AX - , CX :
Intel Nehalem, AMD Barcelona - SHR, POPCNT . :
; AX == input value
; CX == number of iterations
neg cx
add cx, 16 ; cx = 16 - cx
shr ax, cl ; ax = ax << cx
popcnt ax, ax ; ax = # of 1 bits in ax
. /; 4 . , .
, , POPCNT ? , . /. Pentium - :
; AX == input value
; CX == number of iterations
neg cx
add cx, 16 ; cx = 16 - cx
shr ax, cl ; ax = ax << cx
; emulate popcnt
mov dx, ax
shr dx, 1
and dx, 21845
sub ax, dx
mov cx, ax
and ax, 13107
shr cx, 2
and cx, 13107
add cx, ax
mov dx, cx
shr dx, 4
add dx, cx
and dx, 3855
mov ax, dx
shr ax, 8
add ax, dx
and ax, 63
16- , , , . , , , 8088/8086! , , , , , , , , . 8088/8086, , . 1- XLAT :
LUT DB 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
; AX == input value
; CX == number of iterations
neg cx
add cx, 16 ; cx = 16 - cx
shr ax, cl ; ax = ax << cx
; emulate popcnt using LUT
mov bx, OFFSET LUT
xlat ; equivalent to: mov al, [bx + al]
xchg al, ah
xlat ; equivalent to: mov al, [bx + al]
add al, ah
xor ah, ah
256 (LUT) , 8088/8086, . :
neg cx ; 3 cycles
add cx, 16 ; 4 cycles
shr ax, cl ; 8+(4*CL) cycles
mov bx, OFFSET LUT ; 4 cycles
xlat ; 11 cycles
xchg al, ah ; 4 cycles
xlat ; 11 cycles
add al, ah ; 3 cycles
xor ah, ah ; 3 cycles
;-----------------
; 51+(4*CL) cycles
, - . 8 4 (.. , CL). , . , 50 , 120 .
:
xor dx, dx ; 3 cycles
L: add ax, ax ; 3 cycles
adc dx, 0 ; 4 cycles
loop L ; taken: 17 cycles; not-taken: 5 cycles
;---------------------------------------
; 8+(24*CL) cycles
CX ( ), , . , 32 ; 400 .
, , 8086, . ( , CX ), , , LUT, , , , . :

CX , . CX - LOOP s. , LUT, ( , LUT ), CX. , .
. , CX 16. CX 16, "" , , , SHR . CX > 16, , CX -16. , -twiddling , . , "" , , . .
( - - CX 65535 - , LOOP ing. , , LOOP s.)
"" :
LUT DB 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8
; AX == input value
; CX == number of iterations
mov bx, 16 ; 4 cycles
cmp cx, bx ; cx < 16? ; 3 cycles
jae SkipShift ; 4 cycles (fall-through); 16 cycles (branch)
sub bx, cx ; 3 cycles
mov cx, bx ; cx -= 16 ; 2 cycles
shr ax, cl ; ax <<= cx ; 8+(4*CL) cycles
SkipShift:
mov bx, OFFSET LUT ; 4 cycles
xlat ; 11 cycles
xchg al, ah ; 4 cycles
xlat ; 11 cycles
add al, ah ; 3 cycles
xor ah, ah ; 3 cycles
16 , JAE , , . JAE , , 4 . , 60 , - .