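First, the LUT approach. You don't want `uint8_t LUT[3][256]`, because that requires scaling the selector by 256, which costs an extra instruction when the selector is not a compile-time constant. Scaling by 216 is no better: forming the LUT index takes 1 or 2 more instructions. `struct3 LUT[216]` is the better layout, where the struct holds a 3-byte array. On x86, this lets the index scaling be folded into the LUT addressing mode when the table has a 32-bit absolute address (position-dependent code):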
/* Table of 216 entries, each holding 3 bytes. */
struct { uint8_t vals[3]; } LUT[216];

/*
 * Return byte number `selector` from table entry `b`.
 * No bounds checking is done: the caller must pass b < 216 and selector < 3.
 */
unsigned decode_LUT(uint8_t b, unsigned selector) {
    const uint8_t *entry = LUT[b].vals;
    return entry[selector];
}
Compiled with gcc7 -O3 on Godbolt (x86-64 and AArch64 were both tried); the x86-64 output is:
# gcc output for decode_LUT, position-dependent build (LUT is used as an absolute displacement).
# In: dil = b, esi = selector.  Out: eax = LUT[b].vals[selector].
movzx edi, dil # zero-extend the 8-bit b into the full register
mov esi, esi # a 32-bit register write zero-extends selector into rsi
lea rax, LUT[rdi+rdi*2] # rax = LUT + b*3, i.e. the address of the 3-byte entry LUT[b]
movzx eax, BYTE PTR [rax+rsi] # load byte `selector` of that entry, zero-extended
ret
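Silly gcc used a 3-component LEA (3-cycle latency, and it runs on fewer ports) instead of using LUT as the disp32 of the actual load (an indexed addressing mode there would have cost no extra instruction, I think).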
, - .
In PIC/PIE code the absolute address can't be used as a displacement, so it costs 2 extra instructions:
# gcc output for decode_LUT when built position-independent: the table address
# must be materialized with a RIP-relative LEA before it can be indexed.
# In: dil = b, esi = selector.  Out: eax = LUT[b].vals[selector].
movzx edi, dil # zero-extend the 8-bit b
lea rax, LUT[rip] # RIP-relative LEA instead of absolute as part of another addressing mode
mov esi, esi # a 32-bit register write zero-extends selector into rsi
lea rdx, [rdi+rdi*2] # rdx = b*3, the byte offset of entry b
add rax, rdx # rax = address of LUT[b]
movzx eax, BYTE PTR [rax+rsi] # load byte `selector` of that entry, zero-extended
ret
, ALU - .
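An ALU approach is also possible. For example, a single 64-bit multiply can produce b*6, b*6*6 and b*6*6*6 in separate fields of a 64-bit integer: `b * ((6ULL*6*6 << 32) + (36 << 16) + 6)`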
8 6. ( , , 64- ISA, 64- ).
, x86 ARM 6 3 , , Intel movzx r32, r8, movzx.
; One multiply-by-6 step on the value x held in eax.
add eax, eax ; eax = x*2
lea eax, [rax + rax*2] ; eax = (x*2)*3 = x*6
movzx ecx, al ; keep only the low 8 bits as input to the next step -- 0 cycle latency on Intel
.. repeat for next steps
ARM/AArch64 is similarly good, with `add r0, r0, r0 lsl #1` for the multiply by 3.
, ( ah/ch/... ) , , / (~ 5 ), . (, 16- , , 1 , ARM).
, gcc , :
/*
 * Compute three values from `b` by repeated multiply-by-6 and return the one
 * picked by `selector`.
 *
 * Each step multiplies the current 8-bit value by 6; bits 8 and up of the
 * product are the output for that step, and the low 8 bits feed the next step.
 * `selector` must be 0, 1 or 2 (it is not range-checked).
 */
unsigned decode_ALU(uint8_t b, unsigned selector) {
    uint8_t digits[3];
    uint32_t acc = b;

    for (unsigned step = 0; step < 3; ++step) {
        acc = 6u * (uint8_t)acc;              /* at most 255*6, so it fits easily */
        digits[step] = (uint8_t)(acc >> 8);   /* high part is this step's output */
    }
    return digits[selector];
}
# gcc output for decode_ALU.  In: dil = b, esi = selector.  Out: eax = decoded[selector].
# decoded[0..2] is kept at rsp-3 .. rsp-1; rsp itself is never adjusted.
movzx edi, dil # zero-extend the 8-bit b
mov esi, esi # a 32-bit register write zero-extends selector into rsi
lea eax, [rdi+rdi*2] # eax = b*3
add eax, eax # eax = b*6
mov BYTE PTR -3[rsp], ah # store high half of mul-by-6
movzx eax, al # costs 1 cycle: gcc doesn't know about zero-latency movzx?
lea eax, [rax+rax*2] # eax = 3 * (low byte of previous product)
add eax, eax # eax = 6 * (low byte of previous product)
mov BYTE PTR -2[rsp], ah # decoded[1] = bits 8..15 of the second product
movzx eax, al # keep the low byte as input to the last step
lea eax, [rax+rax*2] # eax = 3 * (low byte of previous product)
shr eax, 7 # (3*x) >> 7 equals (6*x) >> 8, so the final doubling is skipped
mov BYTE PTR -1[rsp], al # decoded[2]
movzx eax, BYTE PTR -3[rsp+rsi] # reload decoded[selector] as the return value
ret
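The first result is ready 4 cycles after the initial movzx, or 5 if you count the extra 1c for reading ah on Intel HSW/SKL. The next 2 results follow 3 cycles apart.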
, ~ 10 b , = 0. - 13 16 .