Encoding 3 base-6 digits in 8 bits for unpacking performance

Question

Encoding 3 base-6 digits in 8 bits for unpacking performance

I am looking for a way to encode 3 base-6 digits (i.e. 3 numbers in the range [0,5]) in 8 bits that is efficient to unpack (in terms of a small number of basic ALU ops in the generated code). Only one digit is needed at a time, so approaches that must decode all three in order to access one are probably not good unless the cost of decoding all three is very low.

The obvious method, of course:

x = b%6;   //  8 insns
y = b/6%6; // 13 insns
z = b/36;  //  5 insns

Instruction counts are measured on x86_64 with gcc >= 4.8, which knows how to avoid the div instruction.

Another method (using a different encoding):

b *= 6
x = b>>8;
b &= 255;

b *= 6
y = b>>8;
b &= 255;

b *= 6
z = b>>8;

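(This encoding uses the full 8-bit range rather than only [0,215].) It looks efficient when all 3 digits are wanted, but wasteful when only one of them is needed.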

?

- C, assembly, , .

+4

optimization c bit-manipulation x86-64

R.. 06 . '18 21:10

2

Peter Cordes · Answer 1 · 2018-04-07T09:30:18+0000

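As discussed in the comments, a LUT would be excellent if it stays hot in cache. uint8_t LUT[3][256] would need the selector scaled by 256, which takes an extra instruction if the selector is not a compile-time constant. Scaling by 216 to pack the LUT more tightly costs only 1 or 2 more instructions. struct3 LUT[216] is nice, where the struct has a 3-byte array member. On x86 this compiles very well in position-dependent code, where the LUT base can be used as a 32-bit absolute displacement (if the table is static):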

/* Lookup table with one 3-byte entry per packed value; 216 entries = 6*6*6 possible
 * digit triples. The table contents are not shown here -- presumably entry b holds the
 * three decoded base-6 digits of b (TODO confirm against the initializer). */
struct { uint8_t vals[3]; } LUT[216];
/* Return digit number `selector` of packed byte `b` with a single table load.
 * No bounds checking is done: callers must pass b < 216 and selector < 3. */
unsigned decode_LUT(uint8_t b, unsigned selector) {
    return LUT[b].vals[selector];
}

gcc7 -O3 output on the Godbolt compiler explorer (x86-64 and AArch64):

    # gcc output for decode_LUT (non-PIC): edi = b, esi = selector, result in eax.
    movzx   edi, dil                 # zero-extend the 8-bit argument b
    mov     esi, esi                 # zero-extension to 64-bit: goes away when inlining.
    lea     rax, LUT[rdi+rdi*2]      # multiply by 3 and add the base
    movzx   eax, BYTE PTR [rax+rsi]  # then index by selector
    ret

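Silly gcc used a 3-component LEA (3-cycle latency) instead of using LUT as a disp32 in the actual load (which, I think, adds no extra latency for an indexed addressing mode).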

, - .

In PIC/PIE code this unfortunately costs 2 extra instructions:

    # gcc output for decode_LUT (PIC/PIE): edi = b, esi = selector, result in eax.
    movzx   edi, dil                # zero-extend the 8-bit argument b
    lea     rax, LUT[rip]           # RIP-relative LEA instead of absolute as part of another addressing mode
    mov     esi, esi                # zero-extend selector to 64 bits for use as an index
    lea     rdx, [rdi+rdi*2]        # rdx = b*3 (byte offset of the 3-byte entry)
    add     rax, rdx                # rax = &LUT[b]
    movzx   eax, BYTE PTR [rax+rsi] # load LUT[b].vals[selector], zero-extended
    ret

But that is still cheap, and all of the ALU instructions have single-cycle latency.

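The second ALU unpacking strategy is promising. At first I thought a single 64-bit multiply could produce b*6, b*6*6 and b*6*6*6 in different positions of the same 64-bit result: (b * ((6ULL*6*6<<32) + (36<<16) + 6))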

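But the upper byte of each result depends on masking back to 8 bits after each multiply by 6. (If you can think of a way around that, one multiply and shift would be very cheap, especially on a 64-bit ISA where the whole 64-bit product is in one register.)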

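Still, x86 and ARM can multiply by 6 and mask in 3 cycles of latency, which is the same as or better than a multiply, and fewer on Intel CPUs with zero-latency movzx r32, r8, provided the compiler uses different registers for the source and destination of the movzx.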

add    eax, eax              ; *2
lea    eax, [rax + rax*2]    ; *3
movzx  ecx, al               ; 0 cycle latency on Intel
.. repeat for next steps

ARM/AArch64 is similarly good, with add r0, r0, r0 lsl #1 to multiply by 3.

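As a branch-free way to select one of the three results, you could store each one (from ah/ch/... to get the shift for free) to an array and then load with the selector as the index. This costs store/reload latency (~5 cycles), but is cheap for throughput and avoids branch mispredictions. (Possibly a 16-bit store followed by a byte reload at offset 1 would be good, saving an extract instruction before each store on ARM.)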

In fact, this is what gcc emits if you write it like this:

/* Decode digit number `selector` of packed byte `b` using the multiply-by-6 encoding.
 * Each step multiplies the current 8-bit value by 6: the part of the product above
 * bit 7 is the next digit, and the low 8 bits are carried into the following step.
 * All three digits are computed and stored, then one is picked with an indexed load,
 * so there is no branch on `selector`. No bounds check: selector must be < 3. */
unsigned decode_ALU(uint8_t b, unsigned selector) {
    uint8_t decoded[3];
    uint32_t tmp = b * 6;
    decoded[0] = tmp >> 8;          /* first digit = overflow out of the low byte */
    tmp = 6 * (uint8_t)tmp;         /* the cast keeps only the low 8 bits before the next multiply */
    decoded[1] = tmp >> 8;
    tmp = 6 * (uint8_t)tmp;
    decoded[2] = tmp >> 8;

    return decoded[selector];
}

    # gcc output for decode_ALU: edi = b, esi = selector, result in eax.
    # decoded[0..2] is kept in the 3 bytes just below rsp (-3[rsp]..-1[rsp]); rsp is never adjusted.
    movzx   edi, dil                  # zero-extend the 8-bit argument b
    mov     esi, esi                  # zero-extend selector to 64 bits for use as an index
    lea     eax, [rdi+rdi*2]          # eax = b*3
    add     eax, eax                  # eax = b*6
    mov     BYTE PTR -3[rsp], ah      # store high half of mul-by-6
    movzx   eax, al                   # costs 1 cycle: gcc doesn't know about zero-latency movzx?
    lea     eax, [rax+rax*2]          # second step: (low byte)*3
    add     eax, eax                  # ... doubled to (low byte)*6
    mov     BYTE PTR -2[rsp], ah      # decoded[1] = bits 8..15 of the product
    movzx   eax, al                   # keep only the low byte for the last step
    lea     eax, [rax+rax*2]          # last step: (low byte)*3
    shr     eax, 7                    # (x*3)>>7 equals (x*6)>>8, so the doubling is folded into the shift
    mov     BYTE PTR -1[rsp], al      # decoded[2]
    movzx   eax, BYTE PTR -3[rsp+rsi] # return decoded[selector], zero-extended
    ret

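The first result is ready 4 cycles after the movzx of the first argument, or 5 if you count the extra 1c of latency for reading ah on Intel HSW/SKL. The next 2 results are 3 cycles apart.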

So the total latency is ~10 cycles from the b input to the result for selector = 0; otherwise it is 13 or 16 cycles.

R.. · Answer 2 · 2018-04-07T23:55:28+0000

, , : . 50 , . , , , , , . , :

(b * (int[]){2048,342,57}[i] >> 11) % 6;

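where b is the packed value and i is the index of the digit to extract. The constants 342 and 57 are the multiplicative constants GCC generates for division by 6 and 36 respectively, scaled to 11 bits. The final %6 is redundant in the /36 case (i==2), but branching to avoid it does not seem worthwhile.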

, , , , , , , .

Encoding 3 base-6 digits in 8 bits for unpacking performance

More articles: