Minimal opcode size for an x86-64 strlen implementation

I am investigating the minimal opcode size of an x86-64 strlen implementation for my code golf / binary executable, which must not exceed a certain size (think of the demoscene for simplicity).
The general idea is taken from here, and the size-optimization ideas from here and here.

The input string address is in rdi; the maximum length should not exceed Int32.

; strlen in 11 bytes. In: rdi = string address. Out: ecx = length.
; Clobbers eax and flags; rdi is moved by the scan.
xor   eax,eax ; 2 bytes: al = 0, the terminator byte scasb compares against
or    ecx,-1  ; 3 bytes: ecx = -1, the largest possible 32-bit scan count
repne scasb   ; 2 bytes: scan until a 0 byte is found; ecx = -len - 2
not   ecx     ; 2 bytes: ecx = -(-len - 2) - 1 = len + 1
dec   ecx     ; 2 bytes: ecx = len (terminator not counted)

The end result is in ecx, at 11 bytes in total.

The question is about setting ecx to -1.

Option 1, already shown above

; Option 1: OR with an all-ones immediate sets every bit of ecx.
; The instruction still reads the old value of ecx as an input.
or ecx,-1 ; 3 bytes

Option 2

; Option 2: ecx = rax - 1. This yields -1 only because eax was just
; xor-zeroed by the preceding instruction of the strlen sequence.
lea ecx,[rax-1] ; 3 bytes

Option 3

; Option 3: with the carry flag set, sbb computes ecx - ecx - CF = -1.
stc         ; 1 byte: CF = 1
sbb ecx,ecx ; 2 bytes: ecx = 0 - CF = -1

Option 4 is arguably the slowest

; Option 4: round-trip a sign-extended 8-bit immediate through the stack.
push -1 ; 2 bytes: pushes -1 as a 64-bit value
pop rcx ; 1 byte: rcx = -1

, :
1 ecx
2 rax
3 , ecx?
4 ?

?
, . , , .

+4
2

, rdi . , edi , 2 mov ecx, edi. , - !

, , rdi 0, . , sub edi, edx - rcx. ( , 32 , sub rdi, rdx, , . add/sub; .)

, , , 255 , mov cl, -1 (2 ). rcx 0xFF , . ( Nehalem , RCX, RCX). , mov al, -2/sub al, cl, 8- . .

, rcx , , .


lea ecx,[rax-1] , xor-zeroed eax, 1 uop ​​1 .

, , xor-zeroed, 3- lea 3- , . (. CPU 1 ).


, , cpu, .

, repne scasb . 15 Intel, , Agner Fog, >= 6n >= 2n , n - (.. 2 , , ), lea.

- ecx , lea.

repne scasb, , , , , pcmpeqb/pmovmsbk/cmp. integer cmp/jne , 4 8 ( 0), , , .. "" . , . , = 7 4, 2 1 , dword, 1 . cmp dword [rdi], first_4_bytes / jne; cmp dword [rdi+3], last_4_bytes / jne.


LEA

CPU Sandybridge lea , xor -zero ​​ . xor -zeroing /, uop ROB " ". , RAX. ( xor lea, , RAX , lea , , .)

lea port0 port1 SnB port1/port5 Skylake (2 , SnB). 1- , .

, mov ecx, -1 (5 ), ALU.

AMD Ryzen, lea r32, [m] 64- "" LEA, 2 ​​ 2 c 1. , Ryzen xor-zeroing.


microbenchmark , . , , lea - .

- , . , , , jcc, + . ( , ).

stc/sbb ecx,ecx , AMD sbb ( CF, ). Intel Haswell , sbb 2 uop ( 3 : 2 GP +). 2 , . ( , .)


, , strlen+2, - . dec ecx - 1 32- , x86-64 inc/dec. , /dec 64- .

After repne scas you have ecx = -len - 2 (if you started with ecx = -1), and `not` gives you -x-1 (i.e. +len + 2 - 1).

 ; strlen+2 variant: one sub replaces the not/dec pair when the caller can
 ; use len + 2. Preconditions below are set up as in the main sequence.
 ; eax = 0
 ; ecx = -1
repne scasb      ; ecx = -len - 2
sub   eax, ecx   ; eax = 0 - (-len - 2) = +len + 2
+1

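I ran some benchmarks on an Intel Core i7 4850HQ (Haswell, 2.3 GHz). Each loop iteration times 1000 asm sequences, and the loop is repeated 10 million times to average the result.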

I made macros that repeat each asm sequence 100 times.

// Benchmark bodies: each macro is one "zero eax, then set ecx to -1" sequence.
// Only a single copy is shown; the real definitions repeat it 100 times.
// The first three use Intel-syntax asm blocks; the last uses a GNU-style
// AT&T asm string.
#define lea100 asm{xor   eax,eax};asm { lea ecx,[rax-1] }; // <== Copy pasted 100times
#define or100 asm{xor   eax,eax};asm { or ecx,-1 }; // <== Copy pasted 100times
#define sbb100 asm{xor   eax,eax};asm { stc };asm{sbb ecx,ecx}; // <== Copy pasted 100times
// 0x6A 0xFF is the two-byte `push -1` (imm8) encoding, written as raw bytes
// (presumably to guarantee the short form is what gets measured).
#define stack100 asm ("xor %eax,%eax;.byte 0x6A; .byte 0xFF ;pop %rcx;"); // <== Copy pasted 100times

C code with the asm macros, for macOS

#include <stdio.h>
#include <CoreServices/CoreServices.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
int main(int argc, const char * argv[]) {
    uint64_t        start;
    uint64_t        end;
    uint64_t        elapsed;
    Nanoseconds     elapsedNano;

    uint64_t sum = 0;
    for (int i = 0; i < 10000000 ; i++) {

// this will become
// call       imp___stubs__mach_absolute_time  
// mov        r14, rax
    start = mach_absolute_time();

//10x lea100 for example for total 1000 

// call       imp___stubs__mach_absolute_time
// sub        rax, r14
    end = mach_absolute_time();

    elapsed = end - start;
    elapsedNano = AbsoluteToNanoseconds( *(AbsoluteTime *) &elapsed );
    uint64_t nano = * (uint64_t *) &elapsedNano;
        sum += nano;
    }
    printf("%f\n",sum/10000000.0);
    return 0;
}

; Benchmarked sequence for Option 2 (lea); its measured range follows.
xor eax,eax
lea ecx,[rax-1]

205-216

; Benchmarked sequence for Option 1 (or); its measured range follows.
xor eax,eax
or ecx,-1

321-355

; Benchmarked sequence for Option 4 (push/pop); its measured range follows.
xor eax,eax
push -1 
pop rcx 

322-359

; Benchmarked sequence for Option 3 (stc/sbb); its measured range follows.
xor eax,eax
stc     
sbb ecx,ecx

612-692

0

Source: https://habr.com/ru/post/1696194/


All Articles