Why is a single sqrt() call about twice as slow as sqrt() called inside a for loop?

I am doing an experiment to profile the time it takes to calculate one sqrt in C code. I have two strategies.

One is to directly measure one sqrt call, and the other is to execute sqrt several times in a for loop, and then calculate the average. C code is very simple and shown as follows:

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

long long readTSC(void);

/*
 * Times sqrt() two ways and prints the elapsed TSC ticks for each.
 *
 * Usage: timing <n> <v>
 *   n  number of sqrt() calls made inside the timed loop (strategy II)
 *   v  argument of the single timed sqrt() call (strategy I)
 *
 * Returns 0 on success, 1 when either argument is missing.
 */
int main(int argc, char** argv)
{
    /* Both operands come from the command line; without this check a
       missing argument would hand a null pointer to atoi()/atof(). */
    if (argc < 3) {
        fprintf(stderr, "usage: %s <n> <v>\n", argc > 0 ? argv[0] : "timing");
        return 1;
    }

    int    n = atoi(argv[1]);
    //v is read at run time so the compiler cannot precompute sqrt(v)
    double v = atof(argv[2]);
    long long tm;            //elapsed TSC ticks
    double x;                //result of sqrt()

    //-- strategy I: time one sqrt() call ---
    tm = readTSC();
    x  = sqrt(v);
    tm = readTSC() - tm;
    printf("x=%15.6e\n",x);  //printing x keeps the sqrt() above from being optimized out
    printf("%lld clocks\n",tm);

    double sum = 0.0;
    int i;

    //-- strategy II: time n sqrt() calls in a loop --
    tm = readTSC();
    for ( i = 0; i < n; i++ ) {
        sum += sqrt((double) i);
    }
    tm = readTSC() - tm;

    printf("%lld clocks\n",tm);   //total for the whole loop, not a per-call average
    printf("%15.6e\n",sum);       //printing sum keeps the loop from being optimized out
    return 0;
}

/*
 * Reads the x86 time stamp counter with the rdtsc instruction.
 *
 * Returns the 64-bit counter value (EDX:EAX) as a long long.
 */
long long readTSC(void)
{
  unsigned int lo, hi;
  /* volatile: every call must execute its own rdtsc. Without it the compiler
     may treat an asm that has no inputs as a pure computation and merge or
     move the two reads that bracket the timed code. */
  __asm__ __volatile__ ("rdtsc"
        : "=a" (lo),     /* low 32 bits, delivered in eax */
          "=d" (hi));    /* high 32 bits, delivered in edx */
  return (long long) (((unsigned long long) hi << 32) | lo);
}

Before running the code, I expected that the timing result of strategy I might be slightly smaller than the result of strategy II, since strategy II also takes into account the overhead caused by the for loop and the sum.

I use the following command without O3 optimization to compile my code on an Intel Xeon E5-2680 2.7GHz computer.

gcc -o timing -lm timing.c

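However, strategy I takes about 40 clock cycles, while strategy II takes 21.8 clock cycles per sqrt on average, almost half as many.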

. , sqrt(). , sqrt() ?

// Compiled code for strategy I: the square root is computed inline with
// sqrtsd; the library sqrt is only reached through the fallback path at .L8.
call    atof
// the integer left in %eax is converted to single precision and kept at -36(%rbp)
// NOTE(review): reading %eax after atof suggests atof was compiled without a
// prototype (treated as returning int) -- confirm against the build
cvtsi2ss    %eax, %xmm0
movss   %xmm0, -36(%rbp)

//-- timing single sqrt ---
call    readTSC
// start tick count saved at -32(%rbp)
movq    %rax, -32(%rbp)
movss   -36(%rbp), %xmm1
// widen the stored single-precision value to double
cvtps2pd    %xmm1, %xmm1
//--- sqrtsd instruction
sqrtsd  %xmm1, %xmm0
// compare the result with itself: the compare is unordered (parity flag set)
// only when the result is NaN
ucomisd %xmm0, %xmm0
// NaN result -> take the library call below
jp  .L8
// result equals itself -> skip the library call
je  .L4
.L8:
// pass the original argument to the library routine
movapd  %xmm1, %xmm0
//--- C function call sqrt()
call    sqrt
.L4:
// copy the result through -72(%rbp) into -24(%rbp)
movsd   %xmm0, -72(%rbp)
movq    -72(%rbp), %rax
movq    %rax, -24(%rbp)
call    readTSC
//-- end of timing single sqrt ---

// elapsed = end tick count - start tick count, stored back at -32(%rbp)
subq    -32(%rbp), %rax
movq    %rax, -32(%rbp)
// printf(.LC0, <double at -24(%rbp)>)
movl    $.LC0, %eax
movsd   -24(%rbp), %xmm0
movq    %rax, %rdi
// %eax = 1: one vector register carries an argument to the variadic call
movl    $1, %eax
call    printf
// printf(.LC1, <elapsed ticks at -32(%rbp)>)
movl    $.LC1, %eax
movq    -32(%rbp), %rdx
movq    %rdx, %rsi
movq    %rax, %rdi
movl    $0, %eax
call    printf
// zero the 8 bytes at -16(%rbp), the accumulator the loop below adds into
movl    $0, %eax
movq    %rax, -16(%rbp)

// Compiled code for strategy II: every iteration reaches the square root
// through "call sqrt"; no inline sqrtsd appears in this loop.
call    readTSC
//-- start of for loop----
// start tick count saved at -32(%rbp)
movq    %rax, -32(%rbp)
// i = 0, kept at -4(%rbp)
movl    $0, -4(%rbp)
jmp .L5
.L6:
//(double) i
cvtsi2sd    -4(%rbp), %xmm0
//-- C function call sqrt()
call    sqrt
// load the running sum from -16(%rbp)
movsd   -16(%rbp), %xmm1
//add the running sum to sqrt(i) in %xmm0 and store it back
addsd   %xmm1, %xmm0
movsd   %xmm0, -16(%rbp)
//i++
addl    $1, -4(%rbp)
.L5:
movl    -4(%rbp), %eax
//check i<n (n is read from -40(%rbp))
cmpl    -40(%rbp), %eax
jl  .L6
//-- end of for loop--
//you can skip the rest of the part.
call    readTSC
// elapsed = end tick count - start tick count, stored back at -32(%rbp)
subq    -32(%rbp), %rax
movq    %rax, -32(%rbp)
// printf(.LC1, <elapsed ticks at -32(%rbp)>)
movl    $.LC1, %eax
movq    -32(%rbp), %rdx
movq    %rdx, %rsi
movq    %rax, %rdi
movl    $0, %eax
call    printf
// printf(.LC3, <sum at -16(%rbp)>)
movl    $.LC3, %eax
movsd   -16(%rbp), %xmm0
movq    %rax, %rdi
// %eax = 1: one vector register carries an argument to the variadic call
movl    $1, %eax
call    printf
+4
3

, sqrtsd. , ucomisd , L4.

for II sqrt . sqrt, , , , sqrtsd. .

call sqrt sqrtsd, , .

, , , . rdtsc , , , rdtsc sqrtsd rdtsc . , , sqrtsd rdtsc.

, , , sqrt . , , . Agner Fog , Ivy Bridge sqrtsd 1/8 1/14 . for , .

- , .

+4

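The E5-2680 is a Sandy Bridge CPU, and on it both the latency and the reciprocal throughput of SQRTSD are 10 to 21 cycles/instr. So, whether in a loop or not, you should measure something close to the observed 21.8 cycles. The sqrt function in GLIBC simply checks the sign of the argument and calls __ieee754_sqrt, which on x86-64 is a simple inline assembly routine that emits sqrtsd %xmm0, %xmm0.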

CPU . , sqrtsd %xmm0, %xmm0 . sqrt , sqrt 21,8 .

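Also note that RDTSC has a latency of its own. A single timed measurement therefore yields T_code_block + T_rdtsc_latency. Averaged over a loop, the result is: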

(T_code_block * n_iters + T_rdtsc_latency) / n_iters =
= T_code_block + (T_rdtsc_latency / n_iters)

For large n_iters, the second term becomes negligible.


RDTSC . TSC . , , , . , , , , .

, , taskset syscall sched_setaffinity(2). , . , , , . , . .

+7

The problem seems to be the readTSC() function. To verify this, swap the order of "Strategy I" and "Strategy II": you will then see that Strategy II takes longer. It appears that readTSC() takes longer the first time it runs.

0
source

Source: https://habr.com/ru/post/1584566/


All Articles