I am doing an experiment to profile the time it takes to calculate one sqrt in C code. I have two strategies.
One is to measure a single sqrt call directly, and the other is to execute sqrt many times in a for loop and then calculate the average. The C code is very simple and is shown below:
#include <math.h>
#include <stdio.h>
#include <stdlib.h>

long long readTSC(void);
/*
 * Profiles sqrt() with the time-stamp counter in two ways:
 *   1. a single sqrt(v) call bracketed by two readTSC() reads, and
 *   2. n calls, sqrt(0) .. sqrt(n-1), in a loop that is timed as a whole.
 *
 * Usage: timing <n> <v>
 *   argv[1]  n, the loop iteration count (parsed with atoi)
 *   argv[2]  v, the operand of the single timed sqrt (parsed with atof)
 *
 * Output, in order: the single result, its clock count, the clock count of
 * the whole loop, and the accumulated sum.
 *
 * Returns 0 on success, 1 when an argument is missing.
 *
 * Needs <stdio.h>, <stdlib.h> and <math.h>. Without visible prototypes a
 * pre-C99 compiler assumes atof() and sqrt() return int, which silently
 * corrupts both the operand and the results.
 */
int main(int argc, char** argv)
{
    /* Both operands come from the command line; refuse to read past argv. */
    if (argc < 3) {
        fprintf(stderr, "usage: %s <iterations> <value>\n",
                argc > 0 ? argv[0] : "timing");
        return 1;
    }

    int n = atoi(argv[1]);
    double v = atof(argv[2]);
    long long tm;
    double x;

    /* Strategy I: time exactly one sqrt call. */
    tm = readTSC();
    x = sqrt(v);
    tm = readTSC() - tm;
    /* %15.6e matches the format used for the sum below. */
    printf("x=%15.6e\n", x);
    printf("%lld clocks\n", tm);

    /* Strategy II: time n sqrt calls; the caller divides by n for the average. */
    double sum = 0.0;
    int i;

    tm = readTSC();
    for (i = 0; i < n; i++) {
        sum += sqrt((double) i);
    }
    tm = readTSC() - tm;
    printf("%lld clocks\n", tm);
    /* Printing the sum keeps the loop's result in use. */
    printf("%15.6e\n", sum);
    return 0;
}
/*
 * Reads the x86 time-stamp counter with the RDTSC instruction.
 *
 * RDTSC leaves the low 32 bits of the counter in EAX and the high 32 bits in
 * EDX; the two halves are bound directly to those registers and combined
 * into one 64-bit value.
 *
 * The asm is marked volatile so the compiler may neither drop a read nor
 * merge two reads into one: an asm with outputs and no inputs is otherwise
 * treated as a pure computation, which would make "end - start" collapse
 * to zero once optimization is enabled.
 *
 * Note: RDTSC is not a serializing instruction, so very short measured
 * regions can still be skewed by out-of-order execution.
 *
 * Returns the current counter value.
 */
long long readTSC(void)
{
    unsigned int lo;
    unsigned int hi;

    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));

    return (long long) (((unsigned long long) hi << 32) | lo);
}
Before running the code, I expected that the result of strategy I might be slightly smaller than the result of strategy II, since strategy II also takes into account the overhead caused by the for loop and the sum.
I use the following command without O3 optimization to compile my code on an Intel Xeon E5-2680 2.7GHz computer.
gcc -o timing -lm timing.c
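However, the result surprised me: strategy I takes about 40 clocks, while strategy II takes about 21.8 clocks per sqrt on average.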
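I looked at the generated assembly, shown below. For the single timed call the compiler emits an inline sqrtsd instruction and only falls back to the C function sqrt(). In the loop, on the other hand, every iteration calls the C function sqrt(). Why, then, is the single measurement slower than the average over the loop of sqrt() calls?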
// Excerpt of the compiler output for main (x86-64, AT&T syntax), starting at
// the atof call. Stack slots as used below: -36(%rbp) = v, -32(%rbp) = tm,
// -24(%rbp) = x, -16(%rbp) = sum, -4(%rbp) = i, -40(%rbp) = n.
call atof
// The return value is taken from %eax as an integer and converted to single
// precision. NOTE(review): this looks like atof had no visible prototype
// (implicit int return) -- confirm that <stdlib.h> is included.
cvtsi2ss %eax, %xmm0
// v is kept as a 4-byte float in this build.
movss %xmm0, -36(%rbp)
//-- timing single sqrt ---
call readTSC
// save the start timestamp in tm
movq %rax, -32(%rbp)
// reload v and widen it from float to double
movss -36(%rbp), %xmm1
cvtps2pd %xmm1, %xmm1
//--- sqrtsd instruction
sqrtsd %xmm1, %xmm0
// compare the result with itself: unordered (parity flag set) means NaN
ucomisd %xmm0, %xmm0
// NaN result -> take the library path at .L8
jp .L8
// ordinary result -> skip the library call
je .L4
.L8:
// pass the original operand as the argument
movapd %xmm1, %xmm0
//--- C function call sqrt()
// reached only when the inline result was NaN; presumably so the library
// can handle the error case (e.g. errno) -- not shown in this listing
call sqrt
.L4:
// store the result into x (via a temporary slot and %rax)
movsd %xmm0, -72(%rbp)
movq -72(%rbp), %rax
movq %rax, -24(%rbp)
call readTSC
//-- end of timing single sqrt ---
// tm = end timestamp - start timestamp
subq -32(%rbp), %rax
movq %rax, -32(%rbp)
// printf(.LC0, x): format address in %rdi, x in %xmm0; %eax = 1 tells the
// variadic callee that one vector register is in use (System V AMD64
// convention). .LC0 is presumably the "x=..." format string.
movl $.LC0, %eax
movsd -24(%rbp), %xmm0
movq %rax, %rdi
movl $1, %eax
call printf
// printf(.LC1, tm): tm in %rsi, no vector registers used (%eax = 0)
movl $.LC1, %eax
movq -32(%rbp), %rdx
movq %rdx, %rsi
movq %rax, %rdi
movl $0, %eax
call printf
// sum = 0.0 (all-zero bit pattern written to the 8-byte slot)
movl $0, %eax
movq %rax, -16(%rbp)
call readTSC
//-- start of for loop----
// save the start timestamp in tm
movq %rax, -32(%rbp)
// i = 0
movl $0, -4(%rbp)
// jump to the loop condition first
jmp .L5
.L6:
//(double) i
cvtsi2sd -4(%rbp), %xmm0
//-- C function call sqrt()
// unlike the single timed call above, no inline sqrtsd is emitted here
call sqrt
// load the running sum
movsd -16(%rbp), %xmm1
//add sqrt(i) (%xmm0) and sum (%xmm1); the new sum is left in %xmm0
addsd %xmm1, %xmm0
// store the updated sum
movsd %xmm0, -16(%rbp)
//i++
addl $1, -4(%rbp)
.L5:
movl -4(%rbp), %eax
//check i<n
cmpl -40(%rbp), %eax
jl .L6
//-- end of for loop--
//you can skip the rest of the part.
call readTSC
// tm = end timestamp - start timestamp for the whole loop
subq -32(%rbp), %rax
movq %rax, -32(%rbp)
// printf(.LC1, tm)
movl $.LC1, %eax
movq -32(%rbp), %rdx
movq %rdx, %rsi
movq %rax, %rdi
movl $0, %eax
call printf
// printf(.LC3, sum): sum in %xmm0, one vector register in use
movl $.LC3, %eax
movsd -16(%rbp), %xmm0
movq %rax, %rdi
movl $1, %eax
call printf