When optimizing the inner loop, I came across strange performance behaviors that I find difficult to understand and fix.
The following is a shortened version of the code; Roughly speaking, there is one giant array that is divided into 16 word slots, and I just add the number of leading zeros of words in each fragment. (I actually use popcnt Dan Luu code , but here I chose a simpler instruction with similar performance characteristics for “brevity”. Dan Luu code is based on the answer to this SO question , which, although it has painfully similar weird results, doesn't seem to answers my questions here.)
#include <stdint.h>
#include <time.h>
#include <stdlib.h>
#include <stdio.h>
#define ARRAY_LEN 16
static inline uint64_t clz_array(const uint64_t u[ARRAY_LEN]) {
uint64_t c0 = 0;
for (int i = 0; i < ARRAY_LEN; ++i) {
uint64_t t0;
__asm__ ("lzcnt %1, %0" : "=r"(t0) : "r"(u[i]));
c0 += t0;
}
return c0;
}
double clz_arrays(uint32_t *counts, const uint64_t *arrays, int narrays) {
clock_t t = clock();
for (int i = 0; i < narrays; ++i, arrays += ARRAY_LEN)
counts[i] = clz_array(arrays);
t = clock() - t;
return t * 1e3 / (double)CLOCKS_PER_SEC;
}
void print_stats(double t_ms, long n, double total_MiB) {
double t_s = t_ms / 1e3, thru = (n/1e6) / t_s, band = total_MiB / t_s;
printf("Time: %7.2f ms, %7.2f x 1e6 clz/s, %8.1f MiB/s\n", t_ms, thru, band);
}
int main(int argc, char *argv[]) {
long n = 1 << 20;
if (argc > 1)
n = atol(argv[1]);
long total_bytes = n * ARRAY_LEN * sizeof(uint64_t);
uint64_t *buf = malloc(total_bytes);
uint32_t *counts = malloc(sizeof(uint32_t) * n);
double t_ms, total_MiB = total_bytes / (double)(1 << 20);
printf("Total size: %.1f MiB\n", total_MiB);
t_ms = clz_arrays(counts, buf, n);
t_ms = clz_arrays(counts, buf, n);
print_stats(t_ms, n, total_MiB);
for (long i = 0; i < n*ARRAY_LEN; ++i)
buf[i] = i;
(void) clz_arrays(counts, buf, n);
t_ms = clz_arrays(counts, buf, n);
print_stats(t_ms, n, total_MiB);
free(counts);
free(buf);
return 0;
}
, clz_arrays .
( ):
$ ./clz-timing 10000000
Total size: 1220.7 MiB
Time: 47.78 ms, 209.30 x 1e6 clz/s, 25548.9 MiB/s
Time: 77.41 ms, 129.19 x 1e6 clz/s, 15769.7 MiB/s
, , Intel i7-6700HQ Intel® Core i7-6700HQ 2,60 , 3,5 . lzcnt 3 , 1 (. Agner Fog Skylake), , 8- ( uint64_t) 3,5 3.5e9 cycles/sec x 8 bytes/cycle = 28.0 GiB/s, , . 2,6 20,8 /.
, ,
(4) (), (2), , ?
, , :
perf, , -, LLC , . , , , , , , , , - , objdump -d , . , , .- "" (1) (3) , (4).
- ( "Intel (R) Xeon (R) CPU E5-2620 v3 @2.40 " ).
- GCC 4.9, 7.0 Clang 4.0. Debian, 4.14.
clz_array builtin_popcnt_unrolled_errata_manual Dan Luu, mutatis mutandis.
!