, . - ( /) , .
double complex style SIMD-friendly " " , AVX- . unpacklo/unpackhi , .
, " " ( ) , , FMA. 4 (2 ).
, , , FMA ( FMADDSUB) insn.
gcc - , -ffast-math. , .
#include <complex.h>
/*
 * Element-wise complex product of two 4-element vectors:
 *     dst[k] = A[k] * B[k]   for k = 0..3
 *
 * Plain scalar C; any vectorization is left to the compiler.
 * All three pointers are restrict-qualified, so the arrays must not overlap.
 */
void cmul(double complex *restrict dst,
          const double complex *restrict A, const double complex *restrict B)
{
    enum { ELEMS = 4 };    /* fixed number of complex elements processed */

    const double complex *lhs = A;
    const double complex *rhs = B;
    double complex *out = dst;
    double complex *const stop = dst + ELEMS;

    while (out != stop) {
        *out++ = *lhs++ * *rhs++;
    }
}
asm Godbolt-. , ; 64b- > 128b VMOVDDUP . Intel (. Agner Fog insn tables), . , gcc 4 VPERMPD /FMA, 4 VPERMPD VSHUFPD. 8 4 .
gcc- intrinsics . (gcc, -, , ABCD, ACBD, in-lane VUNPCKLPD (_mm256_unpacklo_pd)).
Godbolt . , , .
/*
 * Hand-vectorized (AVX) element-wise complex product of four complex doubles:
 *     dst[k] = A[k] * B[k]   for k = 0..3
 *
 * The interleaved (re, im) inputs are split into a vector of real parts and a
 * vector of imaginary parts, multiplied component-wise, and re-interleaved
 * before being stored. Only unaligned loads/stores are used, so the arrays
 * need no 32-byte alignment. The pointers are restrict-qualified, so the
 * arrays must not overlap.
 *
 * Requires <immintrin.h> and a target with AVX enabled.
 */
void cmul_manualvec(double complex *restrict dst,
                    const double complex *restrict A, const double complex *restrict B)
{
    /*
     * Immediates for _mm256_shuffle_pd: bit k chooses the low (0) or high (1)
     * element, within the 128-bit lane, of the source feeding result element k.
     * Spelled in hex because 0b... literals are a compiler extension before C23.
     */
    enum { TAKE_LOW = 0x0, TAKE_HIGH = 0xF };

    /*
     * A complex double has the representation of an array of two doubles
     * {re, im}, so the buffers can be viewed as plain double arrays. The input
     * views stay const-qualified instead of casting the qualifier away.
     */
    const double *a = (const double *)A;
    const double *b = (const double *)B;
    double *out = (double *)dst;

    /* Each 256-bit load holds two complex numbers: {re, im, re, im}. */
    __m256d A0 = _mm256_loadu_pd(a);        /* A[0], A[1] */
    __m256d A2 = _mm256_loadu_pd(a + 4);    /* A[2], A[3] */
    /* In-lane unpack: element order becomes 0, 2, 1, 3 in both vectors. */
    __m256d realA = _mm256_unpacklo_pd(A0, A2);
    __m256d imagA = _mm256_unpackhi_pd(A0, A2);

    __m256d B0 = _mm256_loadu_pd(b);        /* B[0], B[1] */
    __m256d B2 = _mm256_loadu_pd(b + 4);    /* B[2], B[3] */
    __m256d realB = _mm256_unpacklo_pd(B0, B2);
    __m256d imagB = _mm256_unpackhi_pd(B0, B2);

    /* (rA + i*iA) * (rB + i*iB) = (rA*rB - iA*iB) + i*(rA*iB + rB*iA) */
    __m256d realprod = _mm256_mul_pd(realA, realB);
    __m256d imagprod = _mm256_mul_pd(imagA, imagB);
    __m256d rAiB = _mm256_mul_pd(realA, imagB);
    __m256d rBiA = _mm256_mul_pd(realB, imagA);

    __m256d real = _mm256_sub_pd(realprod, imagprod);
    __m256d imag = _mm256_add_pd(rAiB, rBiA);

    /*
     * Re-interleave: the low elements of each lane give results 0 and 1,
     * the high elements give results 2 and 3, undoing the 0, 2, 1, 3 order.
     */
    __m256d dst0 = _mm256_shuffle_pd(real, imag, TAKE_LOW);
    __m256d dst2 = _mm256_shuffle_pd(real, imag, TAKE_HIGH);

    _mm256_storeu_pd(out, dst0);            /* dst[0], dst[1] */
    _mm256_storeu_pd(out + 4, dst2);        /* dst[2], dst[3] */
}
Godbolt asm output: gcc6.2 -O3 -ffast-math -ffp-contract=fast -march=haswell
vmovupd ymm0, YMMWORD PTR [rsi+32]
vmovupd ymm3, YMMWORD PTR [rsi]
vmovupd ymm1, YMMWORD PTR [rdx]
vunpcklpd ymm5, ymm3, ymm0
vunpckhpd ymm3, ymm3, ymm0
vmovupd ymm0, YMMWORD PTR [rdx+32]
vunpcklpd ymm4, ymm1, ymm0
vunpckhpd ymm1, ymm1, ymm0
vmulpd ymm2, ymm1, ymm3
vmulpd ymm0, ymm4, ymm3
vfmsub231pd ymm2, ymm4, ymm5 # separate mul/sub contracted into FMA
vfmadd231pd ymm0, ymm1, ymm5
vunpcklpd ymm1, ymm2, ymm0
vunpckhpd ymm0, ymm2, ymm0
vmovupd YMMWORD PTR [rdi], ymm1
vmovupd YMMWORD PTR [rdi+32], ymm0
vzeroupper
ret
4 ( ) :
- 4 (32B )
- 2 (32B )
- 6 ( , )
- 2 VMULPD
- 2 VFMA... -
- ( 4 , , 0 , )
- Intel Skylake ( /): 14 = 4c 4 , VMULPD + 4 ( VMULPD) + 4c ( vfmadd231pd) + 1c ( 1c ) + 1c ( )
, . (1 , 2 MUL/FMA/ADD Intel Haswell ). : , , , .
(, 4 ). (. ).
- 6 /
- 6 (HSUBPD - 2 Intel AMD)
- 4
- 2 ( muls FMA)
- (+ ) . Matt 1.0 -1.0, XOR (.. XORPD
-0.0). - Intel Skylake : 11 . 1c (vpermilpd vxorpd ) + 4c ( vmulpd) + 6c (vhsubpd). vmulpd ops, , shuffle vxorpd. .
, , , , 4 . . , , . , 2 32B.
( 4--) FMA ( VXORPD), , FMA. , , , .
, FMA:
, , , . , .
C, . , . 256- , . .
r i r i
a b c d
m n o p
bm bn    movshdup (a b) + mulpd
bn bm    shufpd . ( n m mul)
a a      movsldup (a b)
fmaddsubpd : [a|a]*[m|n] -/+ [bn|bm].
, SSE/AVX ADDSUBPD, / / ( , - ). FMA FMADDSUB132PD, ( , FMSUBADD, ).
4 : 6x shuffle, 2x mul, 2xfmaddsub. , - , , ( ). Skylake = 10c = 1 + 4 + 1 bn bm ( 1 a a), + 4 (FMA). , , .
Bulldozer mul, mul- > fmaddsub FMA (1 ). - , movsldup (a b) mulpd. ( , , .)
- , ( XOR FMA), :
, (a+b) * (a+b) = aa+2ab+bb (r-i)*(r+i) = rr - ii, . , FP , - .