How to square two complex doubles with 256-bit AVX vectors?

Matt Scarpino gives a good explanation (although he admits he's not sure it's the best algorithm; my thanks to him) of how to multiply two complex doubles with Intel AVX intrinsics. Here is his method, which I have tested:

__m256d vec1 = _mm256_setr_pd(4.0, 5.0, 13.0, 6.0);
__m256d vec2 = _mm256_setr_pd(9.0, 3.0, 6.0, 7.0);
__m256d neg  = _mm256_setr_pd(1.0, -1.0, 1.0, -1.0);

/* Step 1: Multiply vec1 and vec2 */
__m256d vec3 = _mm256_mul_pd(vec1, vec2);

/* Step 2: Switch the real and imaginary elements of vec2 */
vec2 = _mm256_permute_pd(vec2, 0x5);

/* Step 3: Negate the imaginary elements of vec2 */
vec2 = _mm256_mul_pd(vec2, neg);  

/* Step 4: Multiply vec1 and the modified vec2 */
__m256d vec4 = _mm256_mul_pd(vec1, vec2);

/* Horizontally subtract the elements in vec3 and vec4 */
vec1 = _mm256_hsub_pd(vec3, vec4);

/* Display the elements of the result vector */
double* res = (double*)&vec1;
printf("%lf %lf %lf %lf\n", res[0], res[1], res[2], res[3]);

My problem is that I want to square two complex doubles. I tried to use Matt's method like this:

struct cmplx a;
struct cmplx b;

a.r = 2.5341;
a.i = 1.843;

b.r = 1.3941;
b.i = 0.93;

__m256d zzs = squareZ(a, b);

double* res = (double*) &zzs;

printf("\nA: %f + %f,  B: %f + %f\n", res[0], res[1], res[2], res[3]);

Doing the arithmetic by hand, I confirmed that the results are correct except, as you can see, for the real part of B:

A: 3.025014 + 9.340693,  B: 0.000000 + 2.593026

So I have two questions: is there a better (simpler and/or faster) way to square two complex doubles with built-in AVX functions? If not, how can I modify Matt's code to do it?


TL:DR: just use the general complex-multiply code from the answer below, calling it with the same input for both operands; the compilers optimize away the redundant work.


Squaring simplifies the math slightly: instead of the two cross products rAiB and rBiA you only need one of them, doubled. That turns 2 mul + 2 FMA into 2 mul + 1 FMA + 1 add.

The SIMD strategy (de-interleave into separate real and imaginary vectors, do the math vertically, re-interleave) is exactly the same as for a general multiply; only the arithmetic in the middle gets cheaper.
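
As a minimal scalar sketch of that simplification (my illustration, not from the original answer):

// for z = r + i*I:  z*z = (r*r - i*i) + (2*r*i)*I
static inline void csquare_scalar(double r, double i, double *re, double *im)
{
  *re = r*r - i*i;    // 1 mul + 1 FMA once the compiler contracts it
  *im = (i + i) * r;  // 1 add + 1 mul, instead of a second mul + FMA
}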


So you can simply call cmul_manualvec() from the multiply answer below, passing the same input for both operands:

// squares 4 complex doubles from A[0..3], storing the result in dst[0..3]
void csquare_manual(double complex *restrict dst,
          const double complex *restrict A) {
  cmul_manualvec(dst, A, A);
}

gcc and clang inline this, and they optimize away the duplicated loads because both source operands are the same; clang also spots the squaring simplification in the arithmetic. Here is how it compiles (source + asm on Godbolt):

        clang3.9 -O3 -ffast-math -ffp-contract=fast -march=haswell
    vmovupd         ymm0, ymmword ptr [rsi]
    vmovupd         ymm1, ymmword ptr [rsi + 32]
    vunpcklpd       ymm2, ymm0, ymm1
    vunpckhpd       ymm0, ymm0, ymm1   # doing this shuffle first would let the first multiply start a cycle earlier.  Silly compiler.
    vmulpd          ymm1, ymm0, ymm0   # imag*imag
    vfmsub231pd     ymm1, ymm2, ymm2   # real*real - imag*imag
    vaddpd          ymm0, ymm0, ymm0   # imag+imag = 2*imag
    vmulpd          ymm0, ymm2, ymm0   # 2*imag * real
    vunpcklpd       ymm2, ymm1, ymm0
    vunpckhpd       ymm0, ymm1, ymm0
    vmovupd ymmword ptr [rdi], ymm2
    vmovupd ymmword ptr [rdi + 32], ymm0
    vzeroupper
    ret

So the redundant work really is gone: the input is de-interleaved only once and each distinct product is computed once. Note that the doubling of the imaginary part is done with a cheap VADDPD (imag+imag) rather than another VMULPD. You could of course write the intrinsics for this by hand instead of trusting the compiler to produce this asm.

Per 4 complex-double results, that is:

  • 2 loads (down from 4) + 2 stores
  • 4 shuffles (down from 6),
  • 2 VMULPD (same as before)
  • 1 FMA + 1 VADDPD (instead of 2 FMAs. VADDPD has lower latency than FMA on Haswell/Broadwell, and the same latency on Skylake).

Compare that with the 4 loads, 6 shuffles, 2 VMULPD and 2 FMAs of the general multiply below.
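
For concreteness, here is a minimal test harness (my addition, not part of either answer) that packs the asker's two values into the 4-element array these routines expect; it assumes csquare_manual() above and cmul_manualvec() from the answer below are in scope, and should be built with AVX enabled, e.g. gcc -O3 -march=haswell:

#include <stdio.h>
#include <complex.h>

int main(void)
{
  double complex A[4] = { 2.5341 + 1.843*I, 1.3941 + 0.93*I, 0, 0 };  // pad out to 4 elements
  double complex dst[4];

  csquare_manual(dst, A);          // i.e. cmul_manualvec(dst, A, A)

  printf("A: %f + %f,  B: %f + %f\n",
         creal(dst[0]), cimag(dst[0]), creal(dst[1]), cimag(dst[1]));
  // expected: A: 3.025014 + 9.340693,  B: 1.078615 + 2.593026
  return 0;
}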


This answer covers the general case of multiplying two different arrays of complex doubles. For squaring (which is slightly simpler / cheaper), see the other answer on this question.

The interleaved double complex storage format is not very SIMD-friendly if you want to do the math "vertically" with AVX. unpacklo/unpackhi shuffles fix that: they de-interleave a pair of input vectors into one vector of real parts and one of imaginary parts.

With the data split like that, the "vertical" math (each element independent of its neighbours) maps directly onto mul and FMA instructions. Each pass handles 4 complex doubles (2 input vectors from each operand).

If you only need a single multiply rather than whole arrays, it is better to keep the data interleaved and use the FMA (specifically FMADDSUB) insn; see the last section of this answer.


For reference, here is the portable version that gcc and clang can auto-vectorize; it needs -ffast-math. The generated code is not great.

#include <complex.h>

// even with -ffast-math -ffp-contract=fast, clang doesn't manage to use vfmaddsubpd, instead using vmulpd and vaddsubpd :(
// gcc does use FMA though.

// auto-vectorizes with a lot of extra shuffles
void cmul(double complex *restrict dst,
          const double complex *restrict A, const double complex *restrict B)
{       // clang and gcc change strategy slightly for i<1 or i<2, vs. i<4
  for (int i=0; i<4 ; i++) {
    dst[i] = A[i] * B[i];
  }
}

See the asm on Godbolt. It is not horrible: the 64b -> 128b VMOVDDUP broadcast-loads are cheap on Intel (see Agner Fog's insn tables). But gcc spends 4 VPERMPD shuffles before the mul/FMA work and another 4 VPERMPD / VSHUFPD after it, i.e. 8 shuffles for 4 results.


Writing the intrinsics by hand does much better than gcc's auto-vectorized code. (Annoyingly, gcc wants to get from ABCD order to ACBD order with lane-crossing shuffles rather than using the cheap in-lane VUNPCKLPD (_mm256_unpacklo_pd), whose within-lane interleave works just as well as long as the results are re-packed the same way.)

Source + asm on Godbolt. The comments track which elements end up in which vector, which makes the shuffles easy to follow.

// multiplies 4 complex doubles each from A and B, storing the result in dst[0..3]
void cmul_manualvec(double complex *restrict dst,
          const double complex *restrict A, const double complex *restrict B)
{
                                         // low element first, little-endian style
  __m256d A0 = _mm256_loadu_pd((double*)A);    // [A0r A0i  A1r A1i ] // [a b c d ]
  __m256d A2 = _mm256_loadu_pd((double*)(A+2));                       // [e f g h ]
  __m256d realA = _mm256_unpacklo_pd(A0, A2);  // [A0r A2r  A1r A3r ] // [a e c g ]
  __m256d imagA = _mm256_unpackhi_pd(A0, A2);  // [A0i A2i  A1i A3i ] // [b f d h ]
  // the in-lane behaviour of this interleaving is matched by the same in-lane behaviour when we recombine.

  __m256d B0 = _mm256_loadu_pd((double*)B);                           // [m n o p]
  __m256d B2 = _mm256_loadu_pd((double*)(B+2));                       // [q r s t]
  __m256d realB = _mm256_unpacklo_pd(B0, B2);                         // [m q o s]
  __m256d imagB = _mm256_unpackhi_pd(B0, B2);                         // [n r p t]

  // desired:  real=rArB - iAiB,  imag=rAiB + rBiA
  __m256d realprod = _mm256_mul_pd(realA, realB);
  __m256d imagprod = _mm256_mul_pd(imagA, imagB);

  __m256d rAiB     = _mm256_mul_pd(realA, imagB);
  __m256d rBiA     = _mm256_mul_pd(realB, imagA);

  // gcc and clang will contract these into FMA.  (clang needs -ffp-contract=fast)
  // Doing it manually would remove the option to compile for non-FMA targets
  __m256d real     = _mm256_sub_pd(realprod, imagprod);  // [D0r D2r | D1r D3r]
  __m256d imag     = _mm256_add_pd(rAiB, rBiA);          // [D0i D2i | D1i D3i]

  // interleave the separate real and imaginary vectors back into packed format
  __m256d dst0 = _mm256_shuffle_pd(real, imag, 0b0000);  // [D0r D0i | D1r D1i]
  __m256d dst2 = _mm256_shuffle_pd(real, imag, 0b1111);  // [D2r D2i | D3r D3i]
  _mm256_storeu_pd((double*)dst, dst0);
  _mm256_storeu_pd((double*)(dst+2), dst2);   
}

  Godbolt asm output: gcc6.2 -O3 -ffast-math -ffp-contract=fast -march=haswell
    vmovupd         ymm0, YMMWORD PTR [rsi+32]
    vmovupd         ymm3, YMMWORD PTR [rsi]
    vmovupd         ymm1, YMMWORD PTR [rdx]
    vunpcklpd       ymm5, ymm3, ymm0
    vunpckhpd       ymm3, ymm3, ymm0
    vmovupd         ymm0, YMMWORD PTR [rdx+32]
    vunpcklpd       ymm4, ymm1, ymm0
    vunpckhpd       ymm1, ymm1, ymm0
    vmulpd          ymm2, ymm1, ymm3
    vmulpd          ymm0, ymm4, ymm3
    vfmsub231pd     ymm2, ymm4, ymm5     # separate mul/sub contracted into FMA
    vfmadd231pd     ymm0, ymm1, ymm5
    vunpcklpd       ymm1, ymm2, ymm0
    vunpckhpd       ymm0, ymm2, ymm0
    vmovupd         YMMWORD PTR [rdi], ymm1
    vmovupd         YMMWORD PTR [rdi+32], ymm0
    vzeroupper
    ret

Per 4 results (two 32B vectors of output), this costs:

  • 4 loads (32B each)
  • 2 stores (32B each)
  • 6 shuffles (one for each load and store)
  • 2 VMULPD
  • 2 VFMA...something
  • (only 4 arithmetic instructions in total, with 0 standalone adds: they are contracted into the FMAs)
  • latency on Intel Skylake (not counting the loads/stores): 14 cycles = 4c for the 4 shuffles until the second VMULPD can start + 4c (second VMULPD) + 4c (second vfmadd231pd) + 1c (shuffle; the first result vector is ready 1c earlier) + 1c (shuffle for the second result vector)

So for throughput this bottlenecks on the shuffle port (only 1 shuffle per clock, vs. 2 MUL/FMA/ADD per clock on Intel Haswell and later). Still, the loads, stores and multiply/FMA work all leave plenty of spare throughput for surrounding code.


Compare this with Matt's code (see the question above), unrolled to produce the same 4 results, i.e. two iterations of his 2-at-a-time sequence:

  • 6 loads/stores (same as above)
  • 6 shuffle uops (VHSUBPD costs 2 shuffle uops on Intel and AMD)
  • 4 VMULPD
  • 2 adds (hidden inside the VHSUBPDs; none of the muls can be contracted into FMAs)
  • (+ whatever the negation costs). Matt multiplies by 1.0/-1.0; flipping the sign bits with an XOR is cheaper (i.e. XORPD with -0.0); see the sketch after this list.
  • latency on Intel Skylake: 11 cycles = 1c (the vpermilpd and a vxorpd can run in parallel) + 4c (the second vmulpd) + 6c (vhsubpd). The first vmulpd overlaps with those ops: it runs in parallel with the shuffle and the vxorpd.
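
A minimal sketch of that sign-flip trick (my illustration, not code from the answer; vec2 and the step numbers refer to Matt's code in the question):

/* replaces steps 2-3 of Matt's code: swap, then flip the sign of the
   odd (imaginary-position) elements with a bitwise XOR instead of a multiply */
__m256d signmask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0);  // -0.0 = only the sign bit set
vec2 = _mm256_permute_pd(vec2, 0x5);                      // swap real/imag, as before
vec2 = _mm256_xor_pd(vec2, signmask);                     // negate elements 1 and 3

Applying the XOR to vec1 instead of the swapped vec2 gives the same product and lets the vxorpd run in parallel with the permute, as assumed in the latency estimate above.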

So if the latency of one isolated multiply is all you care about, Matt's way holds up, but throughput is what matters when you process arrays of 4 or more, and there it loses clearly: more shuffle uops per result, and only 2 results per 32B vector of work.

Even a 4-wide version of his approach (with the VXORPD trick) cannot fold its multiplies into FMAs, because of the horizontal subtract at the end, so it cannot take advantage of the FMA units the way the de-interleaved version does. That is another reason de-interleaving wins for throughput.


The alternative: keep the data interleaved and use FMA:

This is the approach to use when you only have one or two complex multiplies to do, or when the values have to stay in the interleaved format anyway, so de-interleaving and re-packing is not worth it.

I have not written this out as C intrinsics, only as the sketch below; it is straightforward to translate. With 256-bit vectors everything happens independently within each 128-bit lane, which is exactly what we want here: there is no cross-lane interaction.

// MULTIPLY: for each AVX lane: am-bn, an+bm
//  r i   r i
//  a b   c d      // input vector 1:  a + b*i,  c + d*i
//  m n   o p      // input vector 2:  m + n*i,  o + p*i

  • get bm and bn: movshdup(a b) to duplicate the imaginary elements, then mulpd by (m n)
  • swap that to bn and bm with a shufpd. (Or swap n and m before the mul instead.)
  • get a and a: movsldup(a b) to duplicate the real elements
  • fmaddsubpd does the rest: [a|a]*[m|n] -/+ [bn|bm].

    Conveniently, SSE/AVX already have ADDSUBPD, which subtracts in the low element of each pair and adds in the high one (exactly the order complex multiplication needs, not the other way round). FMA adds FMADDSUB132PD, which folds the multiply into that same add/sub pattern (and FMSUBADD for the opposite pattern, should you ever need it). A sketch of this sequence in intrinsics follows below.
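
Since the answer only gives pseudo-code, here is a hedged intrinsics sketch of that sequence (my translation, not code from the answer), using the double-precision equivalents of movsldup/movshdup (vmovddup / vpermilpd) and taking the swap-before-the-mul variant from the second bullet; it requires AVX plus FMA3:

#include <immintrin.h>

// multiply 2 complex doubles in x by 2 in y, keeping the interleaved [r, i, r, i] layout
static inline __m256d cmul_interleaved(__m256d x, __m256d y)
{
  // x = [a b | c d] = (a+b*i, c+d*i),  y = [m n | o p]
  __m256d imag_x = _mm256_permute_pd(x, 0xF);      // [b b | d d]  duplicate imaginary parts
  __m256d real_x = _mm256_movedup_pd(x);           // [a a | c c]  duplicate real parts
  __m256d y_swap = _mm256_permute_pd(y, 0x5);      // [n m | p o]  swap real/imag of y
  __m256d bn_bm  = _mm256_mul_pd(imag_x, y_swap);  // [b*n b*m | d*p d*o]
  // even elements: a*m - b*n (real parts), odd elements: a*n + b*m (imaginary parts)
  return _mm256_fmaddsub_pd(real_x, y, bn_bm);
}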

Per 4 results: 6x shuffle, 2x mul, 2x fmaddsub. That is no fewer uops than the de-interleaved version, but there is no load/store or re-packing overhead when the values are already sitting in registers in interleaved form. Latency on Skylake = 10c = 1 (movshdup) + 4 (mulpd) + 1 (shufpd to get bn bm; the 1c movsldup for a a is off the critical path) + 4 (FMA). That is a shorter chain than the de-interleaved version, which is the point of this variant.

On AMD Bulldozer-family there is a latency bonus when a mul -> fmaddsub chain stays inside the FMA forwarding network (saving 1 cycle), so there it might be better to start with movsldup(a b) and feed that into the mulpd instead. (I have not worked that variant through in detail.)


Using this interleaved layout for squaring (again no XOR needed, just shuffles, a mul and an FMA), the sequence simplifies to:

// SQUARING: for each AVX lane: aa-bb, 2*ab
// ab bb  // movshdup + mul
// bb ab  // ^ -> shufpd

// a  a          // movsldup
// aa-bb  ab+ab  // fmaddsubpd : [a|a]*[a|b] -/+ [bb|ab]
// per 4 results: 6x shuffle, 2x mul, 2xfmaddsub
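
And the corresponding hedged intrinsics sketch (again my translation of the pseudo-code, not code from the answer), reusing the same double-precision shuffles as the multiply sketch above:

// square 2 complex doubles in x, keeping the interleaved [r, i, r, i] layout
static inline __m256d csquare_interleaved(__m256d x)
{
  // x = [a b | c d] = (a+b*i, c+d*i)
  __m256d imag_x = _mm256_permute_pd(x, 0xF);      // [b b | d d]
  __m256d real_x = _mm256_movedup_pd(x);           // [a a | c c]
  __m256d ab_bb  = _mm256_mul_pd(imag_x, x);       // [a*b b*b | c*d d*d]
  __m256d bb_ab  = _mm256_permute_pd(ab_bb, 0x5);  // [b*b a*b | d*d c*d]
  // even: a*a - b*b (real), odd: a*b + a*b = 2ab (imag)
  return _mm256_fmaddsub_pd(real_x, x, bb_ab);
}

Feeding the asker's two values in as a single vector, e.g. _mm256_setr_pd(2.5341, 1.843, 1.3941, 0.93), would square both complex numbers at once.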

I also played with tricks like (a+b) * (a+b) = aa+2ab+bb, or getting the real part from (r-i)*(r+i) = rr - ii, but could not turn either into anything cheaper. The extra intermediate rounding steps they introduce do not help the FP accuracy either.


Source: https://habr.com/ru/post/1654687/

