Vectorization of sin and cos

I played with Compiler Explorer and ran into an anomaly (I think). If I want the compiler to vectorize a sin calculation, I would write:

#include <cmath>

// NN serves two roles in this example: it is the alignment passed to
// aligned() for AT, and the value that length is assumed to be a multiple
// of inside func.
#define NN 512
// Element type used by the kernel and by func.
typedef float T;
// T carrying an aligned(NN) attribute; func takes its arrays as AT*.
typedef T __attribute__((aligned(NN))) AT;

// Scalar kernel under test: forwards x to sinf and returns the result.
// func calls it once per array element.
inline T s(const T x)
{
  return sinf(x);
}

// Writes s(x[i]) into y[i] for every i in [0, length).
// Both pointers are __restrict-qualified, i.e. the caller promises that the
// input and output arrays do not alias.
void func(AT* __restrict x, AT* __restrict y, int length)
{
  // Optimizer hint: binary minus binds tighter than &, so this tests
  // length & (NN-1), which is non-zero exactly when length is not a multiple
  // of NN (NN is a power of two). That case is declared unreachable, so
  // calling func with such a length is undefined behavior.
  if (length & NN-1) __builtin_unreachable();
  for (int i = 0; i < length; i++)
  {
    y[i] = s(x[i]);
  }
}

compile with gcc 6.2 -O3 -march=native -ffast-math and get

func(float*, float*, int):
        testl   %edx, %edx
        jle     .L10
        leaq    8(%rsp), %r10
        andq    $-32, %rsp
        pushq   -8(%r10)
        pushq   %rbp
        movq    %rsp, %rbp
        pushq   %r14
        xorl    %r14d, %r14d
        pushq   %r13
        leal    -8(%rdx), %r13d
        pushq   %r12
        shrl    $3, %r13d
        movq    %rsi, %r12
        pushq   %r10
        addl    $1, %r13d
        pushq   %rbx
        movq    %rdi, %rbx
        subq    $8, %rsp
.L4:
        vmovaps (%rbx), %ymm0
        addl    $1, %r14d
        addq    $32, %r12
        addq    $32, %rbx
        call    _ZGVcN8v_sinf      // YAY! Vectorized trig!
        vmovaps %ymm0, -32(%r12)
        cmpl    %r13d, %r14d
        jb      .L4
        vzeroupper
        addq    $8, %rsp
        popq    %rbx
        popq    %r10
        popq    %r12
        popq    %r13
        popq    %r14
        popq    %rbp
        leaq    -8(%r10), %rsp
.L10:
        ret

But when I add cosine to the function, there is no vectorization:

#include <cmath>

// NN serves two roles in this example: it is the alignment passed to
// aligned() for AT, and the value that length is assumed to be a multiple
// of inside func.
#define NN 512
// Element type used by the kernel and by func.
typedef float T;
// T carrying an aligned(NN) attribute; func takes its arrays as AT*.
typedef T __attribute__((aligned(NN))) AT;

// Scalar kernel under test: returns cosf(x) + sinf(x), i.e. both functions
// are evaluated on the same argument. func calls it once per array element.
inline T f(const T x)
{
  return cosf(x)+sinf(x);
}

// Writes f(x[i]) into y[i] for every i in [0, length).
// Both pointers are __restrict-qualified, i.e. the caller promises that the
// input and output arrays do not alias.
void func(AT* __restrict x, AT* __restrict y, int length)
{
  // Optimizer hint: binary minus binds tighter than &, so this tests
  // length & (NN-1), which is non-zero exactly when length is not a multiple
  // of NN (NN is a power of two). That case is declared unreachable, so
  // calling func with such a length is undefined behavior.
  if (length & NN-1) __builtin_unreachable();
  for (int i = 0; i < length; i++)
  {
    y[i] = f(x[i]);
  }
}

which gives:

func(float*, float*, int):
        testl   %edx, %edx
        jle     .L10
        pushq   %r12
        leal    -1(%rdx), %eax
        pushq   %rbp
        leaq    4(%rdi,%rax,4), %r12
        movq    %rsi, %rbp
        pushq   %rbx
        movq    %rdi, %rbx
        subq    $16, %rsp
.L4:
        vmovss  (%rbx), %xmm0
        leaq    8(%rsp), %rsi
        addq    $4, %rbx
        addq    $4, %rbp
        leaq    12(%rsp), %rdi
        call    sincosf               // No vectorization
        vmovss  12(%rsp), %xmm0
        vaddss  8(%rsp), %xmm0, %xmm0
        vmovss  %xmm0, -4(%rbp)
        cmpq    %rbx, %r12
        jne     .L4
        addq    $16, %rsp
        popq    %rbx
        popq    %rbp
        popq    %r12
.L10:
        ret

I see two good alternatives: either call a vectorized version of sincosf, or call the vectorized sin and cos sequentially. I tried adding -fno-builtin-sincos, to no avail.

Is this a known issue with gcc? Either way, is there a way to convince gcc to vectorize the last example?

(Also, is there a way to get gcc <6 to automatically vectorize trigonometric functions?)

+5

Source: https://habr.com/ru/post/1655173/


All Articles