Accurate vector implementation of acosf ()

Simple implementations acosf()can easily get an error estimate of 1.5 ulp with respect to an infinitely accurate (mathematical) result if the platform supports smooth multiple addition (FMA). This means that the results never differ by more than one ulp from a correctly rounded result in rounding mode to the nearest or even.

However, such an implementation usually includes two main branches of code that divide the primary approximation interval [0.1] by about half, as in the code example below. This branching prohibits automatic vectorization by compilers when targeting SIMD architectures.

Is there an alternative algorithmic approach that is easier to automatic vectorization, while maintaining the same error 1.5 times? The platform supports FMA.

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;

    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
+4
source share
3 answers

A silent version of the code in the question is possible (hardly any redundant work, just some comparisons / combinations to create constants for FMA), but IDK if the compilers will auto-vectorize it.

The main additional work is useless sqrt/ fmaif all the elements had -|a| > -0.5625f, unfortunately, on the critical path.


The argument asinf_coreis equal (r > -0.5625f) ? r : sqrtf (fmaf (0.5f, r, 0.5f)).

In parallel with this, you (or the compiler) can mix the coefficients for the FMA output.

pi/2, float , 2 fmaf,

fmaf( condition?-1:2,  asinf_core_result,  condition ? pi/2 : 0)

, andps SIMD (, x86 SSE).


, - parallelism FP- FMA asinf_core.

, FMA asinf_core, . , asinf_core , , . ( SIMD a_cmp = andnot( a>0.0f, a>=-1.0f), multiplier ^ (-0.0f & a_cmp), multiplier .

FMA 0, pi/2, pi pi + pi/2. ( a r=-|a| , NaN), 2- FP 4 , AVX vpermilps ( ). , 4 , 2- LUT!

, , - . , x86 ( 2 uops 1). Skylake (, vblendvps) ( 5). ILP, , , uop ALU, 5. ( blend Haswell 2 uops 5, , vpermilps ymm,ymm,ymm).

-1, 1, -2 2.


, - ( 8 vblendvps) gcc7.3 -O3 -march=skylake -ffast-math. , autovectorization:/, , gcc rsqrtps + Newton ( FMA?!?), -mrecip=none, .

Autovectorizes 5 vblendvps clang5.0 ( ). . Godbolt. , , , .

// I think this is far more than enough digits for float precision, but wouldn't hurt to use a standard constant instead of what I typed from memory.
static const float pi_2 = 3.1415926535897932384626433 / 2;
static const float pi = 3.1415926535897932384626433;
//static const float pi_plus_pi_2 = 3.1415926535897932384626433 * 3.0 / 2;

/* maximum error UNKNOWN, completely UNTESTED */
float my_acosf_branchless (float a)
{
    float r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    bool a_in_range = !(a > 0.0f) && (a >= -1.0f);

    bool rsmall = (r > -0.5625f);
    float asinf_arg = rsmall ? r : sqrtf (fmaf (0.5f, r, 0.5f));

    float asinf_res = asinf_core(asinf_arg);

#if 0
    r = fmaf( rsmall?-1.0f:2.0f,  asinf_res,  rsmall ? pi_2 : 0);
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
#else
    float fma_mul = rsmall? -1.0f:2.0f;
    fma_mul = a_in_range ? -fma_mul : fma_mul;
    float fma_add = rsmall ? pi_2 : 0;
    fma_add = a_in_range ? fma_add + pi : fma_add;
    // to vectorize, turn the 2 conditions into a 2-bit integer.
    // Use vpermilps as a 2-bit LUT of float constants

    // clang doesn't see the LUT trick, but otherwise appears non-terrible at this blending.

    r = fmaf(asinf_res, fma_mul, fma_add);
#endif
    return r;
}

- , 1024 float; . Godbolt.

TODO: .

+2

, , , x [0,1], acos (x) ≈ √ (2 * (1-x)) , . , .

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>

#define VECTORIZABLE 1
#define ARR_LEN      (1 << 24)
#define MAX_ULP      1 /* deviation from correctly rounded result */

#if VECTORIZABLE  
/* 
 Compute arccos(a) with a maximum error of 1.496766 ulp 
 This uses an idea from Robert Harley posting in comp.arch.arithmetic on 1996/07/12
 https://groups.google.com/forum/#!original/comp.arch.arithmetic/wqCPkCCXqWs/T9qCkHtGE2YJ
*/
float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}

#else // VECTORIZABLE

/* approximate arcsin(a) on [-0.5625,+0.5625], max ulp err = 0.95080 */
float asinf_core(float a)
{
    float r, s;
    s = a * a;
    r =             0x1.a7f260p-5f;  // 5.17513156e-2
    r = fmaf (r, s, 0x1.29a5cep-6f); // 1.81669723e-2
    r = fmaf (r, s, 0x1.7f0842p-5f); // 4.67568673e-2
    r = fmaf (r, s, 0x1.329256p-4f); // 7.48465881e-2
    r = fmaf (r, s, 0x1.555728p-3f); // 1.66670144e-1
    r = r * s;
    r = fmaf (r, a, a);
    return r;
}

/* maximum error = 1.45667 ulp */
float my_acosf (float a)
{
    float r;

    r = (a > 0.0f) ? (-a) : a; // avoid modifying the "sign" of NaNs
    if (r > -0.5625f) {
        /* arccos(x) = pi/2 - arcsin(x) */
        r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
    } else {
        /* arccos(x) = 2 * arcsin (sqrt ((1-x) / 2)) */
        r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
    }
    if (!(a > 0.0f) && (a >= -1.0f)) { // avoid modifying the "sign" of NaNs
        /* arccos (-x) = pi - arccos(x) */
        r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
    }
    return r;
}
#endif // VECTORIZABLE

int main (void)
{
    double darg, dref;
    float ref, *a, *b;
    uint32_t argi, resi, refi;

    printf ("%svectorizable implementation of acos\n", 
            VECTORIZABLE ? "" : "non-");

    a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
    b = (float *)malloc (sizeof(b[0]) * ARR_LEN);

    argi = 0x00000000;
    do {

        for (int i = 0; i < ARR_LEN; i++) {
            memcpy (&a[i], &argi, sizeof(a[i]));
            argi++;
        }

        for (int i = 0; i < ARR_LEN; i++) {
            b[i] = my_acosf (a[i]);
        }

        for (int i = 0; i < ARR_LEN; i++) {
            darg = (double)a[i];
            dref = acos (darg);
            ref = (float)dref;
            memcpy (&refi, &ref, sizeof(refi));
            memcpy (&resi, &b[i], sizeof(resi));
            if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
                printf ("error > 1 ulp a[i]=% 14.6a  b[i]=% 14.6a  ref=% 14.6a  dref=% 21.13a\n", 
                        a[i], b[i], ref, dref);
                printf ("test FAILED\n");

                return EXIT_FAILURE;
            }
        }

        printf ("^^^^ argi = %08x\n", argi);
    } while (argi);

    printf ("test PASSED\n");

    free (a);
    free (b);

    return EXIT_SUCCESS;
}

, AVX2 , Compiler Explorer. , , , , Clang. Clang, , , -ffast-math, , , sqrtf() , rsqrt. , . -fno-honor-nans, -fno-math-errno, -fno-trapping-math, my_acosf() , .

AVX2 + FMA intrinsics, :

#include "immintrin.h"

/* maximum error = 1.496766 ulp */
__m256 _mm256_acos_ps (__m256 x)
{
    const __m256 zero= _mm256_set1_ps ( 0.0f);
    const __m256 two = _mm256_set1_ps ( 2.0f);
    const __m256 mtwo= _mm256_set1_ps (-2.0f);
    const __m256 c0  = _mm256_set1_ps ( 0x1.c86000p-22f); //  4.25032340e-7
    const __m256 c1  = _mm256_set1_ps (-0x1.0258fap-19f); // -1.92483935e-6
    const __m256 c2  = _mm256_set1_ps ( 0x1.90c5c4p-18f); //  5.97197595e-6
    const __m256 c3  = _mm256_set1_ps (-0x1.55668cp-19f); // -2.54363249e-6
    const __m256 c4  = _mm256_set1_ps ( 0x1.c3f78ap-16f); //  2.69393295e-5
    const __m256 c5  = _mm256_set1_ps ( 0x1.e8f446p-14f); //  1.16575764e-4
    const __m256 c6  = _mm256_set1_ps ( 0x1.6df072p-11f); //  6.97973708e-4
    const __m256 c7  = _mm256_set1_ps ( 0x1.3332a6p-8f);  //  4.68746712e-3
    const __m256 c8  = _mm256_set1_ps ( 0x1.555550p-5f);  //  4.16666567e-2
    const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);  //  1.86637890e+0
    const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);  //  1.68325555e+0
    __m256 s, r, t, m;

    s = two;
    t = mtwo;
    m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
    t = _mm256_blendv_ps (t, s, m);
    t = _mm256_fmadd_ps (x, t, s);
    s = _mm256_sqrt_ps (t);
    r = c0;
    r = _mm256_fmadd_ps (r, t, c1);
    r = _mm256_fmadd_ps (r, t, c2);
    r = _mm256_fmadd_ps (r, t, c3);
    r = _mm256_fmadd_ps (r, t, c4);
    r = _mm256_fmadd_ps (r, t, c5);
    r = _mm256_fmadd_ps (r, t, c6);
    r = _mm256_fmadd_ps (r, t, c7);
    r = _mm256_fmadd_ps (r, t, c8);
    r = _mm256_mul_ps (r, t);
    r = _mm256_fmadd_ps (r, s, s);
    t = _mm256_sub_ps (zero, r);
    t = _mm256_fmadd_ps (pi0, pi1, t);
    r = _mm256_blendv_ps (r, t, m);
    return r;
}
+4

, .

, gcc copysignf() , . copysignf() .

gcc 4.9, gcc -std=c99 -O3 -m64 -Wall -march=haswell -fno-math-errno. sqrtf() vsqrtps. Godbolt .

#include <stdio.h>
#include <immintrin.h>
#include <math.h>

float acosf_cpsgn (float a)
{
    float r, s, t, pi2;
/*    s = (a < 0.0f) ? 2.0f : (-2.0f); */
    s = copysignf(2.0f, -a);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
/*    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r      */
/*    r = (a < 0.0f) ? t : r;                                           */
    r = copysignf(r, a);
    pi2 = 0x1.ddcb02p+0f * 0.5f;                   /* no rounding here  */
    pi2 = pi2 - copysignf(pi2, a);                 /* no rounding here  */
    t = fmaf (pi2, 0x1.aee9d6p+0f, r);   // PI-r
    return t;
}



float my_acosf (float a)
{
    float r, s, t;
    s = (a < 0.0f) ? 2.0f : (-2.0f);
    t = fmaf (s, a, 2.0f);
    s = sqrtf (t);
    r =              0x1.c86000p-22f;  //  4.25032340e-7
    r = fmaf (r, t, -0x1.0258fap-19f); // -1.92483935e-6
    r = fmaf (r, t,  0x1.90c5c4p-18f); //  5.97197595e-6
    r = fmaf (r, t, -0x1.55668cp-19f); // -2.54363249e-6
    r = fmaf (r, t,  0x1.c3f78ap-16f); //  2.69393295e-5
    r = fmaf (r, t,  0x1.e8f446p-14f); //  1.16575764e-4
    r = fmaf (r, t,  0x1.6df072p-11f); //  6.97973708e-4
    r = fmaf (r, t,  0x1.3332a6p-08f); //  4.68746712e-3
    r = fmaf (r, t,  0x1.555550p-05f); //  4.16666567e-2
    r = r * t;
    r = fmaf (r, s, s);
    t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r); // PI-r
    r = (a < 0.0f) ? t : r;
    return r;
}


/* The code from the next 2 functions is copied from the godbold link in Peter cordes'  */
/* answer https://stackoverflow.com/a/49091530/2439725  and modified                    */
int autovec_test_a (float *__restrict dst, float *__restrict src) {
    dst = __builtin_assume_aligned(dst,32);
    src = __builtin_assume_aligned(src,32);
    for (int i=0 ; i<1024 ; i++ ) {
        dst[i] = my_acosf(src[i]);
    }
    return 0;
}

int autovec_test_b (float *__restrict dst, float *__restrict src) {
    dst = __builtin_assume_aligned(dst,32);
    src = __builtin_assume_aligned(src,32);
    for (int i=0 ; i<1024 ; i++ ) {
        dst[i] = acosf_cpsgn(src[i]);
    }
    return 0;
}
+2

Source: https://habr.com/ru/post/1694378/


All Articles