, , , x [0,1], acos (x) ≈ √ (2 * (1-x)) , . , .
#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <math.h>
#define VECTORIZABLE 1
#define ARR_LEN (1 << 24)
#define MAX_ULP 1
#if VECTORIZABLE
float my_acosf (float a)
{
float r, s, t;
s = (a < 0.0f) ? 2.0f : (-2.0f);
t = fmaf (s, a, 2.0f);
s = sqrtf (t);
r = 0x1.c86000p-22f;
r = fmaf (r, t, -0x1.0258fap-19f);
r = fmaf (r, t, 0x1.90c5c4p-18f);
r = fmaf (r, t, -0x1.55668cp-19f);
r = fmaf (r, t, 0x1.c3f78ap-16f);
r = fmaf (r, t, 0x1.e8f446p-14f);
r = fmaf (r, t, 0x1.6df072p-11f);
r = fmaf (r, t, 0x1.3332a6p-08f);
r = fmaf (r, t, 0x1.555550p-05f);
r = r * t;
r = fmaf (r, s, s);
t = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, 0.0f - r);
r = (a < 0.0f) ? t : r;
return r;
}
#else
float asinf_core(float a)
{
float r, s;
s = a * a;
r = 0x1.a7f260p-5f;
r = fmaf (r, s, 0x1.29a5cep-6f);
r = fmaf (r, s, 0x1.7f0842p-5f);
r = fmaf (r, s, 0x1.329256p-4f);
r = fmaf (r, s, 0x1.555728p-3f);
r = r * s;
r = fmaf (r, a, a);
return r;
}
float my_acosf (float a)
{
float r;
r = (a > 0.0f) ? (-a) : a;
if (r > -0.5625f) {
r = fmaf (0x1.ddcb02p-1f, 0x1.aee9d6p+0f, asinf_core (r));
} else {
r = 2.0f * asinf_core (sqrtf (fmaf (0.5f, r, 0.5f)));
}
if (!(a > 0.0f) && (a >= -1.0f)) {
r = fmaf (0x1.ddcb02p+0f, 0x1.aee9d6p+0f, -r);
}
return r;
}
#endif
int main (void)
{
double darg, dref;
float ref, *a, *b;
uint32_t argi, resi, refi;
printf ("%svectorizable implementation of acos\n",
VECTORIZABLE ? "" : "non-");
a = (float *)malloc (sizeof(a[0]) * ARR_LEN);
b = (float *)malloc (sizeof(b[0]) * ARR_LEN);
argi = 0x00000000;
do {
for (int i = 0; i < ARR_LEN; i++) {
memcpy (&a[i], &argi, sizeof(a[i]));
argi++;
}
for (int i = 0; i < ARR_LEN; i++) {
b[i] = my_acosf (a[i]);
}
for (int i = 0; i < ARR_LEN; i++) {
darg = (double)a[i];
dref = acos (darg);
ref = (float)dref;
memcpy (&refi, &ref, sizeof(refi));
memcpy (&resi, &b[i], sizeof(resi));
if (llabs ((long long int)resi - (long long int)refi) > MAX_ULP) {
printf ("error > 1 ulp a[i]=% 14.6a b[i]=% 14.6a ref=% 14.6a dref=% 21.13a\n",
a[i], b[i], ref, dref);
printf ("test FAILED\n");
return EXIT_FAILURE;
}
}
printf ("^^^^ argi = %08x\n", argi);
} while (argi);
printf ("test PASSED\n");
free (a);
free (b);
return EXIT_SUCCESS;
}
, AVX2 , Compiler Explorer. , , , , Clang. Clang, , , -ffast-math, , , sqrtf() , rsqrt. , . -fno-honor-nans, -fno-math-errno, -fno-trapping-math, my_acosf() , .
AVX2 + FMA intrinsics, :
#include "immintrin.h"
__m256 _mm256_acos_ps (__m256 x)
{
const __m256 zero= _mm256_set1_ps ( 0.0f);
const __m256 two = _mm256_set1_ps ( 2.0f);
const __m256 mtwo= _mm256_set1_ps (-2.0f);
const __m256 c0 = _mm256_set1_ps ( 0x1.c86000p-22f);
const __m256 c1 = _mm256_set1_ps (-0x1.0258fap-19f);
const __m256 c2 = _mm256_set1_ps ( 0x1.90c5c4p-18f);
const __m256 c3 = _mm256_set1_ps (-0x1.55668cp-19f);
const __m256 c4 = _mm256_set1_ps ( 0x1.c3f78ap-16f);
const __m256 c5 = _mm256_set1_ps ( 0x1.e8f446p-14f);
const __m256 c6 = _mm256_set1_ps ( 0x1.6df072p-11f);
const __m256 c7 = _mm256_set1_ps ( 0x1.3332a6p-8f);
const __m256 c8 = _mm256_set1_ps ( 0x1.555550p-5f);
const __m256 pi0 = _mm256_set1_ps ( 0x1.ddcb02p+0f);
const __m256 pi1 = _mm256_set1_ps ( 0x1.aee9d6p+0f);
__m256 s, r, t, m;
s = two;
t = mtwo;
m = _mm256_cmp_ps (x, zero, _CMP_LT_OQ);
t = _mm256_blendv_ps (t, s, m);
t = _mm256_fmadd_ps (x, t, s);
s = _mm256_sqrt_ps (t);
r = c0;
r = _mm256_fmadd_ps (r, t, c1);
r = _mm256_fmadd_ps (r, t, c2);
r = _mm256_fmadd_ps (r, t, c3);
r = _mm256_fmadd_ps (r, t, c4);
r = _mm256_fmadd_ps (r, t, c5);
r = _mm256_fmadd_ps (r, t, c6);
r = _mm256_fmadd_ps (r, t, c7);
r = _mm256_fmadd_ps (r, t, c8);
r = _mm256_mul_ps (r, t);
r = _mm256_fmadd_ps (r, s, s);
t = _mm256_sub_ps (zero, r);
t = _mm256_fmadd_ps (pi0, pi1, t);
r = _mm256_blendv_ps (r, t, m);
return r;
}