erfcx(), , :
. M. Shepherd and J. G. Laframboise, " (1 + 2 x) exp (x 2) erfc x 0 ≤ x < ∞." , 36, № 153, 1981 ., . 249-253 ()
, , . , . K (x - K)/(x + K) . "" , .
erfcx . , , erfcf. FMA .
:
float my_erfcxf (float x)
{
float a, d, e, m, p, q, r, s, t;
a = fmaxf (x, 0.0f - x);
m = a - 2.0f;
p = a + 2.0f;
#if FAST_RCP_SSE
r = fast_recipf_sse (p);
#else
r = 1.0f / p;
#endif
q = m * r;
t = fmaf (q + 1.0f, -2.0f, a);
e = fmaf (q, -a, t);
q = fmaf (r, e, q);
p = 0x1.f10000p-15f;
p = fmaf (p, q, 0x1.521cc6p-13f);
p = fmaf (p, q, -0x1.6b4ffep-12f);
p = fmaf (p, q, -0x1.6e2a7cp-10f);
p = fmaf (p, q, 0x1.3c1d7ep-10f);
p = fmaf (p, q, 0x1.1cc236p-07f);
p = fmaf (p, q, -0x1.069940p-07f);
p = fmaf (p, q, -0x1.bc1b6cp-05f);
p = fmaf (p, q, 0x1.4ff8acp-03f);
p = fmaf (p, q, -0x1.54081ap-03f);
p = fmaf (p, q, -0x1.7bf5cep-04f);
p = fmaf (p, q, 0x1.1ba03ap-02f);
d = a + 0.5f;
#if FAST_RCP_SSE
r = fast_recipf_sse (d);
#else
r = 1.0f / d;
#endif
r = r * 0.5f;
q = fmaf (p, r, r);
t = q + q;
e = (p - q) + fmaf (t, -a, 1.0f);
r = fmaf (e, r, q);
if (a > 0x1.fffffep127f) r = 0.0f;
if (x < 0.0f) {
s = x * x;
d = fmaf (x, x, -s);
e = expf (s);
r = e - r;
r = fmaf (e, d + d, r);
r = r + e;
if (e > 0x1.fffffep127f) r = e;
}
return r;
}
expf(). Intel, 13.1.3.198 /fp:strict, 2,00450 ulps 2,38412 ulps . , , expf() 2.5 ulps.
, , , , , . , erfcxf() , . , , SSE ( < 2,0 ulps), .
/* Fast reciprocal approximation. HW approximation plus Newton iteration */
float fast_recipf_sse (float a)
{
__m128 t;
float e, r;
t = _mm_set_ss (a);
t = _mm_rcp_ss (t);
_mm_store_ss (&r, t);
e = fmaf (0.0f - a, r, 1.0f);
r = fmaf (e, r, r);
return r;
}
erfcx() erfcxf(), . , , . , . Intel /fp:strict 2 32 , 2,83788 ulps 2,77856 ulps .
double my_erfcx (double x)
{
double a, d, e, m, p, q, r, s, t;
a = fmax (x, 0.0 - x);
m = a - 4.0;
p = a + 4.0;
r = 1.0 / p;
q = m * r;
t = fma (q + 1.0, -4.0, a);
e = fma (q, -a, t);
q = fma (r, e, q);
p = 0x1.edcad78fc8044p-31;
p = fma (p, q, 0x1.b1548f14735d1p-30);
p = fma (p, q, -0x1.a1ad2e6c4a7a8p-27);
p = fma (p, q, -0x1.1985b48f08574p-26);
p = fma (p, q, 0x1.c6a8093ac4f83p-24);
p = fma (p, q, 0x1.31c2b2b44b731p-24);
p = fma (p, q, -0x1.b87373facb29fp-21);
p = fma (p, q, 0x1.3fef1358803b7p-22);
p = fma (p, q, 0x1.7eec072bb0be3p-18);
p = fma (p, q, -0x1.78a680a741c4ap-17);
p = fma (p, q, -0x1.9951f39295cf4p-16);
p = fma (p, q, 0x1.3be1255ce180bp-13);
p = fma (p, q, -0x1.a1df71176b791p-13);
p = fma (p, q, -0x1.8d4aaa0099bc8p-11);
p = fma (p, q, 0x1.49c673066c831p-8);
p = fma (p, q, -0x1.0962386ea02b7p-6);
p = fma (p, q, 0x1.3079edf465cc3p-5);
p = fma (p, q, -0x1.0fb06dfedc4ccp-4);
p = fma (p, q, 0x1.7fee004e266dfp-4);
p = fma (p, q, -0x1.9ddb23c3e14d2p-4);
p = fma (p, q, 0x1.16ecefcfa4865p-4);
p = fma (p, q, 0x1.f7f5df66fc349p-7);
p = fma (p, q, -0x1.1df1ad154a27fp-3);
p = fma (p, q, 0x1.dd2c8b74febf6p-3);
d = a + 0.5;
r = 1.0 / d;
r = r * 0.5;
q = fma (p, r, r);
t = q + q;
e = (p - q) + fma (t, -a, 1.0);
r = fma (e, r, q);
if (a > 0x1.fffffffffffffp1023) r = 0.0;
if (x < 0.0) {
s = x * x;
d = fma (x, x, -s);
e = exp (s);
r = e - r;
r = fma (e, d + d, r);
r = r + e;
if (e > 0x1.fffffffffffffp1023) r = e;
}
return r;
}