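If you want sgn(-0.0f) to produce -0.0f instead of +0.0f, you can save an instruction compared to @Cory Nelson's version. The idea: select 0.0 or 1.0 based on a compare of x != 0.0f, then copy the sign bit of x onto that. See further below for a version that also propagates NaN.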
// Per-lane sign of four packed floats, returned as floats.
//   x > 0  ->  1.0f        x < 0  -> -1.0f
//   +0.0f  -> +0.0f        -0.0f  -> -0.0f  (the sign bit of zero is kept)
// NaN is not handled specially: the "not equal" compare is true for NaN,
// so a NaN lane yields +/-1.0f according to the NaN's own sign bit.
__m128 sgn_fast(__m128 x)
{
    const __m128 sign_mask = _mm_set1_ps(-0.0f);   // only the sign bit set in each lane
    const __m128 one       = _mm_set1_ps(1.0f);

    // Compare against a freshly zeroed register rather than reusing the -0.0f
    // constant: -0.0 == 0.0 in IEEE floating point, so the mask is the same
    // either way, and xor-zeroing is as cheap as a copy but starts a new
    // dependency chain (this might actually be the better choice without AVX).
    const __m128 zero = _mm_setzero_ps();

    // 1.0f in every lane where x != 0, all-zero bits elsewhere.
    const __m128 magnitude = _mm_and_ps(one, _mm_cmpneq_ps(x, zero));

    // Just the sign bit of each input lane.
    const __m128 sign = _mm_and_ps(sign_mask, x);

    return _mm_or_ps(sign, magnitude);
}
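When the input is NaN, I think this returns +/-1.0f, according to the sign of the NaN (because _mm_cmpneq_ps() is true when x is NaN: see the comparison-predicate table in the CMPPD instruction reference).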
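Without AVX, this takes fewer instructions than Cory's version (compiled with clang3.9 on the Godbolt compiler explorer). When inlined into a loop, the memory source operands can become register operands. gcc uses more instructions: it does a separate MOVAPS load and then needs an extra MOVAPS to get the return value into xmm0.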
xorps xmm1, xmm1
cmpneqps xmm1, xmm0
andps xmm0, xmmword ptr [rip + .LCPI0_0] # x_signbit
andps xmm1, xmmword ptr [rip + .LCPI0_1] # zeroone
orps xmm0, xmm1
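The critical-path latency is cmpneqps + andps + orps, which is, for example, 3 + 1 + 1 cycles on Intel Haswell. Cory's version needs two cmpps instructions to run in parallel to reach that latency, which is only possible on Skylake. On other CPUs a resource conflict adds an extra cycle of latency.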
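To propagate NaN, so that the possible outputs are -1.0f, -/+0.0f, 1.0f and NaN, we can take advantage of the fact that the all-ones bit pattern is a NaN.
Use _mm_cmpunord_ps(x,x) to get a NaN mask (or, equivalently, cmpneqps), then OR it into the result to either leave it unmodified or force it to NaN.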
// Per-lane sign of four packed floats, with NaN propagated.
//   x > 0  ->  1.0f        x < 0  -> -1.0f
//   +0.0f  -> +0.0f        -0.0f  -> -0.0f
//   NaN    -> -NaN (all bits set), whatever the input NaN's sign or payload
__m128 sgn_fast_nanpropagating(__m128 x)
{
    const __m128 zero      = _mm_setzero_ps();
    const __m128 one       = _mm_set1_ps(1.0f);
    const __m128 sign_mask = _mm_set1_ps(-0.0f);   // only the sign bit set in each lane

    // All-ones in each lane holding a NaN (x is unordered with itself), else zero.
    const __m128 is_nan = _mm_cmpunord_ps(x, x);

    // Fold the NaN mask into the sign term instead of into the final result:
    // it then does not depend on the compare below, for better ILP.
    const __m128 sign_or_nan = _mm_or_ps(is_nan, _mm_and_ps(sign_mask, x));

    // 1.0f in every lane where x != 0, all-zero bits elsewhere.
    const __m128 magnitude = _mm_and_ps(one, _mm_cmpneq_ps(x, zero));

    return _mm_or_ps(sign_or_nan, magnitude);
}
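This also compiles efficiently. Without AVX it does need a few more MOVAPS instructions to copy registers, though.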
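You might be able to do something useful with SSE4.1 BLENDVPS, but it is not the most efficient instruction on all CPUs. It is also hard to avoid treating negative zero as non-zero that way.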
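If you want an integer result, you can use SSSE3 _mm_sign_epi32(set1(1), x) to get a -1, 0 or 1 output. If -0.0f -> -1 is too sloppy, you can fix that up by ANDing with the result of _mm_cmpneq_ps(x, _mm_setzero_ps()).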
// Per-lane integer sign (-1, 0 or 1) of four packed floats, using SSSE3.
// Works purely on the raw bit pattern of each lane read as a signed 32-bit
// integer: sign bit set -> -1, all bits zero -> 0, anything else -> 1.
// Sloppy: -0.0f has its sign bit set, so it yields -1 rather than 0.
__m128i sgn_verysloppy_int_ssse3(__m128 x) {
    // Bit-for-bit reinterpretation of the floats; no numeric conversion.
    const __m128i raw_bits = _mm_castps_si128(x);

    // Negate, zero or keep the constant 1 according to the sign of raw_bits.
    return _mm_sign_epi32(_mm_set1_epi32(1), raw_bits);
}
// Per-lane integer sign (-1, 0 or 1) of four packed floats, using SSSE3.
// Gives the right answer for every input, including -0.0f -> 0.
// NaN -> -1 or 1 according to its sign bit, never 0.
__m128i sgn_int_ssse3(__m128 x) {
    // All-ones in lanes where x != 0.0f. Both zeros compare equal to 0.0f,
    // and the "not equal" compare is true for NaN.
    const __m128i nonzero_mask =
        _mm_castps_si128(_mm_cmpneq_ps(x, _mm_setzero_ps()));

    // Sign taken from the raw bit pattern; on its own this maps -0.0f to -1.
    const __m128i sloppy_sign =
        _mm_sign_epi32(_mm_set1_epi32(1), _mm_castps_si128(x));

    // Clearing the lanes that compared equal to zero repairs the -0.0f case.
    return _mm_and_si128(nonzero_mask, sloppy_sign);
}