How to implement sign function using SSE3?

Question

How to implement sign function using SSE3?

1) Is there a way to efficiently implement the sign function using SSE3 (without SSE4) with the following characteristics?

The input is a floating-point vector, __m128.
The output should also be an __m128, with each element being one of [-1.0f, 0.0f, 1.0f].

I tried this, but it did not work (although I think it should):

// Attempt from the question; it does not yield -1.0f / 0.0f / 1.0f.
inputVal = _mm_set_ps(-0.5, 0.5, 0.0, 3.0);
// Per-lane compare masks: all bits set where the comparison holds, all bits clear otherwise.
comp1 = _mm_cmpgt_ps(_mm_setzero_ps(), inputVal);
comp2 = _mm_cmpgt_ps(inputVal, _mm_setzero_ps());
// These casts only reinterpret the bits (float -> int -> float), so each mask comes
// back unchanged; no numeric int-to-float conversion happens here.
comp1 = _mm_castsi128_ps(_mm_castps_si128(comp1));
comp2 = _mm_castsi128_ps(_mm_castps_si128(comp2));
// Subtracts the raw mask bit patterns as floats (an all-ones pattern is a NaN, not -1.0f).
signVal = _mm_sub_ps(comp1, comp2);

2) Is there a way to create a "flag" function (I'm not sure about the correct name)? Namely, if A > B the result will be 1, and 0 otherwise. The result should be floating point (__m128), like its input.

UPDATE: It seems Cory Nelson's answer will work here:

// 1.0f in each lane where valA > valB, 0.0f elsewhere: the compare produces an
// all-ones / all-zero mask, and ANDing it with 1.0f keeps or clears the constant.
__m128 greatherThanFlag = _mm_and_ps(_mm_cmpgt_ps(valA, valB), _mm_set1_ps(1.0f));    
// Same construction for valA < valB.
__m128 lessThanFlag = _mm_and_ps(_mm_cmplt_ps(valA, valB), _mm_set1_ps(1.0f));

+4

x86 vectorization x86-64 sse simd

Royi 24 . '16 17:26

4

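If it is OK for sgn(-0.0f) to produce -0.0f instead of +0.0f, you can save an instruction or two compared to @Cory Nelson's version. See below for a version that also propagates NaN.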

- Select 0.0 or 1.0 based on a compare for x != 0.0f.
- Copy the sign bit of x onto that.

// Per-lane float sign: +/-1.0f for non-zero x, and a zero that keeps x's own sign bit
// for zero inputs (so x = -0.0f gives -0.0f).  Otherwise matches Cory's version.
// NaN lanes are not handled specially by either version.
__m128 sgn_fast(__m128 x)
{
    const __m128 sign_mask = _mm_set1_ps(-0.0f);   // only the sign bit set in each lane
    const __m128 one       = _mm_set1_ps(1.0f);

    // Comparing against -0.0f would give the same mask (-0.0 == 0.0 in IEEE floating
    // point), but a freshly zeroed register might be better without AVX: xor-zeroing
    // is as cheap as a copy and starts a new dependency chain.
    const __m128 is_nonzero = _mm_cmpneq_ps(x, _mm_setzero_ps());

    // Magnitude is 1.0f where x != 0 and 0.0f elsewhere; then OR in the sign bit of x.
    const __m128 magnitude = _mm_and_ps(is_nonzero, one);
    return _mm_or_ps(magnitude, _mm_and_ps(x, sign_mask));
}

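For a NaN input, I think it returns +/-1.0f, according to the sign of the NaN. (This is because _mm_cmpneq_ps() is true when x is NaN: see the predicate table in the CMPPD instruction documentation.)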

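Without AVX, this is fewer instructions than Cory's version (with clang3.9 on the Godbolt compiler explorer). When inlined into a loop, the memory source operands could be register source operands. gcc uses more instructions: it does a separate MOVAPS load, and then needs an extra MOVAPS to get the return value into xmm0.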

    xorps   xmm1, xmm1                            # xmm1 = 0.0f in every lane
    cmpneqps        xmm1, xmm0                    # nonzero mask: xmm1 = (0.0f != x)
    andps   xmm0, xmmword ptr [rip + .LCPI0_0]    # x_signbit
    andps   xmm1, xmmword ptr [rip + .LCPI0_1]    # zeroone
    orps    xmm0, xmm1                            # return value = x_signbit | zeroone

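The critical-path latency is cmpneqps + andps + orps, which is, for example, 3 + 1 + 1 cycles on Intel Haswell. Cory's version needs to run two cmpps instructions in parallel to achieve that latency, which is only possible on Skylake. On other CPUs, a resource conflict causes an extra cycle of latency.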

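To propagate NaN, so that the possible outputs are -1.0f, -/+0.0f, 1.0f and NaN, we can take advantage of the fact that the all-ones bit pattern is a NaN.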

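- Use _mm_cmpunord_ps(x,x) to get a NaN-mask. (Or, equivalently, cmpneqps.)
- or that onto the result, to either leave it unmodified or force it to NaN.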

// Per-lane float sign that propagates NaN: -0.0f for x = -0.0f, and an all-ones
// pattern (a NaN with its sign bit set, i.e. "-NaN") for any NaN input.
__m128 sgn_fast_nanpropagating(__m128 x)
{
    const __m128 sign_mask = _mm_set1_ps(-0.0f);   // only the sign bit set in each lane
    const __m128 one       = _mm_set1_ps(1.0f);

    // All-ones in the lanes where x is NaN (x is unordered with itself), zero elsewhere.
    const __m128 is_nan = _mm_cmpunord_ps(x, x);

    // The NaN mask is merged into the sign-bit term instead of the final result,
    // for better instruction-level parallelism.
    const __m128 sign_or_nan = _mm_or_ps(_mm_and_ps(x, sign_mask), is_nan);

    // 1.0f where x != 0, 0.0f elsewhere.
    const __m128 magnitude = _mm_and_ps(_mm_cmpneq_ps(x, _mm_setzero_ps()), one);
    return _mm_or_ps(magnitude, sign_or_nan);
}

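This compiles efficiently and barely lengthens the critical-path latency. It does take more MOVAPS instructions to copy registers without AVX, though.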

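You might be able to do something useful with SSE4.1 BLENDVPS, but it is not the most efficient instruction on all CPUs. It is also hard to avoid treating negative zero as non-zero.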

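If you want an integer result, you can use SSSE3 _mm_sign_epi32(set1(1), x) to get a -1, 0 or 1 output. If -0.0f -> -1 is too sloppy, you can fix that up by ANDing with the result of _mm_cmpneq_ps(x, _mm_setzero_ps()).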

// returns -1 for x = -0.0f
__m128i sgn_verysloppy_int_ssse3(__m128 x) {
  __m128i one = _mm_set1_epi32(1);
  __m128i sign = _mm_sign_epi32(one, _mm_castps_si128(x));
  return sign;
}

// correct results for all inputs
// NaN -> -1 or 1 according to its sign bit, never 0
__m128i sgn_int_ssse3(__m128 x) {
  __m128i one = _mm_set1_epi32(1);
  __m128i sign = _mm_sign_epi32(one, _mm_castps_si128(x));

  __m128  nonzero = _mm_cmpneq_ps(x, _mm_setzero_ps());
    return _mm_and_si128(sign, _mm_castps_si128(nonzero));
}

+4

Peter Cordes 27 . '16 23:25

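If you need a signum function for float vectors whose result is an int32_t vector, and you do not care about NaNs, a more efficient version can be implemented with integer instructions, based on the following theory.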

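If you take a floating-point number and reinterpret its bits as a signed two's-complement integer, you get 3 distinct cases (where X is an arbitrary 0 or 1, and the MSB is the sign bit):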

0 X X X X X X X X X X X X X X 1, which is > 0 (or > 0.0f as float)
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0, which is == 0 (or == 0.0f as float)
1 X X X X X X X X X X X X X X X, which is < 0 (or <= 0.0f as float)

The last case is ambiguous, since it can be the special floating-point case of negative zero, -0.0f:

1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0, which is == -0.0f == 0.0f as float

From this point on, the floating-point sign function becomes an integer function.

Using the intrinsics available with SSE3 (not SSSE3), this can be implemented as:

// Integer signum of four packed floats: returns -1, 0 or +1 per lane as int32.
// Uses only integer compares on the raw bit patterns of `a`.  A negative bit pattern
// maps to -1, except -0.0f (sign bit only), which is masked to 0.
// NaN lanes are not special-cased: they give +1 or -1 according to their sign bit.
//
// Declared `static inline`: in C99/C11 a plain `inline` definition provides no
// external definition, so any call the compiler does not inline would fail to link.
static inline __m128i _mm_signum_ps(__m128 a)
{
    const __m128i bits = _mm_castps_si128(a);
    const __m128i zero = _mm_setzero_si128();
    // Bit pattern of -0.0f, taken from the float constant so that no out-of-range
    // literal (0x80000000) has to be converted to int.
    const __m128i neg_zero = _mm_castps_si128(_mm_set1_ps(-0.0f));

    const __m128i is_pos      = _mm_cmpgt_epi32(bits, zero);
    const __m128i is_neg      = _mm_cmplt_epi32(bits, zero);
    const __m128i is_neg_zero = _mm_cmpeq_epi32(bits, neg_zero);

    // A true compare mask is all-ones, i.e. already the integer -1, so only the
    // positive mask needs to be reduced to +1.
    const __m128i pos_one = _mm_and_si128(is_pos, _mm_set1_epi32(1));

    // Clear the lanes holding -0.0f, which the integer compare classified as negative.
    return _mm_andnot_si128(is_neg_zero, _mm_or_si128(pos_one, is_neg));
}

// Integer signum of four packed floats: returns -1, 0 or +1 per lane as int32.
// Shorter form of the integer-only approach: the "negative" compare mask is used
// directly as -1 after the -0.0f lanes are removed from it.
// NaN lanes are not special-cased: they give +1 or -1 according to their sign bit.
//
// Declared `static inline`: in C99/C11 a plain `inline` definition provides no
// external definition, so any call the compiler does not inline would fail to link.
static inline __m128i _mm_signum_ps(__m128 a)
{
    const __m128i bits = _mm_castps_si128(a);
    const __m128i zero = _mm_setzero_si128();
    // Bit pattern of -0.0f, taken from the float constant so that no out-of-range
    // literal (0x80000000) has to be converted to int.
    const __m128i neg_zero = _mm_castps_si128(_mm_set1_ps(-0.0f));

    const __m128i is_neg_zero = _mm_cmpeq_epi32(bits, neg_zero);
    const __m128i is_pos      = _mm_cmpgt_epi32(bits, zero);
    const __m128i is_neg      = _mm_cmplt_epi32(bits, zero);

    // -1 for negative lanes other than -0.0f, +1 for positive lanes, 0 otherwise.
    const __m128i minus_one = _mm_andnot_si128(is_neg_zero, is_neg);
    const __m128i plus_one  = _mm_and_si128(is_pos, _mm_set1_epi32(1));
    return _mm_or_si128(minus_one, plus_one);
}

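Using one floating-point comparison, _mm_cmplt_ps, instead of the two integer comparisons _mm_cmplt_epi32/_mm_cmpeq_epi32 to handle -0.0f saves 1 cycle of latency, but it may suffer a bypass delay from switching between the floating-point and integer domains, so it may be better to stick with the integer-only implementation above. Or not: since you need an integer result, you will most likely use it in the integer domain anyway. So: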

// Integer signum of four packed floats: returns -1, 0 or +1 per lane as int32.
// Mixed-domain variant: the negative test is a float compare, so -0.0f needs no
// separate fix-up, while the positive test is an integer compare on the raw bits.
// NaN lanes are not handled specially.
//
// Declared `static inline`: in C99/C11 a plain `inline` definition provides no
// external definition, so any call the compiler does not inline would fail to link.
static inline __m128i _mm_signum_ps(__m128 a)
{
    const __m128 fzero = _mm_setzero_ps();

    // Float compare: all-ones (integer -1) where a < 0.0f; -0.0f is not less than zero.
    const __m128i is_neg = _mm_castps_si128(_mm_cmplt_ps(a, fzero));

    // Integer compare on the bit pattern: true for any positive, non-zero pattern.
    const __m128i is_pos = _mm_cmpgt_epi32(_mm_castps_si128(a), _mm_castps_si128(fzero));

    return _mm_or_si128(is_neg, _mm_and_si128(is_pos, _mm_set1_epi32(1)));
}

With -march=x86-64 -msse3 -O3, clang 3.9 compiles this to:

_mm_signum_ps(float __vector(4)):                # @_mm_signum2_ps(float __vector(4))
        xorps   xmm1, xmm1                       # fp domain: xmm1 = 0
        movaps  xmm2, xmm0                       # fp domain: copy the input, cmpltps overwrites its destination
        cmpltps xmm2, xmm1                       # fp domain: xmm2 = (a < 0.0f) mask, i.e. -1 or 0
        pcmpgtd xmm0, xmm1                       # int domain: xmm0 = (bits of a > 0) mask
        psrld   xmm0, 31                         # int domain: all-ones mask -> 1 (takes the place of the AND with 1)
        por     xmm0, xmm2                       # int domain: result = negative mask | positive one
        ret

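Except for cmpltps, each instruction has a latency of 1 and a throughput of <= 1. I think this is a fairly efficient solution, and it can be improved further with SSSE3 _mm_sign_epi32.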

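If you need floating-point results, it is better to stay entirely in the floating-point domain (instead of switching between the float/int domains), so use one of Peter's solutions.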

+3

plasmacel 27 . '16 9:05

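You are close, but your code does not quite work, because you are trying to convert a 0/-1 int to float using only a cast, which does not change the bits.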

Try this (untested):

inputVal = _mm_set_ps(-0.5, 0.5, 0.0, 3.0);
// Compare masks: all-ones (integer -1) where the comparison holds, 0 otherwise.
comp1 = _mm_cmpgt_ps(_mm_setzero_ps(), inputVal);
comp2 = _mm_cmpgt_ps(inputVal, _mm_setzero_ps());
// Reinterpret each mask as int32 and convert it numerically to float.
comp1 = _mm_cvtepi32_ps(_mm_castps_si128(comp1)); // 0/-1 => 0.0f/-1.0f
comp2 = _mm_cvtepi32_ps(_mm_castps_si128(comp2));
// x < 0: -1 - 0 = -1;  x > 0: 0 - (-1) = +1;  otherwise 0 - 0 = 0.
signVal = _mm_sub_ps(comp1, comp2);

Having said that, Cory's solution is probably more efficient.

+2

Paul R 24 . '16 17:39

Cory Nelson · Accepted Answer · 2016-12-24T17:40:32+0000

The first thing that comes to mind is perhaps the simplest:

// Per-lane float sign: 1.0f where x > 0, -1.0f where x < 0, and +0.0f otherwise
// (both +0.0f and -0.0f fail both comparisons).
__m128 sign(__m128 x)
{
    const __m128 origin = _mm_setzero_ps();

    // Compare masks: all-ones where the comparison holds, all-zero otherwise.
    const __m128 above = _mm_cmpgt_ps(x, origin);
    const __m128 below = _mm_cmplt_ps(x, origin);

    // ANDing a mask with a constant keeps the constant in matching lanes only.
    const __m128 plus_one  = _mm_and_ps(above, _mm_set1_ps(1.0f));
    const __m128 minus_one = _mm_and_ps(below, _mm_set1_ps(-1.0f));

    // At most one of the two terms is non-zero in any lane.
    return _mm_or_ps(plus_one, minus_one);
}

Depending on what you are doing, an integer result may be more useful than a floating-point one:

// Per-lane integer sign: 1 where x > 0, -1 where x < 0, and 0 otherwise.
__m128i sign(__m128 x)
{
    const __m128 origin = _mm_setzero_ps();

    // Integer 1 viewed as float bits, so it can be masked with the float AND.
    const __m128 int_one_bits = _mm_castsi128_ps(_mm_set1_epi32(1));

    const __m128 plus_one = _mm_and_ps(_mm_cmpgt_ps(x, origin), int_one_bits);
    // A true compare mask is all-ones, which is already the integer -1.
    const __m128 minus_one = _mm_cmplt_ps(x, origin);

    return _mm_castps_si128(_mm_or_ps(plus_one, minus_one));
}

How to implement sign function using SSE3?

More articles: