Validation for nans with intrinsics in C ++

I am new to using intrinsics, but I wanted to write a function that takes a vector of 4 paired numbers a > 1e-5 ? std::sqrt(a) : 0.0. My first instinct was this:

#include <immintrin.h>
__m256d f(__m256d a)
{
  __m256d is_valid = a > _mm256_set1_pd(1e-5);
  __m256d sqrt_val = _mm256_sqrt_pd(a);
  return is_valid * sqrt_val;
}

which according to gcc.godbolt.com compiles into the following

f(double __vector(4)):
    vsqrtpd  ymm1, ymm0
    vcmpgtpd ymm0, ymm0, YMMWORD PTR .LC0[rip]
    vmulpd   ymm0, ymm1, ymm0
    ret
.LC0:
    .long   2296604913
    .long   1055193269
    .long   2296604913
    .long   1055193269
    .long   2296604913
    .long   1055193269
    .long   2296604913
    .long   1055193269

but I worry what will happen if it sqrt_valcontains nan. I do not think that 0.0 * nanwill work. what are the best practices to do here?

Edit

After reading the comment from @ChrisCooper (and @njuffa), I was linked to another stack overflow answer, and so I will check for equality itself, and then andthis with my result.

#include <immintrin.h>
__m256d f(__m256d a)
{
  __m256d is_valid = a > _mm256_set1_pd(1e-5);
  __m256d sqrt_val = _mm256_sqrt_pd(a);
  __m256d result = is_valid * sqrt_val;
  __m256d cmpeq = result == result;
  return  _mm256_and_pd(cmpeq, result);
} 

which compiles to the next

f(double __vector(4)):
    vsqrtpd  ymm1, ymm0
    vcmpgtpd ymm0, ymm0, YMMWORD PTR .LC0[rip]
    vmulpd   ymm0, ymm1, ymm0
    vcmpeqpd ymm1, ymm0, ymm0
    vandpd   ymm0, ymm1, ymm0
    ret
.LC0:
    .long   2296604913
    .long   1055193269
    .long   2296604913
    .long   1055193269
    .long   2296604913
    .long   1055193269
    .long   2296604913
    .long   1055193269
+4
source share
1

AVX, , . , , .

, 1s ( TRUE) 0s ( FALSE). , ANDing mask vsqrtpd. 0.0 IEEE-754 - 0s.

, , . , , NaN ( NaN FALSE), "O". , NaN ( , , ), "Q". , _CMP_GT_OQ.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <immintrin.h>

__m256d f (__m256d a)
{
   double em5 = 1e-5;
   __m256d v_em5 = _mm256_broadcast_sd (&em5);
   __m256d v_sqrt = _mm256_sqrt_pd (a);
   __m256d v_mask = _mm256_cmp_pd (a, v_em5, _CMP_GT_OQ);
   __m256d v_res = _mm256_and_pd (v_sqrt, v_mask);
   return v_res;
}

int main (void)
{
    __m256d arg, res;
    double args[4] = {2e-5, sqrt(-1.0), 1e-6, -1.0};
    double ress [4] = {0};

    memcpy (&arg, args, sizeof(arg));
    res = f (arg);
    memcpy (ress, &res, sizeof(res));

    printf ("args = % 23.16e  % 23.16e  % 23.16e  % 23.16e\n", 
            args[0], args[1], args[2], args[3]);
    printf ("ress = % 23.16e  % 23.16e  % 23.16e  % 23.16e\n", 
            ress[0], ress[1], ress[2], ress[3]);
    return EXIT_SUCCESS;
}

Intel C, :

args =  2.0000000000000002e-005  -1.#IND000000000000e+000   9.9999999999999995e-007  -1.0000000000000000e+000
ress =  4.4721359549995798e-003   0.0000000000000000e+000   0.0000000000000000e+000   0.0000000000000000e+000

1.#IND000000000000e+000 - QNaN, INDEFINITE.

+5

Source: https://habr.com/ru/post/1629417/


All Articles