I am trying to learn how to use SSE, one of the programs that I required to use modular division, and so I wrote this to do this (sorry this exceeded):
__m128i SSEModDiv(__m128i input, __m128i divisors)
{
//Error Checking (div by zero)
/*__m128i zeros = _mm_set1_epi32(0);
__m128i error = _mm_set1_epi32(-1);
__m128i zerocheck = _mm_cmpeq_epi32(zeros, divisors);
if (_mm_extract_epi16(zerocheck, 0) != 0)
return error;
if (_mm_extract_epi16(zerocheck, 2) != 0)
return error;
if (_mm_extract_epi16(zerocheck, 4) != 0)
return error;
if (_mm_extract_epi16(zerocheck, 6) != 0)
return error;*/
//Now for the real work
__m128 inputf = _mm_cvtepi32_ps(input);
__m128 divisorsf = _mm_cvtepi32_ps(divisors);
/*__m128 recip = _mm_rcp_ps(divisorsf); //Takes reciprocal
__m128 divided = _mm_mul_ps(inputf, recip); //multiplies by reciprical values*/
__m128 divided = _mm_div_ps(inputf, divisorsf);
__m128i intermediateint = _mm_cvttps_epi32(divided); //makes an integer version truncated
__m128 intermediate = _mm_cvtepi32_ps(intermediateint);
__m128 multiplied = _mm_mul_ps(intermediate, divisorsf); //multiplies the intermediate with the divisors
__m128 mods = _mm_sub_ps(inputf, multiplied); //subtracts to get moduli
return _mm_cvtps_epi32(mods);
}
The problem is that it is about as fast as the module of each of the four 32-bit integers individually in the release and about 10 times slower in debugging (found by profiling).
Can someone tell me how to make this function faster?
-I cannot use SVML because I use Visual Studio -
source
share