I am trying to come up with a very fast threshold algorithm using SSE to replace this:
uint8_t *pSrc, *pDst; // Assume pSrc and pDst point to valid data // Handle left edge *pDst++ = *pSrc++; // Likeness filter for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) if ((*pDst - *pSrc) * (*pDst - *pSrc) > 100 /*THRESHOLD_SQUARED*/) { *pDst = *pSrc; } } // Handle right edge *pDst++ = *pSrc++;
So far I have this:
const uint8_t THRESHOLD = 10; __attribute__((aligned (16))) static const uint8_t mask[16] = { THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD }; __m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9; xmm1 = _mm_load_si128((__m128i const *)mask); xmm6 = _mm_setzero_si128(); uint8_t *pSrc, *pDst; // Assume pSrc and pDst point to valid data // I have other code with another mask for the first 16 entries for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) { xmm3 = _mm_load_si128((__m128i const *)pDst); xmm4 = _mm_load_si128((__m128i const *)pSrc); xmm5 = _mm_unpacklo_epi8(xmm3, xmm6); xmm7 = _mm_unpackhi_epi8(xmm3, xmm6); xmm8 = _mm_unpacklo_epi8(xmm4, xmm6); xmm9 = _mm_unpackhi_epi8(xmm4, xmm6); xmm5 = _mm_sub_epi16(xmm5, xmm8); xmm7 = _mm_sub_epi16(xmm7, xmm9); xmm5 = _mm_abs_epi16(xmm5); xmm7 = _mm_abs_epi16(xmm7); xmm5 = _mm_packs_epi16(xmm5, xmm7); xmm5 = _mm_cmpgt_epi8(xmm5, xmm1); xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5); _mm_store_si128((__m128i *)pDst, xmm3); } // I have other code with another mask for the last 16 entries
I have an idea for a different algorithm that computes the absolute value of the difference between the two values while staying in the unsigned 8-bit (uchar) domain:
a' = a >> 1; b' = b >> 1; diff = (abs(a' - b') << 1) + ((a ^ b) & 1);
It will take 8 SSE instructions instead of the 9 above (not counting any additional register moves the compiler generates), but I'm not sure whether it is actually faster because of instruction latencies and dependency chains.
Does any other SSE expert have better suggestions (using up to SSE 4.2)?
Update 1 — thanks to Yves' suggestion!
const uint8_t THRESHOLD = 10; __attribute__((aligned (16))) static const uint8_t mask[16] = { THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD }; __m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7; xmm1 = _mm_load_si128((__m128i const *)mask); xmm6 = _mm_setzero_si128(); uint8_t *pSrc, *pDst; // Assume pSrc and pDst point to valid data // I have other code with another mask for the first 16 entries for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) { xmm3 = _mm_load_si128((__m128i const *)pDst); xmm4 = _mm_load_si128((__m128i const *)pSrc); xmm5 = _mm_subs_epu8(xmm3, xmm4); xmm7 = _mm_subs_epu8(xmm4, xmm3); xmm5 = _mm_adds_epu8(xmm5, xmm7); xmm5 = _mm_subs_epu8(xmm5, xmm1); xmm5 = _mm_cmpeq_epi8(xmm5, xmm6); xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5); _mm_store_si128((__m128i *)pDst, xmm4); } // I have other code with another mask for the last 16 entries