I am trying to come up with a very fast threshold algorithm using SSE to replace this:
uint8_t *pSrc, *pDst; // Assume pSrc and pDst point to valid data // Handle left edge *pDst++ = *pSrc++; // Likeness filter for (uint32_t k = 2; k < width; k++, pSrc++, pDst++) if ((*pDst - *pSrc) * (*pDst - *pSrc) > 100 /*THRESHOLD_SQUARED*/) { *pDst = *pSrc; } } // Handle right edge *pDst++ = *pSrc++;
So far I have this:
const uint8_t THRESHOLD = 10; __attribute__((aligned (16))) static const uint8_t mask[16] = { THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD }; __m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7, xmm8, xmm9; xmm1 = _mm_load_si128((__m128i const *)mask); xmm6 = _mm_setzero_si128(); uint8_t *pSrc, *pDst; // Assume pSrc and pDst point to valid data // I have other code with another mask for the first 16 entries for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) { xmm3 = _mm_load_si128((__m128i const *)pDst); xmm4 = _mm_load_si128((__m128i const *)pSrc); xmm5 = _mm_unpacklo_epi8(xmm3, xmm6); xmm7 = _mm_unpackhi_epi8(xmm3, xmm6); xmm8 = _mm_unpacklo_epi8(xmm4, xmm6); xmm9 = _mm_unpackhi_epi8(xmm4, xmm6); xmm5 = _mm_sub_epi16(xmm5, xmm8); xmm7 = _mm_sub_epi16(xmm7, xmm9); xmm5 = _mm_abs_epi16(xmm5); xmm7 = _mm_abs_epi16(xmm7); xmm5 = _mm_packs_epi16(xmm5, xmm7); xmm5 = _mm_cmpgt_epi8(xmm5, xmm1); xmm3 = _mm_blendv_epi8(xmm3, xmm4, xmm5); _mm_store_si128((__m128i *)pDst, xmm3); } // I have other code with another mask for the last 16 entries
I have an idea for a different algorithm that computes the absolute value of the difference between the two values while staying in the unsigned 8-bit (uchar) domain:
a' = a >> 1; b' = b >> 1; diff = (abs(a' - b') << 1) + ((a ^ b) & 1);
It will take 8 SSE instructions instead of the 9 above (not counting any additional register moves the compiler generates), but I'm not sure whether it is actually faster because of instruction latencies and dependency chains.
Does any other SSE expert have better suggestions (using up to SSE 4.2)?
Update 1 — thanks to Yves' suggestion!
const uint8_t THRESHOLD = 10; __attribute__((aligned (16))) static const uint8_t mask[16] = { THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD, THRESHOLD }; __m128i xmm1, xmm3, xmm4, xmm5, xmm6, xmm7; xmm1 = _mm_load_si128((__m128i const *)mask); xmm6 = _mm_setzero_si128(); uint8_t *pSrc, *pDst; // Assume pSrc and pDst point to valid data // I have other code with another mask for the first 16 entries for (uint32_t k = 16; k < (width - 16); k += 16, pSrc += 16, pDst += 16) { xmm3 = _mm_load_si128((__m128i const *)pDst); xmm4 = _mm_load_si128((__m128i const *)pSrc); xmm5 = _mm_subs_epu8(xmm3, xmm4); xmm7 = _mm_subs_epu8(xmm4, xmm3); xmm5 = _mm_adds_epu8(xmm5, xmm7); xmm5 = _mm_subs_epu8(xmm5, xmm1); xmm5 = _mm_cmpeq_epi8(xmm5, xmm6); xmm4 = _mm_blendv_epi8(xmm4, xmm3, xmm5); _mm_store_si128((__m128i *)pDst, xmm4); } // I have other code with another mask for the last 16 entries