First, make sure your buffers are aligned, e.g. allocated with _mm_malloc, so you can use aligned loads and stores. (Or are they already 32-byte aligned?)
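For example, a minimal sketch of a 32-byte-aligned allocation for the cost volume (alloc_cost_volume and rows are placeholder names, not from your code):

#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

// 32-byte alignment lets 256-bit loads/stores on the cost volume be aligned.
static uint16_t *alloc_cost_volume(size_t rows, size_t numDisp, size_t cstep)
{
    return (uint16_t *)_mm_malloc(rows * numDisp * cstep * sizeof(uint16_t), 32);
}
// Memory from _mm_malloc must be released with _mm_free(), not free().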
Second, don't use a masked store for dispVals. (_mm256_maskstore_epi32 is handled like a read-modify-write of the destination, and it is slow even when the mask is all-ones.) A plain store does the same job.
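For instance (dispPtr is a placeholder for wherever dispVals was being written):

// Slow even with an all-ones mask:
// _mm256_maskstore_epi32(dispPtr, _mm256_set1_epi32(-1), dispVals);

// A normal (unaligned) 256-bit store instead:
_mm256_storeu_si256((__m256i *)dispPtr, dispVals);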
Third, look at how the vector of disparity indices gets generated every iteration; the "obvious" way is more expensive than it looks.
Don't call _mm_set1* on the loop counter inside the inner loop. AVX2's VPBROADCASTD can't broadcast straight from a GP register, so the compiler has to emit a movd to get the value out of the GP register into a vector register before it can broadcast, every single iteration. It is cheaper to keep a vector of indices and bump it with a vector add:
const __m256i add1 = _mm256_set1_epi32( 1 );
__m256i dvec = _mm256_setzero_si256();
for (int d = 0; d < numDisp; d++) {
    // ... use dvec wherever a vector of the current d is needed ...
    dvec = _mm256_add_epi32(dvec, add1);   // one cheap vector add instead of movd + vpbroadcastd
}
Now, how to keep track of which disparity produced the minimum: use a blend (_mm256_blendv_epi8) to conditionally copy the current disparity vector into the running result, in exactly those elements (lanes) where the compare mask says a new minimum appeared. A blend between registers is effectively a masked move.
Your costs are 16-bit, so work with 16-bit elements rather than 32-bit: each vector then holds 16 results instead of 8. Intel CPUs handle 16-bit values in GP registers just fine (movzx/movsx instead of a plain mov), so there is no penalty for making dRPtr a uint16_t pointer. You couldn't maskstore 16-bit elements anyway (maskstore only exists for 32- and 64-bit element sizes!). If something downstream really needs 32-bit values (the disparities, or even the compare mask), widen at the very end: _mm256_extracti128_si256( mask, 0 ) and ( mask, 1 ) pull out the low and high 128-bit halves, and vpmovsx/vpmovzx widen them to 32-bit, taking a register source as well as memory.
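A sketch of that widening, assuming minDisps and index are the result vector and output offset from the loop below, and dRPtr32 is a hypothetical int32_t output pointer (zero-extension is fine because disparities are non-negative; use _mm256_cvtepi16_epi32 instead if you need sign extension, e.g. for a mask):

__m128i lo = _mm256_extracti128_si256(minDisps, 0);   // low 8 x uint16
__m128i hi = _mm256_extracti128_si256(minDisps, 1);   // high 8 x uint16
__m256i lo32 = _mm256_cvtepu16_epi32(lo);             // vpmovzxwd: widen to 8 x int32
__m256i hi32 = _mm256_cvtepu16_epi32(hi);
_mm256_storeu_si256((__m256i *)(dRPtr32 + index),     lo32);
_mm256_storeu_si256((__m256i *)(dRPtr32 + index + 8), hi32);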
(Letting the compiler fold the load into the min as a memory operand can save a uop compared to a separate load instruction, and with AVX an unaligned memory operand is allowed, so vpminuw doesn't need a separate vmovdqu in front of it the way legacy SSE would.)
Putting it together, the inner loops look something like this:
for (i...) {
  for (j...; j += 16) {      // 16 uint16_t outputs per inner iteration
    const __m256i add1 = _mm256_set1_epi16( 1 );
    __m256i dvec = _mm256_setzero_si256();
    __m256i minCosts = _mm256_set1_epi16( MAXCOST );
    __m256i minDisps = _mm256_setzero_si256();
    for (int d = 0; d < numDisp && j+d < cstep;
         d++, dvec = _mm256_add_epi16(dvec, add1))
    {
        __m256i costs = _mm256_loadu_si256((const __m256i *)&asPtr[(i*numDisp*cstep) + (d*cstep) + j]);
        __m256i newMinCosts = _mm256_min_epu16( minCosts, costs );   // load folds into vpminuw
        __m256i mask = _mm256_cmpgt_epi16( minCosts, newMinCosts );  // lanes with a new minimum (signed compare: assumes costs < 0x8000)
        minDisps = _mm256_blendv_epi8(minDisps, dvec, mask);         // record d in those lanes
        minCosts = newMinCosts;
    }
    int index = (i*cstep) + j;
    _mm256_storeu_si256((__m256i *)(dRPtr + index), minDisps);       // dRPtr is uint16_t*
  }
}
You can go faster by unrolling with two accumulators, minCosts0/minDisps0 and minCosts1/minDisps1, combined after the loop (see the sketch below). The loop-carried dependency chains are short: minCosts only passes through the 1-cycle min, and minDisps through the 2-cycle blendv. The body is about 5 instructions (not counting the vpadd, which is off the critical path), 6 fused-domain uops since blendv is 2, so the front end could issue roughly one iteration per 1.5 cycles on Haswell while the dep chain is only 2 cycles; two accumulators give the machine enough independent work to keep up.
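A sketch of that unrolled inner loop, with the same indexing as above (it assumes numDisp is even, costs below 0x8000 so the signed compare is still valid, and drops the j+d < cstep guard for brevity):

__m256i minCosts0 = _mm256_set1_epi16( MAXCOST ), minDisps0 = _mm256_setzero_si256();
__m256i minCosts1 = _mm256_set1_epi16( MAXCOST ), minDisps1 = _mm256_setzero_si256();
__m256i dvec0 = _mm256_setzero_si256();     // disparities d
__m256i dvec1 = _mm256_set1_epi16( 1 );     // disparities d+1
const __m256i add2 = _mm256_set1_epi16( 2 );
for (int d = 0; d < numDisp; d += 2) {
    __m256i c0 = _mm256_loadu_si256((const __m256i *)&asPtr[(i*numDisp*cstep) + ( d   *cstep) + j]);
    __m256i c1 = _mm256_loadu_si256((const __m256i *)&asPtr[(i*numDisp*cstep) + ((d+1)*cstep) + j]);
    __m256i new0 = _mm256_min_epu16(minCosts0, c0);
    __m256i new1 = _mm256_min_epu16(minCosts1, c1);
    minDisps0 = _mm256_blendv_epi8(minDisps0, dvec0, _mm256_cmpgt_epi16(minCosts0, new0));
    minDisps1 = _mm256_blendv_epi8(minDisps1, dvec1, _mm256_cmpgt_epi16(minCosts1, new1));
    minCosts0 = new0;
    minCosts1 = new1;
    dvec0 = _mm256_add_epi16(dvec0, add2);
    dvec1 = _mm256_add_epi16(dvec1, add2);
}
// Merge the two accumulators: keep the disparity of whichever had the lower cost.
__m256i takeFrom1 = _mm256_cmpgt_epi16(minCosts0, minCosts1);
__m256i minDisps  = _mm256_blendv_epi8(minDisps0, minDisps1, takeFrom1);
__m256i minCosts  = _mm256_min_epu16(minCosts0, minCosts1);   // final costs, if needed
// store minDisps exactly as before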
As for where the time actually goes, the per-iteration uop/port breakdown on Haswell is roughly:
- pminuw: p1/p5 (plus a load on p2/p3).
- pcmpgtw: p1/p5.
- vpblendvb: 2 uops, both for p5.
- paddw: p1/p5.
- movdqa reg,reg: p0/p1/p5 (and is often eliminated at register rename). Unrolling gets rid of the minCosts = newMinCosts copy anyway, since the next iteration can use newMinCosts directly.
- fused sub/jge for the loop counter: p6. (Ending the loop with PTEST + jcc on dvec would cost extra uops.) Plain add/sub can run on p0/p1/p5/p6, but not when macro-fused with a jcc.
So that is about 2.5 cycles per iteration of vector work, bottlenecked on p1/p5: five of those uops need that pair of ports (and both blendv uops need p5 specifically), and 5 uops across 2 ports is 2.5 cycles. On top of that come another 2 to 4 uops of loop overhead plus the movdqa. Haswell issues at most 4 fused-domain uops per clock, so the front end is close to being a limit as well.
_mm256_min_epu16 (pminuw) is a big part of the win here: it replaces 3 or 4 scalar operations per element with a single 1-cycle op, because the compare and the conditional update of the running minimum happen inside one instruction.
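For comparison, this is the scalar per-element work it replaces (same indexing as the vector loop; one output element per pass instead of 16):

uint16_t minCost = MAXCOST;
uint16_t minDisp = 0;
for (int d = 0; d < numDisp && j+d < cstep; d++) {
    uint16_t c = asPtr[(i*numDisp*cstep) + (d*cstep) + j];
    if (c < minCost) {        // compare + two conditional updates (branch or cmov)
        minCost = c;
        minDisp = (uint16_t)d;
    }
}
dRPtr[(i*cstep) + j] = minDisp;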
(All of this assumes AVX2 for the 256-bit integer ops.) In practice, expect on the order of 4 results per clock, i.e. a few cycles per vector of 16 outputs, which is still a very large speedup over the scalar version.
(Uop counts, ports, and latencies above are from Agner Fog's insn tables: http://agner.org/optimize/.)