AVX2 Winner-Take-All Disparity Search

I am optimizing the winner-take-all (WTA) part of a stereo disparity algorithm using AVX2. My scalar routine is accurate, but at QVGA resolution with 48 disparity levels the runtime is disappointingly slow at ~14 ms on my laptop. I create both LR and RL disparity images, but for simplicity I will only include the code for the RL search here.

My scalar procedure:

/* Scalar winner-take-all "RL" search: for every pixel, scan all candidate
 * disparities in the DSI row and record the one with the lowest cost. */
int MAXCOST = 32000;
for (int i = maskRadius; i < rstep-maskRadius; i++) {

    // WTA "RL" Search:
    const int rowBase = i * numDisp * cstep;   // start of row i's W x D cost slice
    for (int j = maskRadius; j+maskRadius < cstep; j++) {
        int bestCost = MAXCOST;   // lowest cost seen so far for this pixel
        int bestDisp = 0;         // disparity that produced it (0 if none beats MAXCOST)
        for (int d = 0; d < numDisp && j+d < cstep; d++) {
            const int cost = asPtr[rowBase + d*cstep + j];
            if (cost < bestCost) {
                bestCost = cost;
                bestDisp = d;
            }
        }
        dRPtr[i*cstep + j] = bestDisp;
    }
}

My attempt to use AVX2:

int MAXCOST = 32000;
// Scratch buffer for one vector's worth of winning disparities.
// (maskstore does not require alignment, but 32-byte alignment is harmless.)
int* dispVals = (int*) _mm_malloc( sizeof(int32_t)*16, 32 );

for (int i = maskRadius; i < rstep-maskRadius; i++) {

    // WTA "RL" Search AVX2: 16 pixels (16-bit costs) per iteration.
    // NOTE(review): columns from the last multiple of 16 up to cstep-1 are
    // not covered by this loop; they need a scalar tail (as does matching
    // the scalar version's maskRadius border handling).
    for( int j = 0; j < cstep-16; j+=16) {

        __m256i minCosts = _mm256_set1_epi16( MAXCOST );

        // maskstore leaves unmasked lanes UNTOUCHED, so pre-initialize the
        // buffer to disparity 0 (the scalar code's default); otherwise lanes
        // whose minimum never improves keep stale values from the last pixel.
        for( int k = 0; k < 16; k++ ) {
            dispVals[k] = 0;
        }

        for (int d = 0; d < numDisp && j+d < cstep; d++) {
            // Grab 16 costs to compare.
            // BUG FIX: the original passed asPtr[...] (a cost VALUE) to the
            // load, which dereferenced the cost as an address -> the runtime
            // error. Take the ADDRESS of the element instead.
            __m256i costs = _mm256_loadu_si256((const __m256i*) &asPtr[(i*numDisp*cstep)+(d*cstep)+j]);

            // Get the new minimums (unsigned 16-bit min; costs stay below
            // 32000 < 32768, so the signed compare below is also valid).
            __m256i newMinCosts = _mm256_min_epu16( minCosts, costs );

            // Compare new mins to old to build a mask of improved lanes.
            // (cast is free for the low half; no extract needed there.)
            __m256i mask   = _mm256_cmpgt_epi16( minCosts, newMinCosts );
            __m128i loMask = _mm256_castsi256_si128( mask );
            __m128i hiMask = _mm256_extracti128_si256( mask, 1 );
            // Sign extend the 16-bit mask lanes to 32 bits for maskstore.
            __m256i loMask32 = _mm256_cvtepi16_epi32( loMask );
            __m256i hiMask32 = _mm256_cvtepi16_epi32( hiMask );

            __m256i currentDisp = _mm256_set1_epi32( d );
            // Store the current disparity only into the improved lanes.
            _mm256_maskstore_epi32( dispVals, loMask32, currentDisp );
            _mm256_maskstore_epi32( dispVals+8, hiMask32, currentDisp );

            // Carry the running minimums into the next disparity.
            minCosts = newMinCosts;
        }

        // Write the WTA minimums one-by-one to the RL disparity image.
        int index = (i*cstep)+j;
        for( int k = 0; k < 16; k++ ) {
            dRPtr[index+k] = dispVals[k];
        }
    }
}
_mm_free( dispVals );

The disparity space image (DSI) has a size of HxWxD (320x240x48), which I lay out so that each image row is stored contiguously as a WxD slab, for better memory access.

. , , , , , 3x3 5x5. "". asPtr, .

, , . . - , , . , , , , , 16 (, QVGA: 320x240), SIMD ( ).

, , , OpenCV-. .

, . VS 2012 Express Update 4. , - . , , , , __m256i ..

, ~ 14 ~ 8, . - i7-4980HQ, AVX2 .

Info image

+4
2

, , , , . _mm_malloc. , . ( , 32- ?)

Another likely problem is that dispVals is never initialized, so lanes whose mask never becomes all-ones keep garbage (or stale values from the previous pixel group). (_mm256_maskstore_epi32 is not a read-modify-write that fills in a default; unmasked lanes are simply left untouched, so either pre-zero the buffer or use an all-ones mask when every lane must be written.)

, . " " .

Also note that _mm_set1* of a loop variable is not expensive: VPBROADCASTD can broadcast from a GP register (via movd), so recreating the constant each iteration is cheap. Still, it can be avoided entirely by keeping the current disparity in a vector register and incrementing it, e.g.

// Sketch (the for-header is pseudocode): replace per-iteration
// _mm_set1_epi32(d) with a vector counter bumped by a hoisted constant.
const __m256i add1 = _mm256_set1_epi32( 1 );
__m256i dvec = _mm256_setzero_si256();
for (d;d...;d++) {
    dvec = _mm256_add_epi32(dvec, add1);  // all lanes of dvec track d
}

:
The main suggestion: instead of masked stores to memory, keep the running winning-disparity vector in a register and update it with a blend (_mm256_blendv_epi8) controlled by the comparison mask. A blend is effectively a masked move between registers, which avoids the store/reload round trip through dispVals entirely.

Also, stay in 16-bit lanes throughout instead of sign-extending to 32-bit: you process twice as many pixels per vector and skip the extract/convert work. Intel CPUs handle 16-bit GP loads fine (movzx or a plain mov), so make dRPtr uint16_t if possible (sign-extend at the end only if you can't!). Note too that _mm256_extracti128_si256( mask, 0 ) is redundant: a cast yields the low 128 bits for free, and vpmovsx* can take its source straight from a register or memory.

( uop ), . ( , vmovdqu vpminuw , ).

, :

// totally untested, didn't even check that this compiles.
for(i) { for(j) {
// inner loop, compiler can hoist these constants.
const __m256i add1 = _mm256_set1_epi16( 1 );
__m256i dvec = _mm256_setzero_si256();
__m256i minCosts = _mm256_set1_epi16( MAXCOST );
__m256i minDisps = _mm256_setzero_si256();

for (int d=0 ; d < numDisp && j+d < cstep ;
     d++, dvec = _mm256_add_epi16(dvec, add1))
{
    __m256i newMinCosts = _mm256_min_epu16( minCosts, asPtr[(i*numDisp*cstep)+(d*cstep)+j]) );
    __m256i mask   = _mm256_cmpgt_epi16( minCosts, newMinCosts );
    minDisps = _mm256_blendv_epi8(minDisps, dvec, mask); // 2 uops, latency=2
    minCosts = newMinCosts;
}

// put sign extension here if making dRPtr uint16_t isn't an option.
int index = (i*cstep)+j;
_mm256_storeu_si256 (dRPtr + index, __m256i minDisps);
}}

To hide latency, unroll with two accumulator pairs — minCosts0/minDisps0 and minCosts1/minDisps1 — processed in parallel, combining the winners at the end (about 5 extra instructions: a vpadd to offset the second accumulator's disparities, then a min and a compare/blend). The loop body is about 6 uops (blendv is 2), so roughly 1.5 cycles per iteration on Haswell, while the loop-carried dependency chain is 2 cycles — two accumulators keep the execution ports busy. (This assumes the data is hot in cache; otherwise memory bandwidth dominates anyway.)

, ,

  • pminuw runs on p1/p5 (the load runs on p2/p3)
  • pcmpgtw runs on p1/p5
  • vpblendvb is 2 uops, both for p5
  • paddw runs on p1/p5
  • movdqa reg,reg runs on p0/p1/p5 (or is eliminated at register rename). Unrolling can remove the minCosts = newMinCosts copy by consuming newMinCosts directly in the next iteration.
  • the macro-fused cmp/jcc loop branch runs on p6. (Ending the loop with PTEST + jcc on dvec would cost extra uops.) Plain add/sub runs on p0/p1/p5/p6 when not fused with a jcc.

, 2,5 , , p1/p5. 2 4 / movdqa. Haswell 4 , uops , - . (48 .) uops CPU - ..

_mm256_min_epu16 (pminuw) - . 3 4 . , , op - , , .

(AVX ). , 4 /, , , .

/ insn.

+2

, , . , ..

, :

int MAXCOST = 32000, numDispXcstep = numDisp*cstep;
for (int i = maskRadius; i < rstep - maskRadius; i+=numDispXcstep) {
    for (int j = maskRadius; j < cstep - maskRadius; j++) {
        int minCost = MAXCOST, minDisp = 0;
        for (int d = 0; d < numDispXcstep - j; d+=cstep) {
            if (asPtr[i+j+d] < minCost) {
                minCost = asPtr[i+j+d];
                minDisp = d;
            }
        }
        dRPtr[i/numDisp+j] = minDisp;
    }
}

Note (review): this restructured version looks wrong as posted — the "i" loop mixes a row index with a flat array offset (it steps by numDispXcstep but compares against rstep), and the stored "d" is an element offset (a multiple of cstep) rather than a disparity index, so the written disparity values are incorrect. The idea of hoisting the multiplications is sound, but the indexing needs to be fixed.

+2

Source: https://habr.com/ru/post/1589113/


All Articles