Comparing two pairs of 4 variables and returning the number of matches?

Given the following structure:

struct four_points {
    uint32_t a, b, c, d;
};

What would be the fastest way to compare two such structures and return the number of variables that match (in any position)?

For example:

four_points s1 = {0, 1, 2, 3};
four_points s2 = {1, 2, 3, 4};

I would expect a result of 3, since three of the numbers match between the two structures. However, given the following:

four_points s1 = {1, 0, 2, 0};
four_points s2 = {0, 1, 9, 7};

Then I would expect a result of only 2, because only two values are shared between the two structures (despite there being two zeros in the first).
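
A brute-force reference that pins the rule down — a value counts once no matter how often it repeats in either struct (semantics only, nothing fast; match4_ref is just a placeholder name):

#include <stdint.h>

typedef struct four_points { uint32_t a, b, c, d; } four_points;

// count distinct values common to both structs
static int match4_ref(const four_points *s1p, const four_points *s2p)
{
    const uint32_t *s1 = &s1p->a, *s2 = &s2p->a;  // treat the members as an array
    int matches = 0;
    for (int i = 0; i < 4; i++) {
        int dup = 0, hit = 0;
        for (int j = 0; j < i; j++) dup |= (s1[i] == s1[j]);  // repeats an earlier s1 element?
        for (int j = 0; j < 4; j++) hit |= (s1[i] == s2[j]);  // present anywhere in s2?
        matches += hit & !dup;
    }
    return matches;
}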

I have worked out a few basic ways of doing the comparison, but this will be called a few million times in a short span of time and needs to be relatively fast. My current best attempt is to use a sorting network to sort all four values of each input, then loop over the sorted values, tallying equal values and advancing the index of one input or the other accordingly.

Is there any technique that can do better than my sort-and-iterate approach?


The duplicate rule is what makes this interesting: a naive count over all 16 element pairs would give 3 for your second example, because s2's 0 matches two positions of s1. So the raw compare results have to be collapsed so that each value only counts once.

Branching on any of this is a bad idea: with arbitrary input data, a match at any given position is close to a coin flip, so branches would mispredict constantly. The plan instead is to compute every compare result unconditionally and combine them with bitwise ops.

First a portable branchless scalar version as a baseline, then SIMD versions that do the whole job in a handful of vector instructions.


The scalar version compares everything and merges duplicate matches with OR instead of addition, so a value repeated in s1 can't be counted twice. It's plain portable C; I checked the x86 asm gcc produces for it (other compilers and architectures should handle it fine, too).

Code and asm are on godbolt, with a simple test harness. (Targeting 32-bit x86, g++ materializes the 8-bit compare results with setcc; the whole function comes to roughly 24 instructions. I looked at output from gcc 4.9.2 and from clang on godbolt.)

// 8-bit types used because the x86 setcc instruction only sets the low 8 bits
// of a register, leaving the other bits unmodified.
// Doing a 32bit add from that creates a partial-register slowdown on Intel P6
// and Sandybridge CPU families.
// Also, compilers like to insert movzx (zero-extend) instructions
// because I guess they don't realize the previous high bits are already zero.
// (Or they're tuning for pre-Sandybridge Intel, where the stall is worse than
// SnB inserting the extra uop itself.)

// The return type is 8bit because otherwise clang decides it should generate
// things as 32bit in the first place, and does zero-extension -> 32bit adds.
int8_t match4_ordups(const four_points *s1struct, const four_points *s2struct)
{
    const int32_t *s1 = &s1struct->a; // TODO: check if this breaks aliasing rules
    const int32_t *s2 = &s2struct->a;
    // ignore duplicates by combining with OR instead of addition
    int8_t matches = 0;

    for (int j=0 ; j<4 ; j++) {
        matches |= (s1[0] == s2[j]);
    }

    for (int i=1; i<4; i++) { // i=0 iteration is broken out above
        uint32_t s1i = s1[i];

        int8_t notdup = 1; // is s1[i] a duplicate of s1[0.. i-1]?
        for (int j=0 ; j<i ; j++) {
            notdup &= (uint8_t) (s1i != s1[j]);  // like dup |= (s1i == s1[j]); but saves a NOT
        }

        int8_t mi = // match this iteration?
            (s1i == s2[0]) |
            (s1i == s2[1]) |
            (s1i == s2[2]) |
            (s1i == s2[3]);
    // gcc and clang insist on doing 3 dependent OR insns regardless of parens, not that it matters

        matches += mi & notdup;
    }
    return matches;
}

// see the godbolt link for a main() simple test harness.
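
A minimal stand-in for that harness (assumed here, not copied from godbolt):

#include <stdio.h>

int main(void)
{
    four_points s1 = {0, 1, 2, 3}, s2 = {1, 2, 3, 4};
    four_points s3 = {1, 0, 2, 0}, s4 = {0, 1, 9, 7};
    printf("%d\n", match4_ordups(&s1, &s2));   // expect 3
    printf("%d\n", match4_ordups(&s3, &s4));   // expect 2
    return 0;
}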

With a SIMD instruction set that has 128b vectors holding four 32-bit elements (e.g. x86 with SSE2), you could broadcast each element of s1 to its own vector and compare each of those against s2, getting all 16 element-pair results from 4 packed compares. icc does something like that to autovectorize match4_ordups (check it on godbolt).
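
Hand-rolled with intrinsics, the broadcast idea looks something like this (a sketch, not icc's actual output; match4_bcast is a made-up name, and it miscounts s2 duplicates just like match4_sse_noS2dup below):

#include <immintrin.h>

static inline int match4_bcast(const four_points *s1p, const four_points *s2p)
{
    __m128i s1 = _mm_loadu_si128((const __m128i*)s1p);
    __m128i s2 = _mm_loadu_si128((const __m128i*)s2p);
    // broadcast each element of s1 and compare it against all four elements of s2;
    // ORing leaves element j of m saying "s2[j] matched something in s1"
    __m128i m =         _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(0,0,0,0)), s2);
    m = _mm_or_si128(m, _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(1,1,1,1)), s2));
    m = _mm_or_si128(m, _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(2,2,2,2)), s2));
    m = _mm_or_si128(m, _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(3,3,3,3)), s2));
    int matchmask = _mm_movemask_ps(_mm_castsi128_ps(m));  // 1 bit per s2 element
    return _mm_popcnt_u32(matchmask);
}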

Then one movemask of the combined compare results gives a bitmap to popcount, as in the functions below. But broadcasting costs a shuffle per element; rotating s1 instead needs only 3 shuffles, because the first compare can use the two vectors exactly as loaded.


Compare s1 against s2 as loaded, then each of the three rotations of s1 against s2:

{ 1d 1c 1b 1a }
  == == == ==   packed-compare with
{ 2d 2c 2b 2a }

{ 1a 1d 1c 1b }
  == == == ==   packed-compare with
{ 2d 2c 2b 2a }

{ 1b 1a 1d 1c }  # if dups didn't matter: do this shuffle on s2
  == == == ==   packed-compare with
{ 2d 2c 2b 2a }

{ 1c 1b 1a 1d } # if dups didn't matter: this result from { 1a ... }
  == == == ==   packed-compare with
{ 2d 2c 2b 2a }                                           { 2b ...

That's 3 shuffles and 4 packed compares to cover all 16 pairings. The results get ORed together as they're produced: a packed-compare element is all-ones (= -1) on a match and all-zeros otherwise, so OR leaves a single flag per s2 position no matter how many elements of s1 matched it. XOR would cancel out a pair of duplicate matches, and addition (v1 += v2) would count them twice; both are exactly the wrong behavior here.

All 16 cross-comparisons really are needed in the general case: a matching value can sit at any position of either struct, and with nothing known about the inputs ahead of time, no pair can be ruled out. With extra constraints (sorted elements, a small value range) there would be shortcuts, but in general it's 16 compares.

After the ORs, each 32-bit element of the result says whether that element of s2 matched anything in s1. int _mm_movemask_ps (__m128 a) packs the high bit of each 32-bit element into a 4-bit integer bitmap, and popcnt counts the set bits. (popcnt requires Nehalem or newer; CPUs with SSE2 but no popcnt can index a tiny lookup table with the 4-bit mask instead.)
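
That "4b lookup table" mentioned in the code comments can literally be this (a sketch):

// set-bit count of a 4-bit value, indexed by the movemask result
static const uint8_t popcount4[16] = { 0,1,1,2, 1,2,2,3, 1,2,2,3, 2,3,3,4 };
// ... then:  return popcount4[matchmask];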

Note that the ORs only collapse duplicates within s1. Duplicates within s2 each still set their own bit of the mask and get counted, so this first function is only correct when s2 is duplicate-free: with s1 = {5,6,7,8} and s2 = {5,5,1,2} it returns 2 where the rules above want 1. The fixed version comes later (see below).

#include <stdint.h>
#include <immintrin.h>

typedef struct four_points {
    int32_t a, b, c, d;
} four_points;
//typedef uint32_t four_points[4];

// small enough to inline, only 62B of x86 instructions (gcc 4.9.2)
static inline int match4_sse_noS2dup(const four_points *s1pointer, const four_points *s2pointer)
{
    __m128i s1 = _mm_loadu_si128((__m128i*)s1pointer);
    __m128i s2 = _mm_loadu_si128((__m128i*)s2pointer);
    __m128i s1b= _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 2, 1));
    // no shuffle needed for first compare
    __m128i match = _mm_cmpeq_epi32(s1 , s2);  //{s1.d==s2.d?-1:0, 1c==2c, 1b==2b, 1a==2a }
    __m128i s1c= _mm_shuffle_epi32(s1, _MM_SHUFFLE(1, 0, 3, 2));
    s1b = _mm_cmpeq_epi32(s1b, s2);
    match = _mm_or_si128(match, s1b);  // merge dups by ORing instead of adding

    // note that we shuffle the original vector every time
    // multiple short dependency chains are better than one long one.
    __m128i s1d= _mm_shuffle_epi32(s1, _MM_SHUFFLE(2, 1, 0, 3));
    s1c = _mm_cmpeq_epi32(s1c, s2);
    match = _mm_or_si128(match, s1c);
    s1d = _mm_cmpeq_epi32(s1d, s2);

    match = _mm_or_si128(match, s1d);    // match = { s2.a in s1?,  s2.b in s1?, etc. }

    // turn the high bit of each 32bit element into a bitmap of s2 elements that have matches anywhere in s1
    // use float movemask because integer movemask does 8bit elements.
    int matchmask = _mm_movemask_ps (_mm_castsi128_ps(match));

    return _mm_popcnt_u32(matchmask);  // or use a 4b lookup table for CPUs with SSE2 but not popcnt
}

Again, this over-counts when s2 contains duplicates; if your data allows that, you need the longer version further down. As its comment says, the function is tiny: gcc 4.9.2 compiles it to only 62 bytes of straight-line x86 instructions, so it's well worth inlining.

All the building blocks are cheap on CPUs where 128b vector ops are native: the shuffles, compares and ORs are each single-uop, low-latency instructions. See Agner Fog's instruction tables for per-CPU numbers; only old designs that crack 128b ops into two 64-bit halves pay significantly more.

To put a number on it, I ran it through IACA: on Haswell it should sustain one result per 4.05 cycles, with about 17 cycles of latency. Most of the work has plenty of parallelism (the shuffles and compares form several short chains); the latency is dominated by the serial tail of movmsk (2c) and popcnt (3c). Compiling with AVX also helps slightly: the 3-operand forms let gcc drop the movdqa register copies it otherwise inserts to keep s2 alive across the destructive SSE compares.

AVX2 could do two match4 operations at once in one 256b vector. Conveniently, vpshufd shuffles within each 128b lane separately, so each struct rotates independently and the same 3-shuffle / 4-compare pattern handles 2 pairs of structs at a time (or 4 with AVX-512). If your s1s and s2s are stored contiguously, a single 32B load grabs two structs. After the movemask, the low 4 bits describe one pair and the next 4 bits the other, so it ends in two masked popcnts. If the struct pairs only arrive one at a time, though, AVX2 buys very little over the 128b version.
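
A sketch of the two-at-a-time version (match4_avx2_x2 is a made-up name; it assumes the structs sit in contiguous arrays and, like match4_sse_noS2dup above, ignores duplicates within s2):

#include <immintrin.h>

static inline void match4_avx2_x2(const four_points s1[2], const four_points s2[2], int out[2])
{
    __m256i v1 = _mm256_loadu_si256((const __m256i*)s1);   // one 32B load = two structs
    __m256i v2 = _mm256_loadu_si256((const __m256i*)s2);
    // vpshufd rotates within each 128b lane, so the two structs rotate independently
    __m256i m = _mm256_cmpeq_epi32(v1, v2);
    m = _mm256_or_si256(m, _mm256_cmpeq_epi32(_mm256_shuffle_epi32(v1, _MM_SHUFFLE(0,3,2,1)), v2));
    m = _mm256_or_si256(m, _mm256_cmpeq_epi32(_mm256_shuffle_epi32(v1, _MM_SHUFFLE(1,0,3,2)), v2));
    m = _mm256_or_si256(m, _mm256_cmpeq_epi32(_mm256_shuffle_epi32(v1, _MM_SHUFFLE(2,1,0,3)), v2));
    unsigned mask = (unsigned)_mm256_movemask_ps(_mm256_castsi256_ps(m));  // 8 bits, one per element
    out[0] = _mm_popcnt_u32(mask & 0xF);   // low lane = first pair of structs
    out[1] = _mm_popcnt_u32(mask >> 4);    // high lane = second pair
}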


Getting rid of duplicates in s2

The missing piece is a mask saying, for each element of s2, whether it repeats an element earlier in s2; only first occurrences should be allowed to count. One idea is to compare s2 against shifted copies of itself:

#### comparing S2 with itself to mask off duplicates
{  0 2d 2c 2b }
{ 2d 2c 2b 2a }     == == ==

{  0  0 2d 2c }
{ 2d 2c 2b 2a }        == ==

{  0  0  0 2d }
{ 2d 2c 2b 2a }           ==

Hmm, the zeros shifted in by those byte-shifts are a problem if zero can occur as a real data value: an s2 element equal to 0 would compare equal to a shifted-in zero and be wrongly flagged as a duplicate. (The s1 side avoided this by using rotations, which introduce no new values.) SSE's PALIGNR can shift in bytes from a second register instead of zeros, taking any 16B window of two concatenated registers, but picking a safe fill value has the same problem. The fix below instead uses full shuffles, arranging the lanes so every compare is either useful or harmless.


update: I came up with a better way that avoids shifting in zeros altogether. It builds the duplicate mask from two shuffled copies of s2 compared against s2 itself, plus a little fix-up:

  • Two shuffles of s2 are each compared against s2, and the results ORed together. That directly produces b==a in the b element, c==a|c==b in the c element, and d==b|d==c in the d element.

  • The leftover compare, d==a, doesn't fit in the d element of either shuffle of s2, so both shuffles compute it down in the (otherwise useless) a element; it has to be moved up to d afterwards.

  • A byte-wise shuffle (pshufb) does that move and the cleanup at once: compare results are all-ones or all-zero, so one byte per 32bit element carries all the information. It drops the d==a byte into the d element next to d==bc, and writes zeros into the a element, since a can never duplicate an earlier element. Comparing the shuffled result against zero then yields a clean 0/-1 mask, already inverted so it can be ANDed with the match results.

Those extra ops are cheap, and crucially the s2-vs-s2 work is completely independent of the s1-vs-s2 compares, so the two dependency chains overlap nicely on an out-of-order core; the dup mask is only ANDed in at the very end, right before the movemask. (The source below is written roughly in dependency order, but compilers will schedule it themselves anyway.)

static inline
int match4_sse(const four_points *s1pointer, const four_points *s2pointer)
{
    // IACA_START
    __m128i s1 = _mm_loadu_si128((__m128i*)s1pointer);
    __m128i s2 = _mm_loadu_si128((__m128i*)s2pointer);
    // s1a = unshuffled = s1.a in the low element
    __m128i s1b= _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 2, 1));
    __m128i s1c= _mm_shuffle_epi32(s1, _MM_SHUFFLE(1, 0, 3, 2));
    __m128i s1d= _mm_shuffle_epi32(s1, _MM_SHUFFLE(2, 1, 0, 3));

    __m128i match = _mm_cmpeq_epi32(s1 , s2);  //{s1.d==s2.d?-1:0, 1c==2c, 1b==2b, 1a==2a }
    s1b = _mm_cmpeq_epi32(s1b, s2);
    match = _mm_or_si128(match, s1b);  // merge dups by ORing instead of adding

    s1c = _mm_cmpeq_epi32(s1c, s2);
    match = _mm_or_si128(match, s1c);
    s1d = _mm_cmpeq_epi32(s1d, s2);
    match = _mm_or_si128(match, s1d);
    // match = { s2.a in s1?,  s2.b in s1?, etc. }

    // s1 vs s2 all done, now prepare a mask for it based on s2 dups

/*
 * d==b   c==a   b==a  d==a   #s2b
 * d==c   c==b   b==a  d==a   #s2c
 *    OR together -> s2bc
 *  d==abc     c==ba    b==a    0  pshufb(s2bc) (packed as zero or non-zero bytes within each element)
 * !(d==abc) !(c==ba) !(b==a)  !0   pcmpeq setzero -> AND mask for s1_vs_s2 match
 */
    __m128i s2b = _mm_shuffle_epi32(s2, _MM_SHUFFLE(1, 0, 0, 3));
    __m128i s2c = _mm_shuffle_epi32(s2, _MM_SHUFFLE(2, 1, 0, 3));
    s2b = _mm_cmpeq_epi32(s2b, s2);
    s2c = _mm_cmpeq_epi32(s2c, s2);

    __m128i s2bc= _mm_or_si128(s2b, s2c);
    s2bc = _mm_shuffle_epi8(s2bc, _mm_set_epi8(-1,-1,0,12,  -1,-1,-1,8, -1,-1,-1,4,  -1,-1,-1,-1));
    __m128i dupmask = _mm_cmpeq_epi32(s2bc, _mm_setzero_si128());
    // see below for alternate insn sequences that can go here.

    match = _mm_and_si128(match, dupmask);
    // turn the high bit of each 32bit element into a bitmap of s2 matches
    // use float movemask because integer movemask does 8bit elements.
    int matchmask = _mm_movemask_ps (_mm_castsi128_ps(match));

    int ret = _mm_popcnt_u32(matchmask);  // or use a 4b lookup table for CPUs with SSE2 but not popcnt
    // IACA_END
    return ret;
}

The pshufb there requires SSSE3. For older CPUs, an alternative sequence replaces the pshufb and the compare-against-zero (a pcmpeq plus a pxor to create the zero): byte-shift the d==a flag up into the d element with bslli(s2b, 12), OR it in, and finish with ANDN instead of inverting the mask and ANDing.

d==bc  c==ab b==a a==d = s2b|s2c
d==a   0     0    0    = byte-shift-left(s2b) = s2d0
d==abc c==ab b==a a==d = s2abc
d==abc c==ab b==a 0    = mask(s2abc).  Maybe use PBLENDW or MOVSS from s2d0 (which we know has zeros) to save loading a 16B mask.

__m128i s2abcd = _mm_or_si128(s2b, s2c);
//s2bc = _mm_shuffle_epi8(s2bc, _mm_set_epi8(-1,-1,0,12,  -1,-1,-1,8, -1,-1,-1,4,  -1,-1,-1,-1));
//__m128i dupmask = _mm_cmpeq_epi32(s2bc, _mm_setzero_si128());
__m128i s2d0 = _mm_bslli_si128(s2b, 12);  // d==a  0  0  0
s2abcd = _mm_or_si128(s2abcd, s2d0);
__m128i dupmask = _mm_blend_epi16(s2abcd, s2d0, 0 | (2 | 1));
//__m128i dupmask = _mm_and_si128(s2abcd, _mm_set_epi32(-1, -1, -1, 0));

match = _mm_andnot_si128(dupmask, match);  // ~dupmask & match;  first arg is the one that's inverted

MOVSS could do that final merge instead of PBLENDW, but on AMD it runs in the FP domain, adding bypass latency between the integer ops. PBLENDW itself is SSE4.1. And note that popcnt arrived on AMD with K10, which has neither SSE4.1 nor SSSE3: a PhenomII has POPCNT but no PBLENDW and no PSHUFB. For that target, combine this sequence with the AND-against-a-constant variant (the commented-out _mm_and_si128 line above).
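
Which build to use can be picked from the compiler's target-ISA macros; a rough compile-time dispatch sketch (match4_sse2 is a hypothetical name for an AND-mask variant using the 4-bit LUT instead of popcnt):

// rough compile-time selection between the versions in this answer:
#if defined(__SSSE3__) && defined(__POPCNT__)
#  define match4 match4_sse        // pshufb + popcnt version
#elif defined(__SSE2__)
#  define match4 match4_sse2       // hypothetical: AND-mask dup handling, LUT popcount
#else
#  define match4 match4_ordups     // branchless scalar fallback
#endif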

Another option is to movemask the s2bc compare result right away and do the duplicate fix-up with scalar integer ops: a shift-and-OR moves the d==a bit into place, an AND clears the always-irrelevant low bit, and an ANDN applies the mask to the match bitmap before the popcnt. Single-instruction ANDN needs BMI1, which means Haswell or later on Intel, and even Skylake Celerons and Pentiums ship with BMI1 disabled (a silly decision, IMO: it means compilers still can't treat BMI as a baseline on brand-new CPUs).

unsigned int dupmask = _mm_movemask_ps(cast(s2bc));
dupmask |= dupmask << 3;  // bit3 = d==abc.  garbage in bits 4-6, careful if using AVX2 to do two structs at once
        // only 2 instructions.  compiler can use lea r2, [r1*8] to copy and scale
dupmask &= ~1;  // clear the low bit

unsigned int matchmask = _mm_movemask_ps(cast(match));
matchmask &= ~dupmask;   // ANDN is in BMI1 (Haswell), so this will take 2 instructions
return _mm_popcnt_u32(matchmask);

AMD's XOP has VPPERM, a byte shuffle that selects from two source registers, which could gather the flags straight out of s2b and s2c, replacing the OR and the pshufb with a single instruction.

Hmm, the pshufb approach does need its 16B shuffle-control constant loaded from memory, and the compare-against-zero needs a register zeroed with pxor. The constant can miss in D-cache the first time through; in a hot loop it stays resident, so that's a startup cost rather than a steady-state one.

Instruction mix matters as much as instruction count, too. On Intel, all the vector shuffles compete for a single shuffle port, while the compares, ORs and ANDs can spread across the other vector ALU ports, so trading a shuffle for a boolean op tends to help throughput. Zeroing a register with pxor is even recognized at register rename on Intel SnB-family and needs no execution unit at all.

To compare the variants properly, I put them in repeat loops and ran them through IACA.

The PBLENDW and PSHUFB versions come to the same size (22 instructions in the non-AVX encoding), but IACA rates the PSHUFB version slightly faster: about 7.1 cycles per iteration versus 7.4 for PBLENDW. A variant using PANDN alongside PBLENDW also came out at 7.4c. Port0 pressure looked like the limiter there, though IDK exactly why the PBLENDW versions lose.


This is probably still not optimal; there may be instructions left to shave off.

The bigger wins, though, come from exploiting how the function is called rather than from squeezing the straight-line sequence.

In particular, if you compare one s2 against many different s1 structs, all the s2 self-comparison work is loop-invariant: do it once up front and the per-s1 cost drops to the 4 compares, 3 shuffles, 3 ORs, an AND, a movemask and a popcnt. (The same applies with the roles swapped if s1 is the one held fixed.)

Some ideas for that precomputation, and for trimming the sequence further:

  • Precompute the mask of which s2 elements are first occurrences, as a 0/-1 vector, and AND (or ANDN, depending on polarity) it with each s1-vs-s2 compare result — see the sketch after this list.

  • Or movemask the s2 duplicate information once into a 4-bit integer, and AND it into each per-s1 matchmask just before the popcnt.

  • The s2 self-compares might go cheaper with a broadcast: shuffling s2.d across a vector and comparing against a rotation of s2 yields the d-vs-c,b,a results in three elements plus a useless always-true d==d in the fourth. A PTEST with _mm_setr_epi32(0, -1, -1, -1) as the mask ignores the d==d element and tests just the c,b,a results, feeding a SETCC directly with no movemask or popcount for that part. The remaining checks, (c == a | c == b) and b == a, could be done with scalar compares. Intel Haswell has 4 scalar ALU ports against 3 vector ALU ports, so moving part of the work into integer registers relieves port pressure; AMD, with its separate integer and vector schedulers, benefits even more.

  • Maybe a single cleverer shuffle of s2 could expose all the duplicate relationships at once; I don't see one. Failing that, could the whole fix-up move to scalar, with a movemask feeding a 16-entry lookup table indexed by the 4-bit mask?
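
A sketch of the first idea, hoisting the s2 work out of a loop over many s1 structs (make_s2_dupmask and match4_many are made-up names; headers as in the earlier listings, plus <stddef.h> for size_t):

static inline __m128i make_s2_dupmask(__m128i s2)
{
    // same sequence as inside match4_sse: -1 in elements of s2 that do NOT repeat an earlier one
    __m128i s2b = _mm_cmpeq_epi32(_mm_shuffle_epi32(s2, _MM_SHUFFLE(1, 0, 0, 3)), s2);
    __m128i s2c = _mm_cmpeq_epi32(_mm_shuffle_epi32(s2, _MM_SHUFFLE(2, 1, 0, 3)), s2);
    __m128i s2bc = _mm_or_si128(s2b, s2c);
    s2bc = _mm_shuffle_epi8(s2bc, _mm_set_epi8(-1,-1,0,12,  -1,-1,-1,8, -1,-1,-1,4,  -1,-1,-1,-1));
    return _mm_cmpeq_epi32(s2bc, _mm_setzero_si128());
}

// compare one s2 against n different s1 structs; the dup handling runs once, not n times
static void match4_many(const four_points *s1arr, size_t n, const four_points *s2p, int8_t *results)
{
    __m128i s2   = _mm_loadu_si128((const __m128i*)s2p);
    __m128i keep = make_s2_dupmask(s2);
    for (size_t i = 0; i < n; i++) {
        __m128i s1 = _mm_loadu_si128((const __m128i*)&s1arr[i]);
        __m128i m = _mm_cmpeq_epi32(s1, s2);
        m = _mm_or_si128(m, _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(0,3,2,1)), s2));
        m = _mm_or_si128(m, _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(1,0,3,2)), s2));
        m = _mm_or_si128(m, _mm_cmpeq_epi32(_mm_shuffle_epi32(s1, _MM_SHUFFLE(2,1,0,3)), s2));
        m = _mm_and_si128(m, keep);
        results[i] = (int8_t)_mm_popcnt_u32(_mm_movemask_ps(_mm_castsi128_ps(m)));
    }
}

With this split, the SSSE3 pshufb only runs once per s2, so the inner loop itself needs nothing beyond SSE2 (plus popcnt or the LUT).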


Source: https://habr.com/ru/post/1607269/

