, . , , .
? , . , - . , , , .
, , .
, , , OR , . x86 asm, gcc . ( , ).
godbolt. (g++ 32- x86 setcc, 8 . . , - 24 ... , gcc 4.9.2 , clang on godbolt)
int8_t match4_ordups(const four_points *s1struct, const four_points *s2struct)
{
    /* Count how many *distinct* elements of s1 appear anywhere in s2.
     * Branchless scalar reference version: every comparison result is
     * folded in with |, &, += instead of conditional jumps.
     * NOTE(review): indexing past `.a` assumes the four members are laid
     * out contiguously like an int32_t[4] — true for this struct. */
    const int32_t *a = &s1struct->a;
    const int32_t *b = &s2struct->a;
    int8_t count = 0;

    for (int i = 0; i < 4; i++) {
        uint32_t v = a[i];

        /* Suppress this element if it duplicates an earlier s1 element,
         * so each distinct value is counted at most once.  For i == 0 the
         * inner loop is empty and the flag stays 1. */
        int8_t first_occurrence = 1;
        for (int k = 0; k < i; k++) {
            first_occurrence &= (uint8_t) (v != (uint32_t) a[k]);
        }

        /* Does v appear anywhere in s2? */
        int8_t hit = (v == (uint32_t) b[0]) |
                     (v == (uint32_t) b[1]) |
                     (v == (uint32_t) b[2]) |
                     (v == (uint32_t) b[3]);
        count += hit & first_occurrence;
    }
    return count;
}
128b, 4- 32- (, x86 SSE2), s1 , 4 . icc - , autovectorize match4_ordups ( godbolt.)
movemask, , . .
: :
{ 1d 1c 1b 1a }
== == == == packed-compare with
{ 2d 2c 2b 2a }
{ 1a 1d 1c 1b }
== == == == packed-compare with
{ 2d 2c 2b 2a }
{ 1b 1a 1d 1c }
== == == == packed-compare with
{ 2d 2c 2b 2a }
{ 1c 1b 1a 1d }
== == == == packed-compare with
{ 2d 2c 2b 2a } { 2b ...
3 16 . ORs, , . - = -1 ( ) . XOR , . v1 + = v2 . .
16 , , - , . , , , , , . 16 .
. , s2 - s1. int _mm_movemask_ps (__m128 a), , . (Nehalem , popcnt, 4- .)
ORs s1, s2 . , , (. ).
#include <stdint.h>
#include <immintrin.h>
/* Four contiguous 32-bit values; the whole struct is exactly 16 bytes so
 * it can be fetched with a single 128-bit vector load. */
typedef struct four_points {
    int32_t a;  /* lowest lane when loaded as a vector */
    int32_t b;
    int32_t c;
    int32_t d;  /* highest lane */
} four_points;
/* Count how many elements of s2 equal *some* element of s1, assuming s2
 * contains no internal duplicates (each matching s2 lane counts once).
 * Three rotations of s1 make every s1 element line up against every s2
 * lane; the OR of the four packed compares leaves a lane all-ones iff
 * that s2 element matched anything in s1.
 * Requires SSE2 for the vector ops and the POPCNT instruction.
 * Unaligned loads are used, so the structs need no special alignment. */
static inline int match4_sse_noS2dup(const four_points *s1pointer, const four_points *s2pointer)
{
    /* const-correct casts: _mm_loadu_si128 takes __m128i const*, and the
     * original (__m128i*) casts silently discarded the const qualifier. */
    __m128i s1 = _mm_loadu_si128((const __m128i*)s1pointer);
    __m128i s2 = _mm_loadu_si128((const __m128i*)s2pointer);

    /* Shuffles interleaved with compares to expose instruction-level
     * parallelism; each shuffle rotates s1 by one more 32-bit lane. */
    __m128i s1b= _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 2, 1));
    __m128i match = _mm_cmpeq_epi32(s1 , s2);
    __m128i s1c= _mm_shuffle_epi32(s1, _MM_SHUFFLE(1, 0, 3, 2));
    s1b = _mm_cmpeq_epi32(s1b, s2);
    match = _mm_or_si128(match, s1b);
    __m128i s1d= _mm_shuffle_epi32(s1, _MM_SHUFFLE(2, 1, 0, 3));
    s1c = _mm_cmpeq_epi32(s1c, s2);
    match = _mm_or_si128(match, s1c);
    s1d = _mm_cmpeq_epi32(s1d, s2);
    match = _mm_or_si128(match, s1d);

    /* Compress each 32-bit lane to one bit, then count the set bits. */
    int matchmask = _mm_movemask_ps (_mm_castsi128_ps(match));
    return _mm_popcnt_u32(matchmask);
}
. , s2 . , - , , gcc , , .
, 128b . 32- , 128b. . Agner Fog. 8 , , , .
, . IACA , Haswell 4,05 17 ( , . parallelism, , movmsk (2) popcnt (3)). AVX, gcc - movdqa, .
AVX2 match4 , 256b. AVX2 128b, 256b . , 2 4 (AVX-512) , , . , s1, s2s , 32B . AVX2 128b .
s2
, s2 , .
#
{ 0 2d 2c 2b }
{ 2d 2c 2b 2a } == == ==
{ 0 0 2d 2c }
{ 2d 2c 2b 2a } == ==
{ 0 0 0 2d }
{ 2d 2c 2b 2a } ==
Hmm, , , . s1 , ), 0. (SSE PALIGNR, 16B, - , . . , .)
update: , . 6 s2 s2, , .
, OR. ( ).
S2. , d==a, .
, . Byte-wise shuffle, . ( , , . ).
- ( , ). , sin-sin sin, s1, , , . , , . ( , , - .)
/* Count how many lanes of s2 match some element of s1, but count each
 * duplicated s2 value only once: a lane is suppressed when the same value
 * already occurred in a lower lane of s2.
 * Requires SSSE3 (for pshufb) and the POPCNT instruction. */
static inline
int match4_sse(const four_points *s1pointer, const four_points *s2pointer)
{
    /* const-correct casts: _mm_loadu_si128 takes __m128i const*, and the
     * original (__m128i*) casts silently discarded the const qualifier. */
    __m128i s1 = _mm_loadu_si128((const __m128i*)s1pointer);
    __m128i s2 = _mm_loadu_si128((const __m128i*)s2pointer);

    /* Rotate s1 three times so every s1 element meets every s2 lane;
     * OR the four packed compares into one any-match mask. */
    __m128i s1b= _mm_shuffle_epi32(s1, _MM_SHUFFLE(0, 3, 2, 1));
    __m128i s1c= _mm_shuffle_epi32(s1, _MM_SHUFFLE(1, 0, 3, 2));
    __m128i s1d= _mm_shuffle_epi32(s1, _MM_SHUFFLE(2, 1, 0, 3));

    __m128i match = _mm_cmpeq_epi32(s1 , s2);
    s1b = _mm_cmpeq_epi32(s1b, s2);
    match = _mm_or_si128(match, s1b);
    s1c = _mm_cmpeq_epi32(s1c, s2);
    match = _mm_or_si128(match, s1c);
    s1d = _mm_cmpeq_epi32(s1d, s2);
    match = _mm_or_si128(match, s1d);

    /* Detect duplicates inside s2: these two shuffles against s2 produce
     * all six distinct element-pair comparisons between its lanes. */
    __m128i s2b = _mm_shuffle_epi32(s2, _MM_SHUFFLE(1, 0, 0, 3));
    __m128i s2c = _mm_shuffle_epi32(s2, _MM_SHUFFLE(2, 1, 0, 3));
    s2b = _mm_cmpeq_epi32(s2b, s2);
    s2c = _mm_cmpeq_epi32(s2c, s2);
    __m128i s2bc= _mm_or_si128(s2b, s2c);

    /* Route the "equals an earlier s2 lane" compare bytes into the lane
     * they belong to (lane 3 needs two source bytes: indices 12 and 0).
     * The -1 entries zero lane 0, which is always a first occurrence. */
    s2bc = _mm_shuffle_epi8(s2bc, _mm_set_epi8(-1,-1,0,12, -1,-1,-1,8, -1,-1,-1,4, -1,-1,-1,-1));
    /* dupmask lane = all-ones where the s2 lane is NOT a duplicate. */
    __m128i dupmask = _mm_cmpeq_epi32(s2bc, _mm_setzero_si128());
    match = _mm_and_si128(match, dupmask);

    int matchmask = _mm_movemask_ps (_mm_castsi128_ps(match));
    int ret = _mm_popcnt_u32(matchmask);
    return ret;
}
SSSE3 pshufb. pcmpeq ( a pxor ) (bslli(s2bc, 12)), OR AND.
d==bc c==ab b==a a==d = s2b|s2c
d==a 0 0 0 = byte-shift-left(s2b) = s2d0
d==abc c==ab b==a a==d = s2abc
d==abc c==ab b==a 0 = mask(s2abc). Maybe use PBLENDW or MOVSS from s2d0 (which we know has zeros) to save loading a 16B mask.
/* Alternative dupmask construction avoiding pshufb and its 16-byte
 * shuffle-control constant: OR the two rotated self-compares of s2, then
 * byte-shift s2b left by 12 so its low lane (a==d) lands in the top lane
 * as d==a, completing d==abc.  A blend then forces lane 0 to zero — the
 * two low words come from s2d0, which is zero there after the shift — so
 * the first occurrence of a value is never flagged as a duplicate.
 * Note this dupmask is true-for-duplicate, hence the final ANDN instead
 * of the AND used in the pshufb version.
 * NOTE(review): fragment relies on s2b/s2c/match from the enclosing
 * function; _mm_blend_epi16 additionally requires SSE4.1. */
__m128i s2abcd = _mm_or_si128(s2b, s2c);
//s2bc = _mm_shuffle_epi8(s2bc, _mm_set_epi8(-1,-1,0,12, -1,-1,-1,8, -1,-1,-1,4, -1,-1,-1,-1));
//__m128i dupmask = _mm_cmpeq_epi32(s2bc, _mm_setzero_si128());
__m128i s2d0 = _mm_bslli_si128(s2b, 12); // d==a 0 0 0
s2abcd = _mm_or_si128(s2abcd, s2d0);
__m128i dupmask = _mm_blend_epi16(s2abcd, s2d0, 0 | (2 | 1)); // imm 3: take the two low words (lane 0) from s2d0 (known zero)
//__m128i dupmask = _mm_and_si128(s2abcd, _mm_set_epi32(-1, -1, -1, 0));
match = _mm_andnot_si128(dupmask, match); // ~dupmask & match; the first operand is the inverted one
MOVSS; AMD, FP. PBLENDW - SSE4.1. popcnt AMD K10, PBLENDW ( PhenomII , , ). , K10 pshufb, SSE4.1 POPCNT PBLENDW. ( PSHUFB, -).
movemask s2bc op. , , , movemask , ANDN . BMI1 , , Skylake Celerons Pentiums . ( , IMO. , BMI .)
/* Scalar alternative: extract both masks with movemask and finish the
 * duplicate suppression in integer registers instead of the vector
 * domain.  cast() is shorthand for _mm_castsi128_ps.
 * NOTE(review): assumes s2bc is the pre-shuffle OR of the s2
 * self-compares, so bit 0 holds a==d; the <<3 moves it onto lane d
 * (bits 1 and 2 shift into bits 4-5, which is harmless since matchmask
 * only occupies bits 0-3) — confirm against the lane table above. */
unsigned int dupmask = _mm_movemask_ps(cast(s2bc));
dupmask |= dupmask << 3;   // relocate the a==d bit from lane 0 to lane 3 (d==a)
dupmask &= ~1;             // lane 0 is always a first occurrence, never a dup
unsigned int matchmask = _mm_movemask_ps(cast(match));
matchmask &= ~dupmask;     // drop matches on duplicated s2 lanes
return _mm_popcnt_u32(matchmask);
AMD XOP VPPERM ( ) -shuffle OR, s2b s2c.
Hmm, pshufb , , pcmpeqd a pxor. , D-. , .
, , . , OR AND port0 ( Intel), . pxor , ( Intel SnB).
, IACA.
PBLENDW PSHUFB (22 , -AVX), PSHUFB ( 7.1c, 7.4c, PBLENDW , .) IACA , , PANDN PBLENDW, 7.4c, . Port0 , IDK , PBLENDW.
, .
, , .
s2 - , s2 s1, , 4, . , , , , .
:
s2 . 0. ANDN s1 vs s2.
, s2 , popcnt.
s2.d s2 ( ). , 3 . , , PTEST / SETCC ( popcount). (PTEST _mm_setr_epi32(0, -1, -1, -1), c,b,a, d==d). (c == a | c == b) b == a . Intel Haswell 4 ALU, 3 , 6. AMD .
shuffle s2, , , . , movemask → 4- -?