(, <= 16 ), .
, , PaulR.
, (, 10% ), . .
:
int Merge3(const int *aArr, int aCnt, const int *bArr, int bCnt, int *dst) {
int i = 0, j = 0, k = 0;
while (i < aCnt - 32 && j < bCnt - 32) {
for (int t = 0; t < 32; t++) {
int aX = aArr[i], bX = bArr[j];
dst[k] = (aX < bX ? aX : bX);
k += (aX != bX);
i += (aX <= bX);
j += (aX >= bX);
}
}
while (i < aCnt && j < bCnt) {
...
:
- (32 ).
(i < aCnt && j < bCnt) t < 32. , . - . ,
cmov, setXX, . : , .
:
- ( ) (4 + 4) , , , 4- ():
4.95ns 4.65ns --- .
- ( ) 4 x 4 , 16- , -, _mm256_permutevar8x32_epi32 128- LUT, 8 , , _mm_movemask_ps + 16-entry LUT + _mm_shuffle_epi8 4 : 4.00ns 4.65ns --- .
+ LUT.
P.S. , . , 2 .