-
/*
 * Three-term sliding sum with wrap-around over the sixteen 16-bit
 * elements of t0:  s[i] = t0[i] + t0[i+1] + t0[i+2], indices taken mod 16.
 * t0 is not defined in this fragment; it must already hold the input vector.
 */
/* Byte-shuffle control: within each 128-bit lane, move every 16-bit element
 * down one position (bytes 2..15 first, then bytes 0..1). */
__m256i idx1 = _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1);
/* 32-bit permute control: rotate the eight dwords by one across the whole
 * register, i.e. by two 16-bit elements. */
__m256i idx2 = _mm256_setr_epi32(1,2,3,4,5,6,7,0);
/* t1: per-lane rotation by one 16-bit element. The last element of each lane
 * is wrong for a full-width rotate because it wrapped inside its own lane. */
__m256i t1 = _mm256_shuffle_epi8 (t0, idx1);
/* t2: t1 with its two 128-bit halves swapped, so each lane's last element
 * holds the value the other lane needs. */
__m256i t2 = _mm256_permute2x128_si256(t1, t1, 1);
/* t3: mask 0x80 takes 16-bit element 7 of each lane from t2 and the rest from
 * t1, giving t0 rotated by one element across all 256 bits. */
__m256i t3 = _mm256_blend_epi16(t1,t2,0x80);
/* t4: t0 rotated by two 16-bit elements across all 256 bits. */
__m256i t4 = _mm256_permutevar8x32_epi32(t0, idx2);
/* s: element-wise 16-bit sum of t0 and its two rotations. */
__m256i s = _mm256_add_epi16(t0, _mm256_add_epi16(t3,t4));
.
#include <stdio.h>
#include <x86intrin.h>
/*
 * Demo: for sixteen 16-bit values v[0..15] compute
 *   sum3[i] = v[i] + v[i+1] + v[i+2]   (indices wrap mod 16)
 * with AVX2, then print the input, both rotated copies and the sum,
 * one vector per line in hex.
 */
int main(void) {
    /* Input 0..15 so every output lane shows which source lanes it came from. */
    short src[16];
    for (int k = 0; k < 16; ++k) {
        src[k] = k;
    }

    /* Per-128-bit-lane byte shuffle: shift each 16-bit element down one slot. */
    const __m256i rot1_ctl = _mm256_setr_epi8(
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
        2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1);
    /* Full-width dword rotate by one (= two 16-bit elements). */
    const __m256i rot2_ctl = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);

    const __m256i v = _mm256_loadu_si256((__m256i*)src);

    /* Rotate by one element: shuffle inside lanes, then patch the last
     * element of each lane with the one from the opposite lane. */
    const __m256i in_lane = _mm256_shuffle_epi8(v, rot1_ctl);
    const __m256i swapped = _mm256_permute2x128_si256(in_lane, in_lane, 1);
    const __m256i next1 = _mm256_blend_epi16(in_lane, swapped, 0x80);

    /* Rotate by two elements in a single cross-lane permute. */
    const __m256i next2 = _mm256_permutevar8x32_epi32(v, rot2_ctl);

    const __m256i sum3 = _mm256_add_epi16(v, _mm256_add_epi16(next1, next2));

    /* Dump in the same order as before: input, +1 rotation, +2 rotation, sum. */
    const __m256i rows[4] = { v, next1, next2, sum3 };
    for (int r = 0; r < 4; ++r) {
        short out[16];
        _mm256_storeu_si256((__m256i*)out, rows[r]);
        for (int k = 0; k < 16; ++k) {
            printf("%2x ", out[k]);
        }
        puts("");
    }

    return 0;
}
0 1 2 3 4 5 6 7 8 9 a b c d e f
1 2 3 4 5 6 7 8 9 a b c d e f 0
2 3 4 5 6 7 8 9 a b c d e f 0 1
3 6 9 c f 12 15 18 1b 1e 21 24 27 2a 1d 10