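Below is an 8x4 float transpose written with the Vector Class Library (VCL).
It follows Stgatilov's approach, expressed with VCL vector types and blend operations. The function: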
void tran8x4_AVX(float *a, float *b) {
Vec8f tmp0, tmp1, tmp2, tmp3;
Vec8f row0, row1, row2, row3;
row0 = Vec8f().load(&a[8*0]);
row1 = Vec8f().load(&a[8*1]);
row2 = Vec8f().load(&a[8*2]);
row3 = Vec8f().load(&a[8*3]);
tmp0 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row0, row1);
tmp2 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row0, row1);
tmp1 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row2, row3);
tmp3 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row2, row3);
row0 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp0, tmp1);
row1 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp0, tmp1);
row2 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp2, tmp3);
row3 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp2, tmp3);
row0.get_low().store(&b[ 4*0]);
row1.get_low().store(&b[ 4*1]);
row2.get_low().store(&b[ 4*2]);
row3.get_low().store(&b[ 4*3]);
row0.get_high().store(&b[ 4*4]);
row1.get_high().store(&b[ 4*5]);
row2.get_high().store(&b[ 4*6]);
row3.get_high().store(&b[ 4*7]);
}
Assembly generated for this function (g++ -S -O3 -mavx test.cpp):
# Load the four 8-float input rows from %rdi (offsets 0/32/64/96 bytes):
# row0 -> ymm1, row1 -> ymm4, row2 -> ymm3, row3 -> ymm0.
vmovups 32(%rdi), %ymm4
vmovups 64(%rdi), %ymm3
vmovups (%rdi), %ymm1
vmovups 96(%rdi), %ymm0
# First round of shuffles; each blend8f call in the source became one vshufps.
# $68  (0x44): lanes 0,1 of each source per 128-bit half -> tmp0 (ymm2), tmp1 (ymm5).
# $238 (0xEE): lanes 2,3 of each source per 128-bit half -> tmp2 (ymm1), tmp3 (ymm0).
vshufps $68, %ymm4, %ymm1, %ymm2
vshufps $68, %ymm0, %ymm3, %ymm5
vshufps $238, %ymm4, %ymm1, %ymm1
vshufps $238, %ymm0, %ymm3, %ymm0
# Second round of shuffles.
# $136 (0x88): lanes 0,2 of each source; $221 (0xDD): lanes 1,3 of each source.
# Results: ymm4, ymm2, ymm3, ymm0 correspond to row0..row3 after the blends.
vshufps $136, %ymm5, %ymm2, %ymm4
vshufps $221, %ymm5, %ymm2, %ymm2
vshufps $136, %ymm0, %ymm1, %ymm3
vshufps $221, %ymm0, %ymm1, %ymm0
# Store the low 128-bit halves to %rsi offsets 0..48, interleaved with
# extracting each high half into the same xmm register for the later stores.
vmovups %xmm4, (%rsi)
vextractf128 $0x1, %ymm4, %xmm4
vmovups %xmm2, 16(%rsi)
vextractf128 $0x1, %ymm2, %xmm2
vmovups %xmm3, 32(%rsi)
vextractf128 $0x1, %ymm3, %xmm3
vmovups %xmm0, 48(%rsi)
vextractf128 $0x1, %ymm0, %xmm0
# Store the extracted high halves to %rsi offsets 64..112.
vmovups %xmm4, 64(%rsi)
vmovups %xmm2, 80(%rsi)
vmovups %xmm3, 96(%rsi)
vmovups %xmm0, 112(%rsi)
# Zero the upper halves of the ymm registers before returning.
vzeroupper
ret
.cfi_endproc
#include <stdio.h>
#include "vectorclass.h"
/* Scalar reference transpose: reads `a` as a 4x8 row-major matrix and
 * writes its transpose to `b` as an 8x4 row-major matrix, i.e.
 * b[col*4 + row] = a[row*8 + col].
 *   a: 32 input floats.
 *   b: 32 output floats.
 */
void tran8x4(float *a, float *b) {
    enum { SRC_ROWS = 4, SRC_COLS = 8 };

    /* Walk the input linearly; `src` visits a[] in the same order as a
     * row-outer / column-inner nested loop would. */
    for (int src = 0; src < SRC_ROWS * SRC_COLS; ++src) {
        const int row = src / SRC_COLS;
        const int col = src % SRC_COLS;
        b[col * SRC_ROWS + row] = a[src];
    }
}
void tran8x4_AVX(float *a, float *b) {
Vec8f tmp0, tmp1, tmp2, tmp3;
Vec8f row0, row1, row2, row3;
row0 = Vec8f().load(&a[8*0]);
row1 = Vec8f().load(&a[8*1]);
row2 = Vec8f().load(&a[8*2]);
row3 = Vec8f().load(&a[8*3]);
tmp0 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row0, row1);
tmp2 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row0, row1);
tmp1 = blend8f<0, 1, 8, 9, 4, 5, 12, 13>(row2, row3);
tmp3 = blend8f<2, 3, 10, 11, 6, 7, 14, 15>(row2, row3);
row0 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp0, tmp1);
row1 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp0, tmp1);
row2 = blend8f<0, 2, 8, 10, 4, 6, 12, 14>(tmp2, tmp3);
row3 = blend8f<1, 3, 9, 11, 5, 7, 13, 15>(tmp2, tmp3);
row0.get_low().store(&b[ 4*0]);
row1.get_low().store(&b[ 4*1]);
row2.get_low().store(&b[ 4*2]);
row3.get_low().store(&b[ 4*3]);
row0.get_high().store(&b[ 4*4]);
row1.get_high().store(&b[ 4*5]);
row2.get_high().store(&b[ 4*6]);
row3.get_high().store(&b[ 4*7]);
}
// Prints a rows x cols row-major float matrix: every value formatted as
// "%2.0f " and a newline after each row.
static void print_matrix(const float *m, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        for (int c = 0; c < cols; ++c) {
            printf("%2.0f ", m[r * cols + c]);
        }
        puts("");
    }
}

// Demo driver: fills a 4x8 matrix with 0..31, prints it, then prints the
// 8x4 results of the scalar and the AVX transpose, separated by blank lines,
// so the two outputs can be compared by eye.
int main() {
    float a[32], b1[32], b2[32];
    for (int k = 0; k < 32; ++k) {
        a[k] = k;
    }

    print_matrix(a, 4, 8);

    tran8x4(a, b1);
    tran8x4_AVX(a, b2);

    puts("");
    print_matrix(b1, 8, 4);
    puts("");
    print_matrix(b2, 8, 4);
}