, 4 _, .
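TL:DR: work out the update formula for a stride of 4, so that each vector holds phase_current[i+0..i+3]
and is advanced with a plain vertical ADD (one add per vector, no shuffling between elements).
That avoids computing a prefix-sum inside every vector (which SIMD can do, but it costs about log2(vector_width)
shuffle + add steps to produce vector_width
results).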
, phase_increment_step
( ) . , USEFUL_FUNC(phase_current);
, , phase_current
+=
. useful_func
- increment_step.
- 4 SIMD, 1 . , Intel, 1 , 4 , useful_func
. USEFUL_FUNC, , ( , SIMD integer , , , 2 ).
, , .
, , sum , 4 . 4 ( SIMD, , useful_func
).
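The amount added to the phase grows by step
, then step*2
, then step*3
, and so on, so after n iterations the accumulated multiple of step is the n-th triangular number
: sum(1..n) = n*(n+1)/2
. That is the sequence 0, 1, 3, 6, 10, 15, 21, 28,... (https://oeis.org/A000217). (This comes on top of n times the initial phase_increment
, which is added as well.)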
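We want to step through this sequence 4 elements at a time. The first difference for a stride of 4 is (n+4)*(n+5)/2 - n*(n+1)/2
, which simplifies to 4*n + 10
. That is linear in n, so taking the difference again for a stride of 4 gives a constant second difference of 4*4 = 16
. In other words, if phase_increment
is kept as a vector, each SIMD iteration only has to add 16*phase_increment_step
to it.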
, ( 4, 16, ). , :
S = increment_step (constant)
inc0 = increment initial value
p0 = phase_current initial value
[ 0*S, 1*S, 2*S, 3*S ]
[ 4*S, 5*S, 6*S, 7*S ]
[ p0, p0+(inc0+S), p0+(inc0+S)+(inc0+2*S), p0+(inc0+S)+(inc0+2*S)+(inc0+3*S) ]
[ p0, p0+inc0+S, p0+2*inc0+3*S, p0+3*inc0+6*S ]
[ p0+4*inc0+10*S, p0+5*inc0+15*S, p0+6*inc0+21*S, p0+7*inc0+28*S ]
4*n + 10
:
// first 4 vectors of phase_current
[ p0, p0+1*inc0+ 1*S, p0+2*inc0+3*S, p0+ 3*inc0+ 6*S ]
[ p0+4*inc0+10*S, p0+5*inc0+15*S, p0+6*inc0+21*S, p0+ 7*inc0+28*S ]
[ p0+8*inc0+36*S, p0+9*inc0+45*S, p0+10*inc0+55*S, p0+11*inc0+66*S ]
[ p0+12*inc0+78*S, p0+13*inc0+91*S, p0+14*inc0+105*S, p0+15*inc0+120*S ]
first 3 vectors of phase_increment (subtract consecutive phase_current vectors):
[ 4*inc0+10*S, 4*inc0 + 14*S, 4*inc0 + 18*S, 4*inc0 + 22*S ]
[ 4*inc0+26*S, 4*inc0 + 30*S, 4*inc0 + 34*S, 4*inc0 + 38*S ]
[ 4*inc0+42*S, 4*inc0 + 46*S, 4*inc0 + 50*S, 4*inc0 + 54*S ]
first 2 vectors of phase_increment_step:
[ 16*S, 16*S, 16*S, 16*S ]
[ 16*S, 16*S, 16*S, 16*S ]
Yes, as expected, a constant vector works for phase_increment_step
Putting this together, here is how it can be written in C with Intel SSE/AVX intrinsics:
#include <stdint.h>
#include <immintrin.h>
/* Consumer of each 4-lane phase vector; defined elsewhere. */
void USEFUL_FUNC(__m128i phase);

/**
 * Generate a doubly-integrated phase sequence four elements at a time and
 * hand each 4-lane vector to USEFUL_FUNC.
 *
 * With p0 = phase_start, inc0 = phase_increment_start and
 * S = phase_increment_step, element n of the sequence is
 *     p0 + n*inc0 + (n*(n+1)/2)*S        (arithmetic is modulo 2^32).
 * Vector v passed to USEFUL_FUNC holds elements 4*v .. 4*v+3, lowest
 * element in lane 0.
 *
 * @param phase_start            phase of element 0
 * @param phase_increment_start  phase increment before the first step
 * @param phase_increment_step   constant added to the increment per element
 * @param blockSize              number of USEFUL_FUNC calls, i.e. number of
 *                               4-element vectors produced (not scalar count)
 *
 * NOTE(review): if callers think of blockSize as a count of scalar samples,
 * they must pass samples/4 here -- confirm against the call sites.
 */
void double_integral(uint32_t phase_start, uint32_t phase_increment_start, uint32_t phase_increment_step, unsigned blockSize)
{
    const uint32_t p0 = phase_start;
    const uint32_t inc0 = phase_increment_start;
    const uint32_t S = phase_increment_step;

    /* First vector: elements 0..3, coefficients of S are 0, 1, 3, 6. */
    __m128i phase = _mm_setr_epi32((int)p0,
                                   (int)(p0 + 1 * inc0 + 1 * S),
                                   (int)(p0 + 2 * inc0 + 3 * S),
                                   (int)(p0 + 3 * inc0 + 6 * S));

    /* First difference for a stride of 4: element[n+4] - element[n]
     * = 4*inc0 + (4*n + 10)*S, for n = 0..3. This is exactly what must be
     * added to vector 0 to obtain vector 1. */
    __m128i pincr = _mm_setr_epi32((int)(4 * inc0 + 10 * S),
                                   (int)(4 * inc0 + 14 * S),
                                   (int)(4 * inc0 + 18 * S),
                                   (int)(4 * inc0 + 22 * S));

    /* Second difference for a stride of 4 is the constant 4*4*S = 16*S. */
    const __m128i pstep_stride = _mm_set1_epi32((int)(16 * S));

    for (unsigned i = 0; i < blockSize; ++i) {
        USEFUL_FUNC(phase);
        /* Advance phase with the CURRENT increment first, then bump the
         * increment. Doing it the other way round skips the first
         * increment vector and offsets every later vector by a multiple
         * of 16*S. */
        phase = _mm_add_epi32(phase, pincr);
        pincr = _mm_add_epi32(pincr, pstep_stride);
    }
}
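Further reading: for more about SIMD in general, and x86 SSE/AVX in particular, see fooobar.com/tags/sse/..., and especially the SIMD at Insomniac Games talk (GDC 2015), which has good material on how to think about SIMD in general and how to lay out your data so that you can use it.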