In addition to SSE copies, AVX copies, and std::copy: suppose we need to vectorize a loop as follows: 1) vectorize the first part of the loop (a multiple of 8 elements) with AVX; 2) split the remainder of the loop into two parts, and vectorize the part that is a multiple of 4 elements with SSE; 3) process the remaining elements of the loop with a sequential procedure. Consider an example of copying arrays:
#include <immintrin.h>

#include <algorithm>

/// Copies `length` floats from `src` to `dest`, using the widest vectors first.
///
/// The range is split at compile time into three consecutive parts:
///   1. `unroll_bound_avx` elements (largest multiple of 8) copied 8 at a time
///      with 256-bit AVX loads/stores;
///   2. `unroll_bound_sse` elements (largest multiple of 4 of what is left)
///      copied 4 at a time with 128-bit SSE loads/stores;
///   3. the remaining elements copied one by one.
///
/// Unaligned load/store intrinsics are used, so `src` and `dest` may have any
/// alignment. The aligned 256-bit forms require 32-byte-aligned addresses and
/// would fault on buffers that are only 16-byte aligned.
///
/// NOTE(review): when the translation unit is built with AVX enabled, compilers
/// typically emit VEX-encoded forms of the 128-bit intrinsics as well, which
/// avoids AVX/SSE state-transition penalties -- confirm for your toolchain.
///
/// @tparam length            number of floats to copy; must be non-negative
/// @tparam unroll_bound_avx  element count handled by the AVX loop
/// @tparam unroll_tail_avx   element count left after the AVX loop
/// @tparam unroll_bound_sse  element count handled by the SSE loop
/// @tparam unroll_tail_last  element count left for the scalar loop
/// @param src   first element of the source range (at least `length` floats)
/// @param dest  first element of the destination range (at least `length` floats)
template<int length,
         int unroll_bound_avx = length & (~7),
         int unroll_tail_avx = length - unroll_bound_avx,
         int unroll_bound_sse = unroll_tail_avx & (~3),
         int unroll_tail_last = unroll_tail_avx - unroll_bound_sse>
void simd_copy(float *src, float *dest)
{
    // A negative length would make the `!=` loop bounds below unreachable.
    static_assert(length >= 0, "simd_copy: length must be non-negative");

    auto src_ = src;
    auto dest_ = dest;

    // Vectorize first part of loop via AVX (8 floats per iteration).
    for (; src_ != src + unroll_bound_avx; src_ += 8, dest_ += 8)
    {
        __m256 buffer = _mm256_loadu_ps(src_);
        _mm256_storeu_ps(dest_, buffer);
    }

    // Vectorize remainder part of loop via SSE (4 floats per iteration).
    for (; src_ != src + unroll_bound_sse + unroll_bound_avx; src_ += 4, dest_ += 4)
    {
        __m128 buffer = _mm_loadu_ps(src_);
        _mm_storeu_ps(dest_, buffer);
    }

    // Process residual elements one at a time.
    for (; src_ != src + length; ++src_, ++dest_)
        *dest_ = *src_;
}

/// Demo: fills a 15-element array with 1, 2, ..., 15 and copies it with
/// simd_copy (8 elements via AVX, 4 via SSE, 3 scalar).
int main()
{
    const int sz = 15;

    // 32-byte alignment matches the width of a 256-bit AVX register.
    float *src = static_cast<float *>(_mm_malloc(sz * sizeof(float), 32));
    float *dest = static_cast<float *>(_mm_malloc(sz * sizeof(float), 32));
    if (src == nullptr || dest == nullptr)
    {
        // Release whichever allocation succeeded before reporting failure.
        if (src != nullptr)
            _mm_free(src);
        if (dest != nullptr)
            _mm_free(dest);
        return 1;
    }

    float a = 0;
    std::generate(src, src + sz, [&]() { return ++a; });

    simd_copy<sz>(src, dest);

    _mm_free(src);
    _mm_free(dest);
}
Is it correct to mix SSE and AVX in this way? Should AVX-SSE transitions be avoided?
source share