C++: Centralizing the use of SIMD

I have a library and many projects that depend on it. I want to optimize certain procedures inside the library using SIMD extensions. However, it is important for me to stay portable, so the SIMD usage should be hidden from the user behind an abstraction. Let me say up front that I do not want to use some existing library that already does the trick; I really want to understand whether what I want is possible, and to what extent.

My first idea was to have a “vector” wrapper class, so that the use of SIMD is transparent to the user, with a “scalar” vector class as a fallback when the target machine has no SIMD extension. The naive approach that occurred to me was to use the preprocessor to select one vector class out of many, depending on the target the library is compiled for: one scalar vector class, one using SSE (basically something like this: http://fastcpp.blogspot.de/2011/12/simple-vector3-class-with-sse-support.html), and so on, all with the same interface. This gives me good performance, but it means that I have to compile the library separately for every SIMD ISA that I want to support. Instead, I would like to detect the capabilities of the processor at runtime and choose the "best" implementation dynamically.

So, my second idea was to have a common “vector” base class with abstract (virtual) methods. A processor-detection function would return an instance of the optimal implementation. Used directly, this leads to ugly code, but the pointer to the vector object can be stored in a smart-pointer-like container that simply delegates calls to the vector object. I would actually prefer this approach because of its abstraction, but I am not sure whether calling virtual methods will kill the performance that I gain by using the SIMD extensions.

The last option that I came up with is to optimize entire routines and select the best version at runtime. I do not like this idea because it forces me to implement whole functions several times. I would prefer to write each routine once, using my idea of a vector class. For example, I would like to do something like this:

/**
 * Copies `size` bytes from `src` to `dst`, one `vector` register at a time.
 *
 * `vector` is the SIMD wrapper class discussed in the surrounding text. This
 * sketch assumes it exposes load(), store() and size(), and that size()
 * reports the register width in bytes (TODO confirm against the real class).
 *
 * @param dst  destination buffer, at least `size` bytes long
 * @param src  source buffer, at least `size` bytes long
 * @param size number of bytes to copy
 */
void Memcopy(void *dst, void *src, size_t size)
{
    vector v;

    // Standard C++ has no arithmetic on void*, so walk the buffers as bytes.
    unsigned char *out = static_cast<unsigned char *>(dst);
    unsigned char *in = static_cast<unsigned char *>(src);

    const size_t width = v.size();
    size_t remaining = size;

    // Move whole registers only; a partial register would read and write
    // past the end of the buffers. The width check avoids spinning forever
    // on a degenerate zero-width vector.
    while (width != 0 && remaining >= width)
    {
        v.load(in);
        v.store(out);
        in += width;
        out += width;
        remaining -= width;
    }

    // Copy whatever is left (fewer than `width` bytes) one byte at a time.
    while (remaining > 0)
    {
        *out++ = *in++;
        --remaining;
    }
}

, "" , . , . , 4 SSE 1 . , , ? , , , .

: http://compeng.uni-frankfurt.de/?vc , , .

+4
3

. 1, 2, 4, 8 16 float 1, 2, 4, 8 double. CPU , : SSE2, SSE4.1, AVX, AVX + FMA AVX512.

, 1, - . SIMD-, : Agner Fog Vector Class Library. .

VCL , AVX, , SSE ( AVX512 SSE). AVX ( AVX512), , .

//#include "vectorclass.h"
/**
 * Copies `size` bytes from `src` to `dst` using VCL's Vec8f (eight floats).
 *
 * Each load/store pair moves v.size() floats, so the pointers advance by
 * that many floats and the byte counter by v.size() * sizeof(float).
 * Assumes Vec8f::load / Vec8f::store take float pointers and perform
 * unaligned accesses, as in Agner Fog's Vector Class Library (TODO confirm
 * against the VCL version in use).
 *
 * @param dst  destination buffer, at least `size` bytes long
 * @param src  source buffer, at least `size` bytes long
 * @param size number of bytes to copy
 */
void Memcopy(void *dst, void *src, size_t size)
{
    Vec8f v; //eight floats using AVX hardware or AVX emulated with SSE twice.

    // Standard C++ has no arithmetic on void*, so use typed pointers.
    float *out = static_cast<float *>(dst);
    const float *in = static_cast<const float *>(src);

    const size_t lanes = static_cast<size_t>(v.size()); // floats per vector
    const size_t chunk = lanes * sizeof(float);         // bytes per vector
    size_t remaining = size;

    // Move whole vectors only, so no access goes past the end of a buffer.
    while (chunk != 0 && remaining >= chunk)
    {
        v.load(in);
        v.store(out);
        in += lanes;
        out += lanes;
        remaining -= chunk;
    }

    // Copy the tail (fewer than `chunk` bytes) one byte at a time.
    unsigned char *tailOut = reinterpret_cast<unsigned char *>(out);
    const unsigned char *tailIn = reinterpret_cast<const unsigned char *>(in);
    while (remaining > 0)
    {
        *tailOut++ = *tailIn++;
        --remaining;
    }
}

( memcpy . temporal-, IVB rep movsb ). , , , , vector Vec8f.

VLC, CPU, , /​​, . , , .

CPU. :

: , . TYPE float, double doubledouble N 1, 2, 4, 8 16. doubledouble , VCL. Vec1f, Vec4f, Vec8f, Vec16f, Vec1d, Vec2d, Vec4d, Vec8d, doubledouble1, doubledouble2, doubledouble4, doubledouble8.

/**
 * Runs the iteration (x, y) -> (x*x - y*y + cx, 2*x*y + cy) on a vector of
 * points, starting from (cx, cy), for at most `maxiter` steps.
 *
 * The loop stops early once no lane satisfies x*x + y*y < cut.
 * NOTE(review): add_mask() presumably bumps the counter only in lanes whose
 * mask bit is set, and mul2() presumably doubles its argument in place;
 * neither helper is visible here, so confirm against their definitions.
 *
 * @param cx      per-lane first coordinate of the constant term
 * @param cy      per-lane second coordinate of the constant term
 * @param cut     per-lane threshold compared against x*x + y*y
 * @param maxiter upper bound on the number of iterations
 * @return the per-lane counter accumulated by add_mask()
 */
template<typename TYPE, unsigned N>
static inline intn calc(floatn const &cx, floatn const &cy, floatn const &cut, int32_t maxiter) {
    floatn re = cx;
    floatn im = cy;
    intn count = 0;

    int32_t iter = 0;
    while (iter < maxiter) {
        // Squares are taken before re/im are overwritten further down.
        floatn re_sq = square(re);
        floatn im_sq = square(im);
        floatn norm_sq = re_sq + im_sq;

        booln inside = norm_sq < cut;
        if (!horizontal_or(inside)) {
            break; // no lane is below the threshold any more
        }
        add_mask(count, inside);

        // The cross term must use the old re/im, so compute it first.
        floatn cross = re * im;
        mul2(cross);

        re = re_sq - im_sq + cx;
        im = cross + cy;
        ++iter;
    }
    return count;
}

, SIMD , . , .

:

# Compile the same kernel source once per instruction set; only the -m<isa>
# flags and the output object name differ between these lines.
g++ -m64 -c -Wall -g -std=gnu++11 -O3 -fopenmp -mfpmath=sse -msse2          -Ivectorclass  kernel.cpp -okernel_sse2.o
g++ -m64 -c -Wall -g -std=gnu++11 -O3 -fopenmp -mfpmath=sse -msse4.1        -Ivectorclass  kernel.cpp -okernel_sse41.o
g++ -m64 -c -Wall -g -std=gnu++11 -O3 -fopenmp -mfpmath=sse -mavx           -Ivectorclass  kernel.cpp -okernel_avx.o
g++ -m64 -c -Wall -g -std=gnu++11 -O3 -fopenmp -mfpmath=sse -mavx2 -mfma    -Ivectorclass  kernel.cpp -okernel_avx2.o
# Same flags as the AVX2 build above, but a separate source file (kernel_fma.cpp).
g++ -m64 -c -Wall -g -std=gnu++11 -O3 -fopenmp -mfpmath=sse -mavx2 -mfma    -Ivectorclass  kernel_fma.cpp -okernel_fma.o
g++ -m64 -c -Wall -g -std=gnu++11 -O3 -fopenmp -mfpmath=sse -mavx512f -mfma -Ivectorclass  kernel.cpp -okernel_avx512.o
# Build frac.cpp and the instruction-set detector with only -msse2, then link
# them with all of the per-ISA kernel objects into the `frac` executable.
g++ -m64 -Wall -Wextra -std=gnu++11 -O3 -fopenmp -mfpmath=sse -msse2 -Ivectorclass frac.cpp vectorclass/instrset_detect.cpp kernel_sse2.o kernel_sse41.o kernel_avx.o kernel_avx2.o kernel_avx512.o kernel_fma.o -o frac

:

// Ask the detector once which instruction-set level this CPU supports; the
// branches below pick the matching set of kernels.
int iset = instrset_detect();

// Clear every kernel pointer first so that anything a branch does not assign
// is a well-defined null rather than a stale value.
fp_float1  = nullptr;
fp_floatn  = nullptr;
fp_double1 = nullptr;
fp_doublen = nullptr;
fp_doublefloat1  = nullptr;
fp_doublefloatn  = nullptr;
fp_doubledouble1 = nullptr;
fp_doubledoublen = nullptr;
// The next three are not assigned by either branch shown here.
fp_float128 = nullptr;
fp_floatn_fma = nullptr;
fp_doublen_fma = nullptr;

// Level 9 and above selects the AVX512 kernels (presumably AVX512F in VCL's
// instrset_detect numbering; confirm against the VCL version in use).
if (iset >= 9) {
    fp_float1  = &manddd_AVX512<float,1>;
    fp_floatn  = &manddd_AVX512<float,16>;
    fp_double1 = &manddd_AVX512<double,1>;
    fp_doublen = &manddd_AVX512<double,8>;
    fp_doublefloat1  = &manddd_AVX512<doublefloat,1>;
    fp_doublefloatn  = &manddd_AVX512<doublefloat,16>;
    fp_doubledouble1 = &manddd_AVX512<doubledouble,1>;
    fp_doubledoublen = &manddd_AVX512<doubledouble,8>;
}
// Level 8 selects the AVX2 kernels (presumably AVX2 in VCL's numbering).
else if (iset >= 8) {
    // Uses the AVX2 kernel like every other pointer in this branch; the
    // original took this one from manddd_AVX.
    fp_float1  = &manddd_AVX2<float,1>;
    fp_floatn  = &manddd_AVX2<float,8>;
    fp_double1 = &manddd_AVX2<double,1>;
    fp_doublen = &manddd_AVX2<double,4>;
    fp_doublefloat1  = &manddd_AVX2<doublefloat,1>;
    fp_doublefloatn  = &manddd_AVX2<doublefloat,8>;
    fp_doubledouble1 = &manddd_AVX2<doubledouble,1>;
    fp_doubledoublen = &manddd_AVX2<doubledouble,4>;
}
....

, . .

+1

, , CPU . v.load(), v.store() v.size() CPU, , . .


, , , CPU . (, memcpy , ).

API/ABI, , () / . , , , . .


, , . SSE2 x86-64, SSE2 32- x86 ( , Athlon XP Pentium III) , , x86. init CPUID . , " " , - SSSE3, SSE2-. SSSE3, , , SSE2, , SSE2.

. , , , , . x86. , , CPU , ++ std: atomic ( , , ). , , , . .


x264 ( h.264) . . x264_mc_init_mmx(), . ( CPU , MMX AVX2). , libx264 "encoder init". , , - constructor/init , .


, ++ ey (++ ish? ?), , , , , .

+5

Z . , . Memcopy - , ( ), SIMD, , , , . ( , ) SSE- . - :

typedef void (*MEM_COPY_FUNC)(void *, const void *, size_t);

extern MEM_COPY_FUNC memCopyPointer;

/**
 * Copies `size` bytes from `prc` to `pDest` using `VectorType` for the bulk
 * of the transfer.
 *
 * `VectorType` must be default-constructible and provide GetSize() (register
 * width in bytes), Load(const unsigned char*) and Store(unsigned char*), or
 * overloads those pointer types convert to. The first `size % width` bytes
 * are copied one at a time so that the remaining length is an exact multiple
 * of the register width; the rest is moved one register per iteration.
 *
 * @tparam VectorType SIMD (or scalar fallback) vector wrapper
 * @param pDest destination buffer, at least `size` bytes long
 * @param prc   source buffer, at least `size` bytes long
 * @param size  number of bytes to copy
 */
template <class VectorType>
void MemCopyTemplate(void *pDest, const void *prc, size_t size)
{
    VectorType v;

    unsigned char *pDst = static_cast<unsigned char *>(pDest);
    const unsigned char *pSrc = static_cast<const unsigned char *>(prc);

    const size_t width = v.GetSize();

    // Bytes that do not fill a whole register. A modulo is used instead of a
    // bit mask so the width need not be a power of two; a zero width falls
    // back to a pure byte copy instead of looping forever.
    size_t head = (width != 0) ? (size % width) : size;
    size -= head;
    for (; head != 0; --head)
    {
        *pDst++ = *pSrc++;
    }

    // `size` is now an exact multiple of `width`, so every Load/Store stays
    // inside the buffers.
    while (size)
    {
        v.Load(pSrc);
        v.Store(pDst);

        pDst += width;
        pSrc += width;
        size -= width;
    }
}

, , CPUID

memCopyPointer = MemCopyTemplate<ScalarVector>;

memCopyPointer = MemCopyTemplate<SSEVector>;

as you both suggested. Many thanks.

0
source

Source: https://habr.com/ru/post/1609839/


All Articles