How to store a vector to an unaligned location in memory using AltiVec

I know from the tutorial that unaligned loading and storing may look like this:

//Load a vector from an unaligned location in memory __vector unsigned char LoadUnaligned(const unsigned char * src ) { __vector unsigned char permuteVector = vec_lvsl(0, src); __vector unsigned char low = vec_ld( 0, src); __vector unsigned char high = vec_ld( 16, src); return vec_perm( low, high, permuteVector); } //Store a vector to an unaligned location in memory void StoreUnaligned(__vector unsigned char v, __vector unsigned char * dst) { //Load the surrounding area __vector unsigned char low = vec_ld( 0, dst); __vector unsigned char high = vec_ld( 16, dst); //Prepare the constants that we need __vector unsigned char permuteVector = vec_lvsr( 0, (int*) dst); __vector signed char oxFF = vec_splat_s8( -1 ); __vector signed char ox00 = vec_splat_s8( 0 ); //Make a mask for which parts of the vectors to swap out __vector unsigned char mask = vec_perm( ox00, oxFF, permuteVector ); //Right rotate our input data v = vec_perm( v, v, permuteVector ); //Insert our data into the low and high vectors low = vec_sel( v, low, mask ); high = vec_sel( high, v, mask ); //Store the two aligned result vectors vec_st( low, 0, dst); vec_st( high, 16, dst); } 

It looks awful. So much work to store one vector! And it comes with a corresponding performance loss.

 void SomeFuncA(const unsigned char * src, size_t size, unsigned char * dst) { for(size_t i = 0; i < size; i += 16) { __vector unsigned char a = vec_ld(0, src + i); //simple work vec_st(a, 0, dst + i); } } void SomeFuncU(const unsigned char * src, size_t size, unsigned char * dst) { for(size_t i = 0; i < size; i += 16) { __vector unsigned char a = LoadUnaligned(src + i); //simple work StoreUnaligned(dst + i, a); } } 

The second function is 3-4 times slower than the first. Since I cannot control the alignment of the input and output memory, I have to implement both versions. How can I minimize the performance loss in the unaligned case?

+5
source share
1 answer

First of all, I want to note that if you store AltiVec vectors to unaligned memory many times in a row, you do not need to preserve the previous state of memory in the middle of the array, only at its beginning and at its end. For this purpose, there are a useful function and class in the Simd Library that implement this functionality:

 typedef __vector uint8_t v128_u8; const v128_u8 K8_00 = vec_splat_u8(0x00); const v128_u8 K8_FF = vec_splat_u8(0xFF); template <bool align> inline v128_u8 Load(const uint8_t * p); template <> inline v128_u8 Load<false>(const uint8_t * p) { v128_u8 lo = vec_ld(0, p); v128_u8 hi = vec_ld(16, p); return vec_perm(lo, hi, vec_lvsl(0, p)); } template <> inline v128_u8 Load<true>(const uint8_t * p) { return vec_ld(0, p); } template <bool align> struct Storer; template <> struct Storer<true> { template <class T> Storer(T * ptr) :_ptr((uint8_t*)ptr) { } template <class T> inline void First(T value) { vec_st((v128_u8)value, 0, _ptr); } template <class T> inline void Next(T value) { _ptr += 16; vec_st((v128_u8)value, 0, _ptr); } inline void Flush() { } private: uint8_t * _ptr; }; template <> struct Storer<false> { template <class T> inline Storer(T * ptr) :_ptr((uint8_t*)ptr) { _perm = vec_lvsr(0, _ptr); _mask = vec_perm(K8_00, K8_FF, _perm); } template <class T> inline void First(T value) { _last = (v128_u8)value; v128_u8 background = vec_ld(0, _ptr); v128_u8 foreground = vec_perm(_last, _last, _perm); vec_st(vec_sel(background, foreground, _mask), 0, _ptr); } template <class T> inline void Next(T value) { _ptr += 16; vec_st(vec_perm(_last, (v128_u8)value, _perm), 0, _ptr); _last = (v128_u8)value; } inline void Flush() { v128_u8 background = vec_ld(16, _ptr); v128_u8 foreground = vec_perm(_last, _last, _perm); vec_st(vec_sel(foreground, background, _mask), 16, _ptr); } private: uint8_t * _ptr; v128_u8 _perm; v128_u8 _mask; v128_u8 _last; }; 

Its use will be as follows:

 template<bool align> void SomeFunc(const unsigned char * src, size_t size, unsigned char * dst) { Storer<align> _dst(dst); __vector unsigned char a = Load<align>(src); //simple work _dst.First(a);// save first block for(size_t i = 16; i < size; i += 16) { __vector unsigned char a = Load<align>(src + i); //simple work _dst.Next(a);// save body } _dst.Flush(); // save tail } 

Loss of performance will be 30-40% compared to the aligned version. This is unpleasant, of course, but bearable.

An additional advantage is code reduction - all functions (aligned and not aligned) have the same implementation.

+4
source

Source: https://habr.com/ru/post/1242705/


All Articles