Optimization of neon assembly functions

Question

Optimization of neon assembly functions

I am working on my own Android application that should run on an ARMv7 device. For some reason, I need to do heavy computations on vectors (short and / or floating). I performed some build function using NEON commands to speed up the calculation. I got a speed factor of 1.5, which is not bad. I am wondering if I can improve these features to go even faster.

So the question is: what changes can I make to improve these features?

    //add to float vectors.
//the result could be put in scr1 instead of dst
void add_float_vector_with_neon3(float* dst, float* src1, float* src2, int count)
{

    asm volatile (
           "1:                                                        \n"
           "vld1.32         {q0}, [%[src1]]!                          \n"
           "vld1.32         {q1}, [%[src2]]!                          \n"
           "vadd.f32        q0, q0, q1                                \n"
           "subs            %[count], %[count], #4                    \n"
           "vst1.32         {q0}, [%[dst]]!                           \n"
           "bgt             1b                                        \n"
           : [dst] "+r" (dst)
           : [src1] "r" (src1), [src2] "r" (src2), [count] "r" (count)
           : "memory", "q0", "q1"
      );
}

//multiply a float vector by a scalar.
//the result could be put in scr1 instead of dst
void mul_float_vector_by_scalar_with_neon3(float* dst, float* src1, float scalar, int count)
{

    asm volatile (

            "vdup.32         q1, %[scalar]                              \n"
            "2:                                                         \n"
            "vld1.32         {q0}, [%[src1]]!                           \n"
            "vmul.f32        q0, q0, q1                                 \n"
            "subs            %[count], %[count], #4                     \n"
            "vst1.32         {q0}, [%[dst]]!                            \n"
            "bgt             2b                                         \n"
            : [dst] "+r" (dst)
            : [src1] "r" (src1), [scalar] "r" (scalar), [count] "r" (count)
            : "memory", "q0", "q1"
      );
}

//add to short vector -> no problem of coding limits
//the result should be put in in a dest different from src1 and scr2
void add_short_vector_with_neon3(short* dst, short* src1, short* src2, int count)
{

    asm volatile (
           "3:                                                        \n"
           "vld1.16         {q0}, [%[src1]]!                          \n"
           "vld1.16         {q1}, [%[src2]]!                          \n"
           "vadd.i16        q0, q0, q1                                \n"
           "subs            %[count], %[count], #8                    \n"
           "vst1.16         {q0}, [%[dst]]!                           \n"
           "bgt             3b                                        \n"
           : [dst] "+r" (dst)
           : [src1] "r" (src1), [src2] "r" (src2), [count] "r" (count)
           : "memory", "q0", "q1"
      );
}

//multiply a short vector by a float vector and put the result bach into a short vector
//the result should be put in in a dest different from src1
void mul_short_vector_by_float_vector_with_neon3(short* dst, short* src1, float* src2, int count)
{
    asm volatile (
        "4:                                                         \n"
        "vld1.16        {d0}, [%[src1]]!                            \n"
        "vld1.32        {q1}, [%[src2]]!                            \n"
        "vmovl.s16      q0, d0                                      \n"
        "vcvt.f32.s32   q0, q0                                      \n"
        "vmul.f32       q0, q0, q1                                  \n"
        "vcvt.s32.f32   q0, q0                                      \n"
        "vmovn.s32      d0, q0                                      \n"
        "subs            %[count], %[count], #4                     \n"
        "vst1.16         {d0}, [%[dst]]!                            \n"
        "bgt             4b                                         \n"
        : [dst] "+r" (dst)
        : [src1] "r" (src1), [src2] "r" (src2), [count] "r" (count)
        : "memory", "d0", "q0", "q1"

    );
}

Thanks in advance!

+4

c android inline-assembly neon native

Madmax007 Oct 30 '15 at 11:50

source share

3 answers

Josejulio · Answer 1 · 2015-11-10T19:18:09+0000

You can try expanding your loop to process more elements per loop.

add_float_vector_with_neon3 10 (- ) 4 , 16 21 . http://pulsar.webshaker.net/ccc/sample-34e5f701

, ( 16), , .

user3528438 · Answer 2 · 2015-10-30T14:00:35+0000

, instrinsics.

, , .

, GCC, , load/store, ALU. , , , GCC .

GCC CFLAGS=-std=gnu11 -O3 -fgcse-lm -fgcse-sm -fgcse-las -fgcse-after-reload -mcpu=cortex-a9 -mfloat-abi=hard -mfpu=neon -fPIE -Wall . Loop , , . .

#include <arm_neon.h>

#define ASSUME_ALIGNED_FLOAT_128(ptr) ((float *)__builtin_assume_aligned((ptr), 16))

__attribute__((optimize("unroll-loops")))
void add_float_vector_with_neon3(      float *restrict dst,
                                 const float *restrict src1,
                                 const float *restrict src2, 
                                 size_t size)
{
    for(int i=0;i<size;i+=4){
        float32x4_t inFloat41  = vld1q_f32(ASSUME_ALIGNED_FLOAT_128(src1));
        float32x4_t inFloat42  = vld1q_f32(ASSUME_ALIGNED_FLOAT_128(src2));
        float32x4_t outFloat64 = vaddq_f32 (inFloat41, inFloat42);
        vst1q_f32 (ASSUME_ALIGNED_FLOAT_128(dst), outFloat64);
        src1+=4;
        src2+=4;
        dst+=4;
    }
}

MadMax007 · Answer 3 · 2015-11-30T16:21:55+0000

, , , , :

void add_float_vector_with_neon3(float* dst, float* src1, float* src2, int count)
{
    asm volatile (
            "1:                                 \n"
            "vld1.32 {q0,q1}, [%[src1]]!        \n"
            "vld1.32 {q2,q3}, [%[src2]]!        \n"
            "vadd.f32 q0, q0, q2                \n"
            "vadd.f32 q1, q1, q3                \n"
            "vld1.32 {q4,q5}, [%[src1]]!        \n"
            "vld1.32 {q6,q7}, [%[src2]]!        \n"
            "vadd.f32 q4, q4, q6                \n"
            "vadd.f32 q5, q5, q7                \n"
            "subs %[count], %[count], #16       \n"
            "vst1.32 {q0, q1}, [%[dst]]!        \n"
            "vst1.32 {q4, q5}, [%[dst]]!        \n"
            "bgt             1b                 \n"
            : [dst] "+r" (dst)
            : [src1] "r" (src1), [src2] "r" (src2), [count] "r" (count)
            : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7"
      );
}

(pulsar.webshaker.net/ccc/index.php) CPU/float, :

, firstQuartile, thirdQuartile, minVal, maxVal (, 1000 )

: 3564, 3206, 5126, 1761, 12144

: 3567, 3080, 4877, 3018, 11683

, , ...

Optimization of neon assembly functions

More articles: