I am working on my own Android application that targets an ARMv7 device. I need to do heavy computations on vectors (short and/or float), so I wrote some functions using NEON instructions to speed up the calculation. I got a speed-up factor of 1.5, which is not bad. I am wondering whether I can improve these functions to go even faster.
So the question is: what changes can I make to improve these features?
/*
 * Element-wise float vector addition using NEON: dst[i] = src1[i] + src2[i].
 *
 * Assumes count is a positive multiple of 4 (4 floats processed per
 * iteration; no scalar tail loop) -- TODO confirm callers guarantee this.
 *
 * Fix vs. original: src1, src2 and count are modified inside the asm
 * (post-index writeback on the loads/stores, `subs` on count), so they
 * must be read-write ("+r") operands, not input-only ones -- writing to
 * an input-only operand is undefined behavior and lets GCC assume the
 * registers still hold their original values afterwards. `subs`/`bgt`
 * also set the condition flags, so "cc" is added to the clobber list.
 */
void add_float_vector_with_neon3(float* dst, float* src1, float* src2, int count)
{
    asm volatile (
        "1:                              \n"
        "vld1.32 {q0}, [%[src1]]!        \n" /* load 4 floats, advance src1 */
        "vld1.32 {q1}, [%[src2]]!        \n" /* load 4 floats, advance src2 */
        "vadd.f32 q0, q0, q1             \n"
        "subs %[count], %[count], #4     \n" /* decrement and set flags */
        "vst1.32 {q0}, [%[dst]]!         \n" /* store 4 floats, advance dst */
        "bgt 1b                          \n"
        : [dst] "+r" (dst), [src1] "+r" (src1),
          [src2] "+r" (src2), [count] "+r" (count)
        :
        : "memory", "cc", "q0", "q1"
    );
}
/*
 * Multiplies a float vector by a scalar using NEON: dst[i] = src1[i] * scalar.
 *
 * Assumes count is a positive multiple of 4 (no scalar tail loop) --
 * TODO confirm callers guarantee this.
 *
 * Fix vs. original: src1 and count are modified inside the asm
 * (post-index writeback, `subs`), so they must be read-write ("+r")
 * operands rather than input-only -- writing an input-only operand is
 * undefined behavior. "cc" is added to the clobbers because `subs`
 * sets the condition flags. `scalar` is genuinely read-only and stays
 * an input; the "r" constraint places the float bits in a core
 * register, which is what `vdup.32 Qd, Rt` expects.
 */
void mul_float_vector_by_scalar_with_neon3(float* dst, float* src1, float scalar, int count)
{
    asm volatile (
        "vdup.32 q1, %[scalar]           \n" /* broadcast scalar to all 4 lanes */
        "2:                              \n"
        "vld1.32 {q0}, [%[src1]]!        \n" /* load 4 floats, advance src1 */
        "vmul.f32 q0, q0, q1             \n"
        "subs %[count], %[count], #4     \n" /* decrement and set flags */
        "vst1.32 {q0}, [%[dst]]!         \n" /* store 4 floats, advance dst */
        "bgt 2b                          \n"
        : [dst] "+r" (dst), [src1] "+r" (src1), [count] "+r" (count)
        : [scalar] "r" (scalar)
        : "memory", "cc", "q0", "q1"
    );
}
/*
 * Element-wise 16-bit integer vector addition using NEON:
 * dst[i] = src1[i] + src2[i] (wrapping on overflow, as `vadd.i16` does
 * modular arithmetic -- use vqadd.s16 if saturation is wanted).
 *
 * Assumes count is a positive multiple of 8 (8 shorts per iteration;
 * no scalar tail loop) -- TODO confirm callers guarantee this.
 *
 * Fix vs. original: src1, src2 and count are modified inside the asm
 * (post-index writeback, `subs`), so they must be read-write ("+r")
 * operands rather than input-only -- writing an input-only operand is
 * undefined behavior. "cc" is added to the clobbers because
 * `subs`/`bgt` use the condition flags.
 */
void add_short_vector_with_neon3(short* dst, short* src1, short* src2, int count)
{
    asm volatile (
        "3:                              \n"
        "vld1.16 {q0}, [%[src1]]!        \n" /* load 8 shorts, advance src1 */
        "vld1.16 {q1}, [%[src2]]!        \n" /* load 8 shorts, advance src2 */
        "vadd.i16 q0, q0, q1             \n"
        "subs %[count], %[count], #8     \n" /* decrement and set flags */
        "vst1.16 {q0}, [%[dst]]!         \n" /* store 8 shorts, advance dst */
        "bgt 3b                          \n"
        : [dst] "+r" (dst), [src1] "+r" (src1),
          [src2] "+r" (src2), [count] "+r" (count)
        :
        : "memory", "cc", "q0", "q1"
    );
}
/*
 * Multiplies a short vector by a float vector, storing back shorts:
 * dst[i] = (short)(src1[i] * src2[i]), 4 elements per iteration.
 *
 * Pipeline per iteration: widen s16 -> s32 (vmovl), convert to f32
 * (vcvt), multiply, convert back to s32 (truncating toward zero), then
 * narrow to s16 (vmovn). NOTE(review): vmovn simply drops the high
 * half, so results outside [-32768, 32767] wrap; vqmovn.s32 would
 * saturate instead -- confirm which behavior callers expect (behavior
 * kept identical to the original here).
 *
 * Assumes count is a positive multiple of 4 (no scalar tail loop) --
 * TODO confirm callers guarantee this.
 *
 * Fixes vs. original: src1, src2 and count are modified inside the asm
 * (post-index writeback, `subs`), so they must be read-write ("+r")
 * operands rather than input-only -- writing an input-only operand is
 * undefined behavior. "cc" is added for the flags set by `subs`, and
 * the redundant "d0" clobber is dropped (d0 is the low half of q0,
 * which is already listed).
 */
void mul_short_vector_by_float_vector_with_neon3(short* dst, short* src1, float* src2, int count)
{
    asm volatile (
        "4:                              \n"
        "vld1.16 {d0}, [%[src1]]!        \n" /* load 4 shorts, advance src1 */
        "vld1.32 {q1}, [%[src2]]!        \n" /* load 4 floats, advance src2 */
        "vmovl.s16 q0, d0                \n" /* sign-extend s16 -> s32 */
        "vcvt.f32.s32 q0, q0             \n" /* s32 -> f32 */
        "vmul.f32 q0, q0, q1             \n"
        "vcvt.s32.f32 q0, q0             \n" /* f32 -> s32 (truncates) */
        "vmovn.s32 d0, q0                \n" /* narrow s32 -> s16 (wraps) */
        "subs %[count], %[count], #4     \n" /* decrement and set flags */
        "vst1.16 {d0}, [%[dst]]!         \n" /* store 4 shorts, advance dst */
        "bgt 4b                          \n"
        : [dst] "+r" (dst), [src1] "+r" (src1),
          [src2] "+r" (src2), [count] "+r" (count)
        :
        : "memory", "cc", "q0", "q1"
    );
}
Thanks in advance!