How to find the maximum element in an array of 8 bytes, 8 shorts or 8 ints? I may need only the position of the max element, the value of the maximum element, or both of them.
For example :
/*
 * Return the index of the largest of the eight values in src.
 * If the maximum occurs more than once, the lowest such index is returned.
 */
unsigned FindMax8(const uint32_t src[8])
{
    unsigned best_idx = 0;
    uint32_t best_val = src[0];
    unsigned pos = 1;

    /* Element 0 is the initial candidate, so scanning starts at 1. */
    while (pos < 8) {
        uint32_t cur = src[pos];
        if (cur > best_val) {   /* strict compare keeps the first maximum */
            best_val = cur;
            best_idx = pos;
        }
        pos++;
    }
    return best_idx;
}
At -O2, clang compiles this as a loop and does not use NEON, even though NEON should give a decent performance boost (because it eliminates many data-dependent branches?).
The cases with 8 bytes and 8 shorts should be simpler, since the entire array can be loaded into a single q-register. On arm64 this should be much simpler still with vmaxv_u16, but how can I do it efficiently with 32-bit NEON?
As Marc pointed out, the GCC auto-vectorizer produces the following for neon64:
// Compiler output for AArch64; presumably x0 holds the pointer to the 8-element array.
// The value left in w0 is the maximum itself, not its index.
// q0 = the 16 bytes at x0+16
ldr q0, [x0, 16]
// v2 = the 32-bit word at x0 replicated into all four lanes
ld1r {v2.4s}, [x0]
// q1 = the 16 bytes at x0
ldr q1, [x0]
// lane-wise unsigned max of v0 and v2
umax v0.4s, v0.4s, v2.4s
// lane-wise unsigned max of that result and v1
umax v0.4s, v0.4s, v1.4s
// reduce: unsigned max across the four lanes of v0, result in s0
umaxv s0, v0.4s
// copy lane 0 of v0 to the general-purpose register w0
umov w0, v0.s[0]
, , uint32x4_t res, , , - . , , .
( ):
32- .
vst1q_u32 (src, res), C- .
vmov 32- , vget_lane_u64 , - .
, :
/*
 * Load four 32-bit lanes starting at input and return the index (0..3) of
 * the largest lane. If the maximum occurs more than once, the lowest such
 * index is returned.
 *
 * The lanes are brought out of the vector as two 64-bit halves and then
 * split with mask/shift, instead of being extracted one 32-bit lane at a
 * time.
 */
unsigned compute(unsigned short *input)
{
    /* The buffer is read as four uint32_t values regardless of the pointer type. */
    uint32x4_t lanes = vld1q_u32((uint32_t*)(input));

    /* One 64-bit extraction per half of the vector. */
    uint64_t lo_pair = vget_lane_u64(vreinterpret_u64_u32(vget_low_u32(lanes)), 0);
    uint64_t hi_pair = vget_lane_u64(vreinterpret_u64_u32(vget_high_u32(lanes)), 0);

    /*
     * NOTE(review): assumes the even lane sits in the low 32 bits of each
     * 64-bit half -- confirm for big-endian targets.
     */
    uint32_t lane0 = (uint32_t)(lo_pair & 0xffffffff);
    uint32_t lane1 = (uint32_t)(lo_pair >> 32);
    uint32_t lane2 = (uint32_t)(hi_pair & 0xffffffff);
    uint32_t lane3 = (uint32_t)(hi_pair >> 32);

    /* Running maximum over lanes 0..2; strict compares keep the first maximum. */
    uint32_t best_val = lane0;
    unsigned best_idx = 0;

    if (lane1 > best_val) {
        best_val = lane1;
        best_idx = 1;
    }
    if (lane2 > best_val) {
        best_val = lane2;
        best_idx = 2;
    }

    /* The last lane only needs to decide the index; its value is not kept. */
    return (lane3 > best_val) ? 3 : best_idx;
}
, , :
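- use vmax/vpmax to find the max element
- broadcast it so that every lane of a u32x4_t holds the max
- use vceq against the input to set the lanes equal to the max to 0xffffffff
- load a u32x4_t constant
{1u<<31, 1u<<30, 1u<<29, 1u<<28 } and vand the comparison result with it
- use a pairwise add or vorr to combine the 4 lanes into a single word
- vclz of that word gives the index of the max element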
, , . , . , . - , 80% , . ? c- regs 20-30%. , , vst1_u32 - , .
?
Update:
, . , , , , 3-4 . , , , , , . , , .