I hope this code example helps you:
#include <arm_neon.h>

inline int32x4_t Correlation(const int8x16_t & a, const int8x16_t & b)
{
    // Widening multiply: int8 x int8 -> int16 for the low and high halves of the vectors.
    int16x8_t lo = vmull_s8(vget_low_s8(a), vget_low_s8(b));
    int16x8_t hi = vmull_s8(vget_high_s8(a), vget_high_s8(b));
    // Pairwise add-long (int16 -> int32) and accumulate both halves into one int32x4_t.
    return vaddq_s32(vpaddlq_s16(lo), vpaddlq_s16(hi));
}
void CorrelationSum(const int8_t * a, const int8_t * b, size_t bStride, size_t size, int32_t * sum)
{
    // bStride is not used in this simplified single-row version; size must be a multiple of 16.
    int32x4_t sums = vdupq_n_s32(0);
    for (size_t i = 0; i < size; i += 16)
        sums = vaddq_s32(sums, Correlation(vld1q_s8(a + i), vld1q_s8(b + i)));
    // Horizontal sum of the four 32-bit lanes.
    *sum = vgetq_lane_s32(sums, 0) + vgetq_lane_s32(sums, 1)
         + vgetq_lane_s32(sums, 2) + vgetq_lane_s32(sums, 3);
}
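The loop above processes 16 bytes per iteration and assumes that size is a multiple of 16. Below is a minimal sketch of how a caller could handle an arbitrary length by finishing the remainder with scalar code; it builds on the CorrelationSum above, and the wrapper name is my own, not part of the Simd library:

// Illustrative wrapper: runs the NEON kernel on the largest 16-byte multiple prefix
// and finishes the remaining (size % 16) elements with plain scalar code.
void CorrelationSumAny(const int8_t * a, const int8_t * b, size_t size, int32_t * sum)
{
    size_t aligned = size & ~size_t(15);      // largest multiple of 16 not exceeding size
    int32_t s = 0;
    if (aligned)
        CorrelationSum(a, b, 0, aligned, &s); // bStride is unused in the simplified version
    for (size_t i = aligned; i < size; ++i)   // scalar tail: same int8 * int8 -> int32 dot product
        s += int32_t(a[i]) * int32_t(b[i]);
    *sum = s;
}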
Note: this example is based on the function Simd::Neon::CorrelationSum().
You can additionally speed up the loads by prefetching:
inline int8x16_t Load(const int8_t * p)
{
#ifdef __GNUC__
    // Hint the prefetcher to fetch data a few cache lines ahead (384 bytes here).
    __builtin_prefetch(p + 384);
#endif
    return vld1q_s8(p);
}
Using prefetching can improve performance by 15-20%.
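For completeness, here is a sketch of the same sum loop using the prefetching Load() helper instead of calling vld1q_s8 directly. The function name is my own, and both the 384-byte prefetch distance and the 15-20% gain will vary with the CPU and working-set size:

// Illustrative variant of CorrelationSum that loads through the prefetching Load() helper.
void CorrelationSumPrefetch(const int8_t * a, const int8_t * b, size_t size, int32_t * sum)
{
    int32x4_t sums = vdupq_n_s32(0);
    for (size_t i = 0; i < size; i += 16)     // size must be a multiple of 16, as above
        sums = vaddq_s32(sums, Correlation(Load(a + i), Load(b + i)));
    *sum = vgetq_lane_s32(sums, 0) + vgetq_lane_s32(sums, 1)
         + vgetq_lane_s32(sums, 2) + vgetq_lane_s32(sums, 3);
}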