Skip to content

Commit c69cb8f

Browse files
yaozhongxiaomattkretz
yaozhongxiao
authored and committed
[simd_neon] Refine S_to_bits to improve "find_first/last_set" for NEON
The find_first_set and find_last_set methods are not optimal for NEON; they can be improved by synthesizing the reduction with horizontal adds (vaddv), which reduces the generated assembly code. In the following case, vaddvq_s16 generates 2 instructions whereas vpadd_s16 generates 4 instructions:
```
# vaddvq_s16
vaddvq_s16(__asint);
// addv h0, v1.8h
// smov w1, v0.h[0]
# vpadd_s16
vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero), __zero)[0]
// addp v1.8h,v1.8h,v2.8h
// addp v1.8h,v1.8h,v2.8h
// addp v1.8h,v1.8h,v2.8h
// smov w1, v1.h[0]
#
```
1 parent eb06f48 commit c69cb8f

File tree

1 file changed

+14
-3
lines changed

1 file changed

+14
-3
lines changed

experimental/bits/simd_neon.h

+14-3
Original file line number | Diff line number | Diff line change
@@ -314,8 +314,7 @@ struct _MaskImplNeonMixin
314314
});
315315
__asint &= __bitsel;
316316
#ifdef __aarch64__
317-
return vpaddq_s16(vpaddq_s16(vpaddq_s16(__asint, __zero), __zero),
318-
__zero)[0];
317+
return vaddvq_s16(__asint);
319318
#else
320319
return vpadd_s16(
321320
vpadd_s16(vpadd_s16(__lo64(__asint), __hi64(__asint)), __zero),
@@ -331,7 +330,7 @@ struct _MaskImplNeonMixin
331330
});
332331
__asint &= __bitsel;
333332
#ifdef __aarch64__
334-
return vpaddq_s32(vpaddq_s32(__asint, __zero), __zero)[0];
333+
return vaddvq_s32(__asint);
335334
#else
336335
return vpadd_s32(vpadd_s32(__lo64(__asint), __hi64(__asint)),
337336
__zero)[0];
@@ -354,8 +353,12 @@ struct _MaskImplNeonMixin
354353
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
355354
});
356355
__asint &= __bitsel;
356+
#ifdef __aarch64__
357+
return vaddv_s8(__asint);
358+
#else
357359
return vpadd_s8(vpadd_s8(vpadd_s8(__asint, __zero), __zero),
358360
__zero)[0];
361+
#endif
359362
}
360363
else if constexpr (sizeof(_Tp) == 2)
361364
{
@@ -365,12 +368,20 @@ struct _MaskImplNeonMixin
365368
return static_cast<_I>(__i < _Np ? 1 << __i : 0);
366369
});
367370
__asint &= __bitsel;
371+
#ifdef __aarch64__
372+
return vaddv_s16(__asint);
373+
#else
368374
return vpadd_s16(vpadd_s16(__asint, __zero), __zero)[0];
375+
#endif
369376
}
370377
else if constexpr (sizeof(_Tp) == 4)
371378
{
372379
__asint &= __make_vector<_I>(0x1, 0x2);
380+
#ifdef __aarch64__
381+
return vaddv_s32(__asint);
382+
#else
373383
return vpadd_s32(__asint, __zero)[0];
384+
#endif
374385
}
375386
else
376387
__assert_unreachable<_Tp>();

0 commit comments

Comments
 (0)