Skip to content

Commit 329c890

Browse files
committed
perf : Do only one iteration of refinement for FP16 inv
FP16 has only 10 explicitly stored mantissa bits, so a single iteration of Newton-Raphson refinement should be precise enough. Numerical results showed that the worst-case error was 1 ULP with either one or two iterations of Newton-Raphson refinement.

Signed-off-by: Ben Niu <[email protected]>
1 parent 531a496 commit 329c890

File tree

2 files changed

+0
-3
lines changed

2 files changed

+0
-3
lines changed

src/core/NEON/NEMath.inl

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -634,15 +634,13 @@ inline float16x4_t vinv_f16(float16x4_t x)
634634
{
635635
float16x4_t recip = vrecpe_f16(x);
636636
recip = vmul_f16(vrecps_f16(x, recip), recip);
637-
recip = vmul_f16(vrecps_f16(x, recip), recip);
638637
return recip;
639638
}
640639

641640
inline float16x8_t vinvq_f16(float16x8_t x)
642641
{
643642
float16x8_t recip = vrecpeq_f16(x);
644643
recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
645-
recip = vmulq_f16(vrecpsq_f16(x, recip), recip);
646644
return recip;
647645
}
648646

src/core/NEON/SVEMath.inl

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,6 @@ inline svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x)
8282
{
8383
auto recip = svrecpe_f16(x);
8484
recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip);
85-
recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip);
8685
return recip;
8786
}
8887

0 commit comments

Comments
 (0)