Do one iteration of Newton-Raphson refinement for FP16 inv

mcfi · mcfi · commit 85cad90ffa4e · 2025-10-01T14:35:05.000-07:00
FP16 has only 10 explicitly stored mantissa bits, so one
iteration of refinement should be precise enough. Numerical
results showed that both one and two iterations of
Newton-Raphson refinement had a worst-case error of 1 ULP.

Signed-off-by: Ben Niu <benniu@meta.com>
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
@@ -634,15 +634,13 @@ inline float16x4_t vinv_f16(float16x4_t x)
 {
     float16x4_t recip = vrecpe_f16(x);
     recip             = vmul_f16(vrecps_f16(x, recip), recip);
-    recip             = vmul_f16(vrecps_f16(x, recip), recip);
     return recip;
 }
 
 inline float16x8_t vinvq_f16(float16x8_t x)
 {
     float16x8_t recip = vrecpeq_f16(x);
     recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
-    recip             = vmulq_f16(vrecpsq_f16(x, recip), recip);
     return recip;
 }
 
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
@@ -82,7 +82,6 @@ inline svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x)
 {
     auto recip = svrecpe_f16(x);
     recip      = svmul_f16_z(pg, svrecps_f16(x, recip), recip);
-    recip      = svmul_f16_z(pg, svrecps_f16(x, recip), recip);
     return recip;
 }
 

Original file line number	Diff line number	Diff line change
`@@ -634,15 +634,13 @@ inline float16x4_t vinv_f16(float16x4_t x)`
`634`	`634`	`{`
`635`	`635`	`float16x4_t recip = vrecpe_f16(x);`
`636`	`636`	`recip = vmul_f16(vrecps_f16(x, recip), recip);`
`637`		`- recip = vmul_f16(vrecps_f16(x, recip), recip);`
`638`	`637`	`return recip;`
`639`	`638`	`}`
`640`	`639`
`641`	`640`	`inline float16x8_t vinvq_f16(float16x8_t x)`
`642`	`641`	`{`
`643`	`642`	`float16x8_t recip = vrecpeq_f16(x);`
`644`	`643`	`recip = vmulq_f16(vrecpsq_f16(x, recip), recip);`
`645`		`- recip = vmulq_f16(vrecpsq_f16(x, recip), recip);`
`646`	`644`	`return recip;`
`647`	`645`	`}`
`648`	`646`
Original file line number	Diff line number	Diff line change
`@@ -82,7 +82,6 @@ inline svfloat16_t svinv_f16_z(svbool_t pg, svfloat16_t x)`
`82`	`82`	`{`
`83`	`83`	`auto recip = svrecpe_f16(x);`
`84`	`84`	`recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip);`
`85`		`- recip = svmul_f16_z(pg, svrecps_f16(x, recip), recip);`
`86`	`85`	`return recip;`
`87`	`86`	`}`
`88`	`87`