Skip to content

Commit

Permalink
Clear possible signaling NaNs in excess parts of wide words
Browse files Browse the repository at this point in the history
  • Loading branch information
HenryHRich committed Oct 16, 2024
1 parent 8fad095 commit f6102f7
Show file tree
Hide file tree
Showing 5 changed files with 47 additions and 19 deletions.
6 changes: 3 additions & 3 deletions jsrc/va.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,7 @@
// This was done with poor choice of address modes; needs to be rerun

// fz=bit0 = commutative,
// bits 1-2=incomplete argument filling: 00=none, 01=incomplete y must be filled with 0 (to avoid isub oflo), 10=incomplete x must be filled with 1 (for fdiv NaN), 11=both x & y must be filled with 0
// bits 1-2=incomplete argument filling: 00=none, 01=incomplete y must be filled with 0 (to avoid isub oflo), 10=incomplete x must be filled with 1.0, y with 0 (for fdiv NaN), 11=both x & y must be filled with 0
// bit3 set for int-to-float on x, bit4 for int-to-float on y
// bit5 set to suppress loop-unrolling
// bit6 set for bool-to-int on x, bit7 for bool-to-int on y
Expand Down Expand Up @@ -310,7 +310,7 @@
endmask = _mm256_loadu_si256((__m256i*)(validitymask+NPAR-alignreq)); /* mask for 00=0000, 01=1000, 10=1100, 11=1110, 100=1111 */ \
if(xy&2)LDBID(xx,XAD(fz),fz,0x8,0x40,0x100) if(xy&1)LDBID(yy,YAD(fz),fz,0x10,0x80,0x200) \
if(xy&2)CVTBID(xx,xx,fz,0x8,0x40,0x100) if(xy&1)CVTBID(yy,yy,fz,0x10,0x80,0x200) \
if((fz)&2)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch */ \
if((fz)&6)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch, also in PRIMMASK */ \
if((fz)&4)if((fz)&2)xx=_mm256_and_pd(_mm256_castsi256_pd(endmask),xx); else xx=_mm256_blendv_pd(_mm256_broadcast_sd(&zone.real),xx,_mm256_castsi256_pd(endmask)); \
zzop; _mm256_maskstore_pd(z, endmask, zz); PRMINCR(xy,fz,alignreq) /* need mask store in case inplace */ \
if((xy)==2)yy=xysav; if((xy)==1)xx=xysav; /* restore repeated arg, which would have been masked */ \
Expand All @@ -333,7 +333,7 @@

#define PRMMASK(zzop,xy,fz) if(xy&2)LDBID(xx,XAD(fz),fz,0x8,0x40,0x100) if(xy&1)LDBID(yy,YAD(fz),fz,0x10,0x80,0x200) \
if(xy&2)CVTBID(xx,xx,fz,0x8,0x40,0x100) if(xy&1)CVTBID(yy,yy,fz,0x10,0x80,0x200) \
if((fz)&2)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch */ \
if((fz)&6)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch */ \
if((fz)&4)if((fz)&2)xx=_mm256_and_pd(_mm256_castsi256_pd(endmask),xx); else xx=_mm256_blendv_pd(_mm256_broadcast_sd(&zone.real),xx,_mm256_castsi256_pd(endmask)); \
zzop; _mm256_maskstore_pd(z, endmask, zz);

Expand Down
2 changes: 1 addition & 1 deletion jsrc/vcompsc.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
#include "ve.h"
#include "vcomp.h"

// fz=bit0 = commutative, bit1 set if incomplete y must be filled with 0 (to avoid isub oflo), bit2 set if incomplete x must be filled with i (for fdiv NaN),
// fz=bit0 = commutative, bit1 set if incomplete y must be filled with 0 (to avoid isub oflo), (bit 2 not needed for NaN since all fp compares are quiet)
// bit3 set for int-to-float on x, bit4 for int-to-float on y
// bit5 set to suppress loop-unrolling
// bit6 set for bool-to-int on x, bit7 for bool-to-int on y
Expand Down
30 changes: 15 additions & 15 deletions jsrc/ve.c
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,10 @@ AHDR2(tymesDB,PVD,PVD,PVB){R tymesBD(m^1^SGNTO0(m),z,y,x,n,jt);} // does tymesB

#if C_AVX2 || EMU_AVX2
#define ORIGMN I nsav=n; if(msav>=0){n=m; m=nsav; nsav^=-(msav&1);} // restore old-style mn from modified mn and msav. If msav<0, OK already, otherwise swap & transfer flag to nsav
primop256(plusDD,0x3,NAN0;,zz=_mm256_add_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
primop256(minusDD,0x2,NAN0;,zz=_mm256_sub_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
primop256(minDD,1,,zz=_mm256_min_pd(xx,yy),R EVOK;)
primop256(maxDD,1,,zz=_mm256_max_pd(xx,yy),R EVOK;)
primop256(plusDD,0x7,NAN0;,zz=_mm256_add_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
primop256(minusDD,0x6,NAN0;,zz=_mm256_sub_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
primop256(minDD,0x7,,zz=_mm256_min_pd(xx,yy),R EVOK;)
primop256(maxDD,0x7,,zz=_mm256_max_pd(xx,yy),R EVOK;)
primop256(tymesDD,0x7,D *zsav=z;NAN0;,zz=_mm256_mul_pd(xx,yy),if(unlikely(NANTEST)){z=zsav; DQ(n*m, if(_isnan(*(D*)z))*(D*)z=0.0; z=(C*)z+SZD;)} R EVOK;)
// div can fail from 0%0 (which we turn to 0) or inf%inf (which we fail)
primop256(divDD,4,D *zsav=z; D *xsav=x; D *ysav=y; I msav=m;NAN0;,zz=_mm256_div_pd(xx,yy),
Expand All @@ -57,9 +57,9 @@ APFX( minEE, E,E,E, MINE,,R EVOK;)
APFX( maxEE, E,E,E, MAXE,,R EVOK;)

#if C_AVX2 || EMU_AVX2
primop256(plusDI,0x10,,zz=_mm256_add_pd(xx,yy),R EVOK;)
primop256(plusDI,0x16,,zz=_mm256_add_pd(xx,yy),R EVOK;)
// commutative primop256(plusID,8,,zz=_mm256_add_pd(xx,yy),R EVOK;)
primop256(plusDB,0xa00,,zz=_mm256_add_pd(xx,yy),R EVOK;)
primop256(plusDB,0xa06,,zz=_mm256_add_pd(xx,yy),R EVOK;)
// commutative primop256(plusBD,0x900,,zz=_mm256_add_pd(xx,yy),R EVOK;)
primop256(plusII,0x23,__m256d oflo=_mm256_setzero_pd();,
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_andnot_pd(_mm256_xor_pd(xx,yy),_mm256_xor_pd(xx,zz)));,
Expand All @@ -70,12 +70,12 @@ primop256(plusII,0x23,__m256d oflo=_mm256_setzero_pd();,
primop256(plusIB,0x882,__m256d oflo=_mm256_setzero_pd();,
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_castsi256_pd(_mm256_cmpgt_epi32(_mm256_castpd_si256(xx),_mm256_castpd_si256(zz))));,
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPPLUSIB:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
primop256(plusBB,0xc0,,
primop256(plusBB,0xc1,,
zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));,R EVOK;)
primop256(minusDI,0x10,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusID,0x8,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusDB,0xa00,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusBD,0x100,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusDI,0x16,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusID,0xa,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusDB,0xa06,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusBD,0x102,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
primop256(minusII,0x22,__m256d oflo=_mm256_setzero_pd();,
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_and_pd(_mm256_xor_pd(xx,yy),_mm256_xor_pd(xx,zz)));,
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSII:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
Expand All @@ -87,10 +87,10 @@ primop256(minusIB,0x882,__m256d oflo=_mm256_setzero_pd();,
R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSIB:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
primop256(minusBB,0xc0,,
zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));,R EVOK;)
primop256(minDI,0x10,,zz=_mm256_min_pd(xx,yy),R EVOK;)
primop256(minDI,0x16,,zz=_mm256_min_pd(xx,yy),R EVOK;)
// commutative primop256(minID,8,,zz=_mm256_min_pd(xx,yy),R EVOK;)
// commutative primop256(minBD,0x100,,zz=_mm256_min_pd(xx,yy),R EVOK;)
primop256(minDB,0x200,,zz=_mm256_min_pd(xx,yy),R EVOK;)
primop256(minDB,0x206,,zz=_mm256_min_pd(xx,yy),R EVOK;)
#if C_AVX512
primop256(minII,1,, zz=_mm256_castsi256_pd(_mm256_min_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))),R EVOK;)
primop256(minIB,0x80,,zz=_mm256_castsi256_pd(_mm256_min_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))),R EVOK;)
Expand All @@ -103,8 +103,8 @@ primop256(minII,1,,
primop256(minIB,0x80,,
zz=_mm256_castsi256_pd(BLENDVI(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy),_mm256_cmpgt_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)))); ,R EVOK;)
#endif
primop256(maxDI,0x10,,zz=_mm256_max_pd(xx,yy),R EVOK;)
primop256(maxDB,0x200,,zz=_mm256_max_pd(xx,yy),R EVOK;)
primop256(maxDI,0x16,,zz=_mm256_max_pd(xx,yy),R EVOK;)
primop256(maxDB,0x206,,zz=_mm256_max_pd(xx,yy),R EVOK;)
// commutative primop256(maxID,8,,zz=_mm256_max_pd(xx,yy),R EVOK;)
// commutative primop256(maxBD,0x100,,zz=_mm256_max_pd(xx,yy),R EVOK;)
#if C_AVX512
Expand Down
10 changes: 10 additions & 0 deletions test/g13x.ijs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@ x return.
x + y
)

1: 0 : 0 NB. would be nice to verify that this crash is fixed, but 13!:0 takes us up to immed level
13!:0 (129)
foo =: 3 : 'y'
13!:3 'foo *:*;'
foo 7
13!:0 (129) NB. Used to crash
13!:0 (0)
)

foo =: 0$0 NB. will accumulate results executed during suspension
13!:0 ] 0
13!:3 'goo 0'
Expand Down Expand Up @@ -308,6 +317,7 @@ NB. stops ---------------------------------------------------------------
1 [ 13!:3 ''



NB. error text ----------------------------------------------------------

sum=: +/
Expand Down
18 changes: 18 additions & 0 deletions test/gnan.ijs
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,24 @@ _1 1 = 7 o. __ _
'NaN error' -: x: etx _.
'NaN error' -: x: etx 3 4 _.

NB. end-of-buffer garbage is masked out
0. = -~ 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 96 }. 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 96 }. 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 96 }. 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
0. = -~ 96 }. 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: +~ 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 96 }. 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 96 }. 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 96 }. 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
1: = +~ 96 }. 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN


4!:55 ;:'d det f i inf ninf pinf t t1 x y zero znan'

Expand Down

0 comments on commit f6102f7

Please sign in to comment.