diff --git a/jsrc/va.h b/jsrc/va.h
index 5bbeb097b..89fa4ab4a 100644
--- a/jsrc/va.h
+++ b/jsrc/va.h
@@ -279,7 +279,7 @@
 // This was done with poor choice of address modes; needs to be rerun
 // fz=bit0 = commutative,
-// bits 1-2=incomplete argument filling: 00=none, 01=incomplete y must be filled with 0 (to avoid isub oflo), 10=incomplete x must be filled with 1 (for fdiv NaN), 11=both x & y must be filled with 0
+// bits 1-2=incomplete argument filling: 00=none, 01=incomplete y must be filled with 0 (to avoid isub oflo), 10=incomplete x must be filled with 1.0, y with 0 (for fdiv NaN), 11=both x & y must be filled with 0
 // bit3 set for int-to-float on x, bit4 for int-to-float on y
 // bit5 set to suppress loop-unrolling
 // bit6 set for bool-to-int on x, bit7 for bool-to-int on y
@@ -310,7 +310,7 @@
 endmask = _mm256_loadu_si256((__m256i*)(validitymask+NPAR-alignreq)); /* mask for 00=0000, 01=1000, 10=1100, 11=1110, 100=1111 */ \
 if(xy&2)LDBID(xx,XAD(fz),fz,0x8,0x40,0x100) if(xy&1)LDBID(yy,YAD(fz),fz,0x10,0x80,0x200) \
 if(xy&2)CVTBID(xx,xx,fz,0x8,0x40,0x100) if(xy&1)CVTBID(yy,yy,fz,0x10,0x80,0x200) \
-if((fz)&2)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch */ \
+if((fz)&6)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch, also in PRIMMASK */ \
 if((fz)&4)if((fz)&2)xx=_mm256_and_pd(_mm256_castsi256_pd(endmask),xx); else xx=_mm256_blendv_pd(_mm256_broadcast_sd(&zone.real),xx,_mm256_castsi256_pd(endmask)); \
 zzop; _mm256_maskstore_pd(z, endmask, zz); PRMINCR(xy,fz,alignreq) /* need mask store in case inplace */ \
 if((xy)==2)yy=xysav; if((xy)==1)xx=xysav; /* restore repeated arg, which would have been masked */ \
@@ -333,7 +333,7 @@
 #define PRMMASK(zzop,xy,fz) if(xy&2)LDBID(xx,XAD(fz),fz,0x8,0x40,0x100) if(xy&1)LDBID(yy,YAD(fz),fz,0x10,0x80,0x200) \
 if(xy&2)CVTBID(xx,xx,fz,0x8,0x40,0x100) if(xy&1)CVTBID(yy,yy,fz,0x10,0x80,0x200) \
-if((fz)&2)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch */ \
+if((fz)&6)yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy); /* init incomplete fetch */ \
 if((fz)&4)if((fz)&2)xx=_mm256_and_pd(_mm256_castsi256_pd(endmask),xx); else xx=_mm256_blendv_pd(_mm256_broadcast_sd(&zone.real),xx,_mm256_castsi256_pd(endmask)); \
 zzop; _mm256_maskstore_pd(z, endmask, zz);
diff --git a/jsrc/vcompsc.c b/jsrc/vcompsc.c
index baa550a99..1b047f2c2 100644
--- a/jsrc/vcompsc.c
+++ b/jsrc/vcompsc.c
@@ -20,7 +20,7 @@
 #include "ve.h"
 #include "vcomp.h"
-// fz=bit0 = commutative, bit1 set if incomplete y must be filled with 0 (to avoid isub oflo), bit2 set if incomplete x must be filled with i (for fdiv NaN),
+// fz=bit0 = commutative, bit1 set if incomplete y must be filled with 0 (to avoid isub oflo), (bit 2 not needed for NaN since all fp compares are quiet)
 // bit3 set for int-to-float on x, bit4 for int-to-float on y
 // bit5 set to suppress loop-unrolling
 // bit6 set for bool-to-int on x, bit7 for bool-to-int on y
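Note on the two va.h hunks: widening the test from bit 1 alone ((fz)&2) to bits 1-2 ((fz)&6) means any op that requests incomplete-argument filling gets the lanes past the end of its data zeroed before the arithmetic runs, so signaling-NaN bit patterns sitting in the buffer tail cannot set the FP invalid flag; the vcompsc.c comment records that the compares do not need the x fill because fp compares are quiet. A minimal stand-alone sketch of the masking idea, assuming AVX2; the buffer, mask, and values here are invented for illustration and this is not the J macro:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

int main(void){
 /* two valid doubles followed by "end-of-buffer" garbage that happens to be a signaling NaN */
 double buf[4]={1.0,2.0,0.0,0.0};
 unsigned long long snan=0xfff0000000000002ULL;       /* same pattern the new gnan.ijs tests build */
 memcpy(&buf[2],&snan,8); memcpy(&buf[3],&snan,8);

 /* validity mask: all ones in lanes 0-1 (real data), zero in lanes 2-3 (past the end) */
 __m256i endmask=_mm256_set_epi64x(0,0,-1LL,-1LL);

 __m256d yy=_mm256_loadu_pd(buf);                     /* full-width fetch; tail lanes are garbage      */
 yy=_mm256_and_pd(_mm256_castsi256_pd(endmask),yy);   /* the fz bits 1-2 path: zero the invalid lanes  */
 __m256d zz=_mm256_sub_pd(yy,yy);                     /* y-y; masked lanes compute 0-0, no NaN appears */

 double out[4]; _mm256_storeu_pd(out,zz);
 printf("%g %g %g %g\n",out[0],out[1],out[2],out[3]); /* 0 0 0 0 */
 return 0;
}

Without the _mm256_and_pd step the subtract would consume the signaling-NaN lanes, and NANTEST would report a spurious NaN error for lanes that the masked store never writes anyway.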
diff --git a/jsrc/ve.c b/jsrc/ve.c
index 7fbf598ee..5d638396d 100644
--- a/jsrc/ve.c
+++ b/jsrc/ve.c
@@ -33,10 +33,10 @@
 AHDR2(tymesDB,PVD,PVD,PVB){R tymesBD(m^1^SGNTO0(m),z,y,x,n,jt);} // does tymesB
 #if C_AVX2 || EMU_AVX2
 #define ORIGMN I nsav=n; if(msav>=0){n=m; m=nsav; nsav^=-(msav&1);} // restore old-style mn from modified mn and msav. If msav<0, OK already, otherwise swap & transfer flag to nsav
-primop256(plusDD,0x3,NAN0;,zz=_mm256_add_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
-primop256(minusDD,0x2,NAN0;,zz=_mm256_sub_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
-primop256(minDD,1,,zz=_mm256_min_pd(xx,yy),R EVOK;)
-primop256(maxDD,1,,zz=_mm256_max_pd(xx,yy),R EVOK;)
+primop256(plusDD,0x7,NAN0;,zz=_mm256_add_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
+primop256(minusDD,0x6,NAN0;,zz=_mm256_sub_pd(xx,yy),R NANTEST?EVNAN:EVOK;)
+primop256(minDD,0x7,,zz=_mm256_min_pd(xx,yy),R EVOK;)
+primop256(maxDD,0x7,,zz=_mm256_max_pd(xx,yy),R EVOK;)
 primop256(tymesDD,0x7,D *zsav=z;NAN0;,zz=_mm256_mul_pd(xx,yy),if(unlikely(NANTEST)){z=zsav; DQ(n*m, if(_isnan(*(D*)z))*(D*)z=0.0; z=(C*)z+SZD;)} R EVOK;)
 // div can fail from 0%0 (which we turn to 0) or inf%inf (which we fail)
 primop256(divDD,4,D *zsav=z; D *xsav=x; D *ysav=y; I msav=m;NAN0;,zz=_mm256_div_pd(xx,yy),
@@ -57,9 +57,9 @@
 APFX( minEE, E,E,E, MINE,,R EVOK;)
 APFX( maxEE, E,E,E, MAXE,,R EVOK;)
 #if C_AVX2 || EMU_AVX2
-primop256(plusDI,0x10,,zz=_mm256_add_pd(xx,yy),R EVOK;)
+primop256(plusDI,0x16,,zz=_mm256_add_pd(xx,yy),R EVOK;) // commutative
 primop256(plusID,8,,zz=_mm256_add_pd(xx,yy),R EVOK;)
-primop256(plusDB,0xa00,,zz=_mm256_add_pd(xx,yy),R EVOK;)
+primop256(plusDB,0xa06,,zz=_mm256_add_pd(xx,yy),R EVOK;) // commutative
 primop256(plusBD,0x900,,zz=_mm256_add_pd(xx,yy),R EVOK;)
 primop256(plusII,0x23,__m256d oflo=_mm256_setzero_pd();,
  zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_andnot_pd(_mm256_xor_pd(xx,yy),_mm256_xor_pd(xx,zz)));,
@@ -70,12 +70,12 @@ primop256(plusII,0x23,__m256d oflo=_mm256_setzero_pd();,
 primop256(plusIB,0x882,__m256d oflo=_mm256_setzero_pd();,
  zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_castsi256_pd(_mm256_cmpgt_epi32(_mm256_castpd_si256(xx),_mm256_castpd_si256(zz))));,
  R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPPLUSIB:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
-primop256(plusBB,0xc0,,
+primop256(plusBB,0xc1,,
  zz=_mm256_castsi256_pd(_mm256_add_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));,R EVOK;)
-primop256(minusDI,0x10,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
-primop256(minusID,0x8,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
-primop256(minusDB,0xa00,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
-primop256(minusBD,0x100,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
+primop256(minusDI,0x16,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
+primop256(minusID,0xa,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
+primop256(minusDB,0xa06,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
+primop256(minusBD,0x102,,zz=_mm256_sub_pd(xx,yy),R EVOK;)
 primop256(minusII,0x22,__m256d oflo=_mm256_setzero_pd();,
  zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))); oflo=_mm256_or_pd(oflo,_mm256_and_pd(_mm256_xor_pd(xx,yy),_mm256_xor_pd(xx,zz)));,
  R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSII:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
@@ -87,10 +87,10 @@ primop256(minusIB,0x882,__m256d oflo=_mm256_setzero_pd();,
  R !_mm256_testc_pd(_mm256_setzero_pd(),oflo)?EWOVIP+EWOVIPMINUSIB:EVOK;) // ~0 & oflo, testc if =0 which means no overflow
 primop256(minusBB,0xc0,,
  zz=_mm256_castsi256_pd(_mm256_sub_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)));,R EVOK;)
-primop256(minDI,0x10,,zz=_mm256_min_pd(xx,yy),R EVOK;)
+primop256(minDI,0x16,,zz=_mm256_min_pd(xx,yy),R EVOK;) // commutative
 primop256(minID,8,,zz=_mm256_min_pd(xx,yy),R EVOK;) // commutative
 primop256(minBD,0x100,,zz=_mm256_min_pd(xx,yy),R EVOK;)
-primop256(minDB,0x200,,zz=_mm256_min_pd(xx,yy),R EVOK;)
+primop256(minDB,0x206,,zz=_mm256_min_pd(xx,yy),R EVOK;)
 #if C_AVX512
 primop256(minII,1,, zz=_mm256_castsi256_pd(_mm256_min_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))),R EVOK;)
 primop256(minIB,0x80,,zz=_mm256_castsi256_pd(_mm256_min_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy))),R EVOK;)
@@ -103,8 +103,8 @@ primop256(minII,1,,
 primop256(minIB,0x80,,
  zz=_mm256_castsi256_pd(BLENDVI(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy),_mm256_cmpgt_epi64(_mm256_castpd_si256(xx),_mm256_castpd_si256(yy)))); ,R EVOK;)
 #endif
-primop256(maxDI,0x10,,zz=_mm256_max_pd(xx,yy),R EVOK;)
-primop256(maxDB,0x200,,zz=_mm256_max_pd(xx,yy),R EVOK;)
+primop256(maxDI,0x16,,zz=_mm256_max_pd(xx,yy),R EVOK;)
+primop256(maxDB,0x206,,zz=_mm256_max_pd(xx,yy),R EVOK;) // commutative
 primop256(maxID,8,,zz=_mm256_max_pd(xx,yy),R EVOK;) // commutative
 primop256(maxBD,0x100,,zz=_mm256_max_pd(xx,yy),R EVOK;)
 #if C_AVX512
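For orientation, here is an illustrative decoder for the fz words passed to the primop256 lines above, following the bit layout documented in the va.h comment; every name in this snippet is invented for this note and does not exist in jsrc. Bits 8 and up (0x100, 0x200, 0x800, ...) appear only on the boolean-argument variants and are not covered by that comment, so they are left out here.

#include <stdio.h>

/* Invented names; the bit layout is the one documented in va.h above. */
#define FZ_COMMUTATIVE(fz) ((fz)&0x1)       /* bit 0 */
#define FZ_FILL(fz)        (((fz)>>1)&0x3)  /* bits 1-2: 0=none, 1=zero incomplete y,
                                               2=fill incomplete x with 1.0 and y with 0, 3=zero both */
#define FZ_ITOF_X(fz)      (((fz)>>3)&1)    /* bit 3: int-to-float on x */
#define FZ_ITOF_Y(fz)      (((fz)>>4)&1)    /* bit 4: int-to-float on y */
#define FZ_NOUNROLL(fz)    (((fz)>>5)&1)    /* bit 5: suppress loop-unrolling */
#define FZ_BTOI_X(fz)      (((fz)>>6)&1)    /* bit 6: bool-to-int on x */
#define FZ_BTOI_Y(fz)      (((fz)>>7)&1)    /* bit 7: bool-to-int on y */

int main(void){
 /* a few of the values changed above: plusDD 0x3->0x7, plusDI 0x10->0x16, plusBB 0xc0->0xc1 */
 unsigned v[]={0x3,0x7,0x10,0x16,0xc0,0xc1};
 for(int i=0;i<6;i++)
  printf("fz=0x%03x commut=%u fill=%u itofX=%u itofY=%u btoiX=%u btoiY=%u\n",
         v[i],FZ_COMMUTATIVE(v[i]),FZ_FILL(v[i]),FZ_ITOF_X(v[i]),FZ_ITOF_Y(v[i]),FZ_BTOI_X(v[i]),FZ_BTOI_Y(v[i]));
 return 0;
}

Decoded this way, most of the ve.c changes only add fill bits for the incomplete tail fetch (plusDD 0x3 to 0x7, plusDI 0x10 to 0x16, minusBD 0x100 to 0x102, and so on); plusBB 0xc0 to 0xc1 instead turns on the commutative bit.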
diff --git a/test/g13x.ijs b/test/g13x.ijs
index 26600f844..da7287f9a 100644
--- a/test/g13x.ijs
+++ b/test/g13x.ijs
@@ -24,6 +24,15 @@
 x return.
 x + y
 )
+1: 0 : 0 NB. would be nice to verify that this crash is fixed, but 13!:0 takes us up to immed level
+13!:0 (129)
+foo =: 3 : 'y'
+13!:3 'foo *:*;'
+foo 7
+13!:0 (129) NB. Used to crash
+13!:0 (0)
+)
+
 foo =: 0$0 NB. will accumulate results executed during suspension
 13!:0 ] 0
 13!:3 'goo 0'
@@ -308,6 +317,7 @@
 NB. stops ---------------------------------------------------------------

 1 [ 13!:3 ''
+
 NB. error text ----------------------------------------------------------

 sum=: +/
diff --git a/test/gnan.ijs b/test/gnan.ijs
index 4dfb4472e..e71d797eb 100644
--- a/test/gnan.ijs
+++ b/test/gnan.ijs
@@ -287,6 +287,24 @@
 _1 1 = 7 o. __ _

 'NaN error' -: x: etx _.
 'NaN error' -: x: etx 3 4 _.
+NB. end-of-buffer garbage is masked out
+0. = -~ 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 96 }. 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 96 }. 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 96 }. 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+0. = -~ 96 }. 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: +~ 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 96 }. 99 {. 1. (i. 99)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 96 }. 98 {. 1. (i. 98)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 96 }. 97 {. 1. (i. 97)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+1: = +~ 96 }. 96 {. 1. (i. 96)} 100 $ _2 (3!:5) 2 0 0 0 0 0 240 255 { a. NB. signaling NaN
+
 4!:55 ;:'d det f i inf ninf pinf t t1 x y zero znan'
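The new gnan.ijs lines build their fill value from the bytes 2 0 0 0 0 0 240 255, which _2 (3!:5) reinterprets as one 8-byte float. A small C check of what that pattern is, illustrative only and assuming a little-endian build:

#include <stdio.h>
#include <string.h>
#include <math.h>

int main(void){
 unsigned char b[8]={2,0,0,0,0,0,240,255};  /* the bytes selected by "2 0 0 0 0 0 240 255 { a." */
 unsigned long long u; double d;
 memcpy(&u,b,8); memcpy(&d,b,8);            /* little-endian reinterpretation, as _2 (3!:5) does on x86 */
 /* 0xfff0000000000002: exponent all ones, mantissa nonzero, quiet bit (bit 51) clear => signaling NaN */
 printf("bits=%016llx isnan=%d\n",u,isnan(d));
 return 0;
}

Each test then overwrites the live cells with 1. and takes 96 to 99 items, so the only signaling NaN left sits past the end of the data being operated on; with the masking above, -~ and +~ run over it without raising NaN error.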