add faster [<>].@(2&^.) for floats

jsoftware · Aug 22, 2024 · 50a6b21 · 50a6b21
1 parent 103bf41
commit 50a6b21
Show file tree

Hide file tree

Showing 4 changed files with 87 additions and 32 deletions.
diff --git a/jsrc/ca.c b/jsrc/ca.c
@@ -22,36 +22,54 @@ static DF1(jtonf1){PROLOG(0021);DECLFG;I flag=sv->flag,m=jt->xmode;
 }
 
 // <.@(2&^.) monad
-static DF1(jtintfloorlog2) {A z;
+static DF1(jtintfloorlog2) {
  ARGCHK1(w);
- I *wv = IAV(w); I wn=AN(w);
- GATV(z, INT, wn, AR(w), AS(w)); I *zv = IAV(z); // zv points to allocated result area
- if (INT & AT(w)) {
-  // When d >= 1 then <.@(2&^.) d is equal to the position of the highest 1-bit in d (CTLZI).  failover to by hand if d<=0
-  DO(wn, I d=wv[i]; if(unlikely(d<=0))R onf1(w,self); zv[i]=CTLZI(d);)
-// obsolete   for (I i = wn - 1; i >= 0; --i, ++wv, ++zv) { // loop over all atoms of w
-// obsolete    I d = *wv;
-// obsolete    if (d > 0) *zv = CTLZI(d); // When d >= 1 then <.@(2&^.) d is equal to the position of the highest 1-bit in d (CTLZI).
-// obsolete    else R onf1(w, self); // When d < 1 then stop and reexecute by hand for whole w.
-// obsolete   }
- } else R onf1(w, self);
- R z;
+ if ((INT | FL) & AT(w)) { // Special cases only for integers and floats ([<>].@f for extended integers is handled in vx.c).
+  A z; I wn = AN(w); I wr = AR(w); I *ws = AS(w); I *wv = IAV(w); // GATV documentation advises using variables for arguments.
+  GATV(z, INT, wn, wr, ws); I *zv = IAV(z); // zv points to allocated result area.
+  if (INT & AT(w)) { // Case with integers.
+   DO(wn,
+    I d = wv[i]; if (unlikely(d <= 0)) R onf1(w, self); // Failover to by hand if d <= 0.
+    zv[i] = CTLZI(d); // When d >= 1 then <.@(2&^.) d is equal to the position of the highest 1-bit in d (CTLZI).
+   )
+  } else { // Case with floats.
+   DO(wn,
+    UI8 d = *(UI8*)&wv[i];
+    // Infs and NaN are the only floats that have all 1-bits in exponent. D_EXP_MSK represents +Inf, but is also used to check NaN and denorms.
+    if (unlikely(wv[i] <= 0 || (d & D_EXP_MSK) == D_EXP_MSK)) R onf1(w, self); // Failover to by hand if d <= 0 or d is +Inf or NaN.
+    zv[i] = ((~d) & D_EXP_MSK) == D_EXP_MSK // Denorms are the only floats (except 0 which was eliminated earlier) that have all 0-bits in exponent.
+     ? CTLZI(d) - D_MANT_BITS_N + D_EXP_MIN // Denorm. Position of the highest 1-bit in d (which is in fraction part) is found with CTLZI.
+     : ((d & D_EXP_MSK) >> D_MANT_BITS_N) - D_EXP_MAX; // Normal. Result is exponent.
+   )
+  }
+  R z;
+ }
+ R onf1(w, self);
 }
 
 // >.@(2&^.) monad
-static DF1(jtintceillog2) {A z; I wn, wr, *ws, *wv;
+static DF1(jtintceillog2) { // Similar to the above case with floor (almost rewritten, but inner loops differ).
  ARGCHK1(w);
- if (INT & AT(w)) {
-  wn = AN(w); wr = AR(w); ws = AS(w); wv = IAV(w);
+ if ((INT | FL) & AT(w)) {
+  A z; I wn = AN(w); I wr = AR(w); I *ws = AS(w); I *wv = IAV(w);
   GATV(z, INT, wn, wr, ws); I *zv = IAV(z);
-  for (I i = wn - 1; i >= 0; --i, wv++, zv++) {
-   I d = *wv;
-   // Similar to the above case with floor, but when d is not a power of 2 then add 1. Only powers of 2 have exactly one 1-bit, so count 1-bits in d (CT1I).
-   if (d > 0) *zv = CTLZI(d) + (CT1I(d) > 1);
-   else R onf1(w, self);
+  if (INT & AT(w)) { // Case with integers.
+   DO(wn,
+    I d = wv[i]; if (unlikely(d <= 0)) R onf1(w, self);
+    zv[i] = CTLZI(d) + (CT1I(d) > 1); // When d is not a power of 2 then add 1. Only powers of 2 have exactly one 1-bit, so count 1-bits in d (CT1I).
+   )
+  } else { // Case with floats.
+   DO(wn,
+    UI8 d = *(UI8*)&wv[i];
+    if (unlikely(wv[i] <= 0 || (d & D_EXP_MSK) == D_EXP_MSK)) R onf1(w, self);
+    zv[i] = ((~d) & D_EXP_MSK) == D_EXP_MSK
+     ? CTLZI(d) + (CT1I(d) > 1) - D_MANT_BITS_N + D_EXP_MIN // Denorm.
+     : ((d & D_EXP_MSK) >> D_MANT_BITS_N) - D_EXP_MAX + (CT1I(d & D_MANT_MSK) > 1); // Normal.
+   )
   }
- } else R onf1(w, self);
- R z;
+  R z;
+ }
+ R onf1(w, self);
 }
 
 // <.@ >.@ and the like, dyad 

diff --git a/jsrc/j.h b/jsrc/j.h
@@ -349,6 +349,21 @@ static inline omp_int_t omp_get_num_threads() { return 1;}
 #endif
 #endif
 
+/* IEEE 754 constants that are not defined in float.h */
+// D_MANT_BITS_N is number of bits of mantissa in bit representation.
+// Its value is DBL_MANT_DIG - 1, because the first digit does not occur in bit represention (always 1 for normalized numbers).
+#define D_MANT_BITS_N   52
+// D_EXP_BITS_N is number of bits of exponent in bit representation.
+#define D_EXP_BITS_N    11
+// 1 - D_EXP_MAX <= exponent <= D_EXP_MAX. In bit representation sum of exponent and D_EXP_MAX is stored which is positive. D_EXP_MAX = DBL_MAX_EXP - 1.
+#define D_EXP_MAX       1023
+// D_EXP_MIN = 1 - D_EXP_MAX
+#define D_EXP_MIN       -1022
+// Bit mask of exponent is (UI8)((1 << D_EXP_BITS_N) - 1) << D_MANT_BITS_N. This is also bit representation of +Inf.
+#define D_EXP_MSK       0x7ff0000000000000LL
+#define D_MANT_MSK      0x000fffffffffffffLL
+
+
 #if SY_64
 #define IMAX            9223372036854775807LL
 #define IMAXPRIME       9223372036854775783LL
@@ -2350,15 +2365,15 @@ if(unlikely(!_mm256_testz_pd(sgnbit,mantis0))){  /* if mantissa exactly 0, must
 #else
 #define CTLZI(w) (63-__builtin_clzll((UI)(w)))
 #endif
-#define CT1I(w) __builtin_popcountll((UI)w)
+#define CT1I(w) __builtin_popcountll((UI)(w))
 #else
 #define CTTZI(w) __builtin_ctzl((UINT)(w))
 #if (!C_AVX2) && (defined(__i386__) || defined(__x86_64__) || defined(_M_X64) || defined(_M_IX86))
 #define CTLZI(w) (31-__builtin_clzl((UI)(w)|1))  // use this if we fear garbage if w=0
 #else
 #define CTLZI(w) (31-__builtin_clzl((UI)(w)))
 #endif
-#define CT1I(w) __builtin_popcountl((UI)w)
+#define CT1I(w) __builtin_popcountl((UI)(w))
 #endif
 #define CTTZZ(w) ((w)==0 ? 32 : CTTZ(w))   // use this if we need 32 when w=0
 #endif

diff --git a/jsrc/vm.c b/jsrc/vm.c
@@ -18,12 +18,12 @@ D jtintpow(J jt, D x, I n) { D r;
   // n = n > INT_MAX ? INT_MAX : (n < INT_MIN ? INT_MIN : n);
   // R ldexp(1, n);
   UI8 ires; // where bit-pattern of result will be built
-  if(likely(BETWEENC(n, -1022, 1023))) ires = (UI8)(n + 1023) << 52; // normal range
-  else if (n > 1023) ires = (UI8)((1 << 11) - 1) << 52; // infinity
+  if(likely(BETWEENC(n, D_EXP_MIN, D_EXP_MAX))) ires = (UI8)(n + D_EXP_MAX) << D_MANT_BITS_N; // normal range
+  else if (n > D_EXP_MAX) ires = D_EXP_MSK; // infinity
   else {
-   // denorm and zero. n = -1023 is the largest denorm
-   n = MAX(n, -1023 - 53);
-   ires = ((UI8)1 << 51) >> (-1023 - n);
+   // denorm and zero. n = D_EXP_MIN - 1 is the largest denorm
+   n = MAX(n, D_EXP_MIN - 1 - D_MANT_BITS_N);
+   ires = ((UI8)1 << (D_MANT_BITS_N - 1)) >> (D_EXP_MIN - 1 - n);
   }
   R *(D*)&ires;  // return bit-pattern as a D
  }

diff --git a/test/g200.ijs b/test/g200.ijs
@@ -190,10 +190,32 @@ _ = 2^1e15  NB. on 32-bit, this is a float constant
 
 NB. <.@(2&^.) and >.@(2&^.) ---------------------------------------------
 
-(100 $ 0) -: (>.@(2&^.) - <.@(2&^.)) 2 <.@^ i. 100
-(100 $ 0) -: (>.@(2&^.) - <.@(2&^.)) 2 ^ i. 100
+(30 $ 0) -: (>.@(2&^.) - <.@(2&^.)) 2 <.@^ i. 30
+(30 $ 0) -: (>.@(2&^.) - <.@(2&^.)) 2 ^ i. 30
 0 1 1 2 2 2 2 3 3 3 -: <.@(2&^.) >: i. 10
 0 1 2 2 3 3 3 3 4 4 -: >.@(2&^.) >: i. 10
+49 53 56 -: <.@(2&^.) 1e15 1e16 1e17
+50 54 57 -: >.@(2&^.) 1e15 1e16 1e17
+0 1 1 2 3 3 6 -: <.@(2&^.) 1.1 2.3 3.4 7.8 8 8.0635 100.1921
+1 2 2 3 3 4 7 -: >.@(2&^.) 1.1 2.3 3.4 7.8 8 8.0635 100.1921
+(i. 10) -: <.@(2&^.) 2 ^ i. 10
+(i. 10) -: >.@(2&^.) 2 ^ i. 10
+(- i. 10) -: <.@(2&^.) 2 ^ - i. 10
+(- i. 10) -: >.@(2&^.) 2 ^ - i. 10
+1021 1022 1023 -: <.@(2&^.) 2 ^ 1021 1022 1023
+1021 1022 1023 -: >.@(2&^.) 2 ^ 1021 1022 1023
+_1072 _1073 _1074 -: <.@(2&^.) 2 ^ _1072 _1073 _1074
+_1072 _1073 _1074 -: >.@(2&^.) 2 ^ _1072 _1073 _1074
+_1050 -: <.@(2&^.) +/ ] 2 ^ _1050 _1054 _1070
+_1049 -: >.@(2&^.) +/ ] 2 ^ _1050 _1054 _1070
+'complex' -: datatype <.@(2&^.) _.
+'complex' -: datatype >.@(2&^.) _.
+_ -: <.@(2&^.) _
+_ -: >.@(2&^.) _
+__ -: <.@(2&^.) 0
+__ -: >.@(2&^.) 0
+1 1 0j4 -: <.@(2&^.) 2 3.4 _1
+1 2 0j5 -: >.@(2&^.) 2 3.4 _1
 
 NB. 0^0 -----------------------------------------------------------------