From 3562ee66ef1223959f962bdb2b98fc1772a22623 Mon Sep 17 00:00:00 2001 From: "Joshua E. Hill" Date: Fri, 21 Jul 2023 23:35:08 +0000 Subject: [PATCH] * Add support for large files. (> 2GB). * Requires divsufsort64 (commonly included in the relevant package). --- cpp/Makefile | 2 +- cpp/shared/lrs_test.h | 369 +++++++++++++++++++++++++++++++++++++----- cpp/shared/utils.h | 20 ++- 3 files changed, 350 insertions(+), 41 deletions(-) diff --git a/cpp/Makefile b/cpp/Makefile index 377320d..bc0605d 100644 --- a/cpp/Makefile +++ b/cpp/Makefile @@ -11,7 +11,7 @@ endif #CXXFLAGS = -g -Wno-padded -Wno-disabled-macro-expansion -Wno-gnu-statement-expression -Wno-bad-function-cast -fopenmp -O1 -fsanitize=address -fsanitize=undefined -fno-omit-frame-pointer -fdenormal-fp-math=ieee -msse2 -march=native -I/usr/include/jsoncpp #static analysis in clang using #scan-build-15 --use-c++=/usr/bin/clang++-15 make -LIB = -lbz2 -lpthread -ldivsufsort +LIB = -lbz2 -lpthread -ldivsufsort -ldivsufsort64 COND_LIB = -lmpfr -lgmp SHARED_LIB = -ljsoncpp -lcrypto INC = diff --git a/cpp/shared/lrs_test.h b/cpp/shared/lrs_test.h index c3fb783..ebed329 100644 --- a/cpp/shared/lrs_test.h +++ b/cpp/shared/lrs_test.h @@ -1,17 +1,19 @@ #pragma once #include "utils.h" +#include #include +#include #define SAINDEX_MAX INT32_MAX +#define SAINDEX64_MAX INT64_MAX //Using the Kasai (et al.) O(n) time "13n space" algorithm. //"Linear-Time Longest-Common-Prefix Computation in Suffix Arrays and Its Applications", by Kasai, Lee, Arimura, Arikawa, and Park //https://doi.org/10.1007/3-540-48194-X_17 //http://web.cs.iastate.edu/~cs548/references/linear_lcp.pdf //The default implementation uses 4 byte indexes -//Note that indexes should be signed, so the next natural size is int64_t -static void sa2lcp(const uint8_t text[], long int n, const vector &sa, vector &lcp) { +static void sa2lcp32(const uint8_t text[], long int n, const vector &sa, vector &lcp) { saidx_t h; vector rank(n+1,-1); @@ -44,11 +46,46 @@ static void sa2lcp(const uint8_t text[], long int n, const vector &sa, } } -void calcSALCP(const uint8_t text[], long int n, vector &sa, vector &lcp) { +//Using the Kasai (et al.) O(n) time "25n space" algorithm (with 64-bit indicies) +static void sa2lcp64(const uint8_t text[], long int n, const vector &sa, vector &lcp) { + saidx64_t h; + vector rank(n+1,-1); + + assert(n>1); + + lcp[0] = -1; + lcp[1] = 0; + + // compute rank = sa^{-1} + for(saidx64_t i=0; i<=(saidx64_t)n; i++) { + rank[sa[i]] = i; + } + + // traverse suffixes in rank order + h=0; + + for(saidx64_t i=0; i<(saidx64_t)n; i++) { + saidx64_t k = rank[i]; // rank of s[i ... n-1] + if(k>1) { + saidx64_t j = sa[k-1]; // predecessor of s[i ... n-1] + while((i+h<(saidx64_t)n) && (j+h<(saidx64_t)n) && (text[i+h]==text[j+h])) { + h++; + } + + lcp[k] = h; + } + if(h>0) { + h--; + } + } +} + + +void calcSALCP32(const uint8_t text[], long int n, vector &sa, vector &lcp) { int32_t res; - assert(n < INT32_MAX); //This is the default type, but it can be compiled to use 64 bit indexes (and then this should be INT64_MAX) - assert(n > 0); //This is the default type, but it can be compiled to use 64 bit indexes (and then this should be INT64_MAX) + assert(n < SAINDEX_MAX); + assert(n > 0); assert(sa.size() == (size_t)(n+1)); assert(sa.size() == (size_t)(n+1)); @@ -56,14 +93,28 @@ void calcSALCP(const uint8_t text[], long int n, vector &sa, vector &sa, vector &lcp) { + int32_t res; + + assert(n < SAINDEX64_MAX); + assert(n > 0); + assert(sa.size() == (size_t)(n+1)); + assert(sa.size() == (size_t)(n+1)); + + sa[0] = (saidx64_t)n; + + res=divsufsort64((const sauchar_t *)text, (saidx64_t *)(sa.data()+1), (saidx64_t)n); + assert(res==0); + sa2lcp64(text, n, sa, lcp); +} /* Based on the algorithm outlined by Aaron Kaufer * This is described here: * http://www.untruth.org/~josh/sp80090b/Kaufer%20Further%20Improvements%20for%20SP%20800-90B%20Tuple%20Counts.pdf */ -void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double &lrs_res, const int verbose, const char *label) { +void SAalgs32(const uint8_t text[], long int n, int k, double &t_tuple_res, double &lrs_res, const int verbose, const char *label) { vector sa(n+1, -1); //each value is at most n-1 vector L(n+2, -1); //each value is at most n-1 @@ -73,15 +124,15 @@ void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double long int j; //0 <= j <= v+1 <= n saidx_t t; //Takes values from LCP array. 0 <= t < n - double Pmax; - double pu; + long double Pmax; + long double pu; assert(n>0); assert(k>0); - assert(n <= SAINDEX_MAX - 1); + assert(n < SAINDEX_MAX); assert((UINT64_MAX / (uint64_t)n) >= ((uint64_t)n+1U)); // (mult assert) - calcSALCP(text, n, sa, L); + calcSALCP32(text, n, sa, L); //to conform with Kaufer's conventions L.erase(L.begin()); @@ -176,10 +227,8 @@ void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double /*Calculate the various Pmax[i] values. We need not save the actual values, only the largest*/ Pmax = -1.0; for(long int i=1; i Pmax) { Pmax=curPMax; @@ -187,20 +236,20 @@ void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double } //finalize the t-tuple estimate - if(Pmax > 0.0) { + if(Pmax > 0.0L) { //We encountered a valid t, so we can run the test - pu = Pmax + ZALPHA*sqrt(Pmax*(1.0 - Pmax)/((double)(n - 1))); - if(pu > 1.0) { - pu = 1.0; + pu = Pmax + ZALPHA_L*sqrtl(Pmax*(1.0L - Pmax)/((long double)(n - 1))); + if(pu > 1.0L) { + pu = 1.0L; } - t_tuple_res = -log2(pu); + t_tuple_res = (double)-log2l(pu); - if(verbose == 2) printf("%s t-Tuple Estimate: t = %ld, p-hat_max = %.17g, p_u = %.17g\n", label, u-1, Pmax, pu); + if(verbose == 2) printf("%s t-Tuple Estimate: t = %ld, p-hat_max = %.22Lg, p_u = %.22Lg\n", label, u-1, Pmax, pu); else if(verbose == 3) { printf("%s t-Tuple Estimate: t = %ld\n", label, u-1); - printf("%s t-Tuple Estimate: p-hat_max = %.17g\n", label, Pmax); - printf("%s t-Tuple Estimate: p_u = %.17g\n", label, pu); + printf("%s t-Tuple Estimate: p-hat_max = %.22Lg\n", label, Pmax); + printf("%s t-Tuple Estimate: p_u = %.22Lg\n", label, pu); printf("%s t-Tuple Estimate: min entropy = %.17g\n", label, t_tuple_res); } @@ -253,10 +302,230 @@ void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double for(long int i=u; i<=v; i++) { // Note, the assert marked "(mult assert)" tells us that the multiplication won't rollover. uint64_t choices = (((uint64_t)n-(uint64_t)i)*((uint64_t)n-(uint64_t)i+1U))>>1; - double curP = ((double)S[i]) / (double)choices; - double curPMax = pow(curP, 1.0/((double)i)); - //fprintf(stderr, "LRS Estimate: P_%ld = %.17g ( %zu / %zu )\n", i, curP, S[i], ((n-i)*(n-i+1))>>1); - //fprintf(stderr, "LRS Estimate: P_{max,%ld} = %.17g\n", i, curPMax); + long double curP = ((long double)S[i]) / (long double)choices; + long double curPMax = pow(curP, 1.0/((long double)i)); + + if(Pmax < curPMax) { + Pmax = curPMax; + } + } + + pu = Pmax + ZALPHA_L*sqrtl(Pmax*(1.0L - Pmax)/((long double)(n - 1))); + if(pu > 1.0L) { + pu = 1.0L; + } + + lrs_res = (double)-log2l(pu); + + if(verbose == 2) printf("%s LRS Estimate: u = %ld, v = %ld, p-hat = %.17Lg, p_u = %.17Lg\n", label, u, v, Pmax, pu); + else if(verbose == 3) { + printf("%s LRS Estimate: u = %ld\n", label, u); + printf("%s LRS Estimate: v = %ld\n", label, v); + printf("%s LRS Estimate: p-hat = %.22Lg\n", label, Pmax); + printf("%s LRS Estimate: p_u = %.22Lg\n", label, pu); + printf("%s LRS Estimate: min entropy = %.17g\n", label, lrs_res); + } + + } else { + printf("LRS Estimate: v sa(n+1, -1); //each value is at most n-1 + vector L(n+2, -1); //each value is at most n-1 + + long int u; //The length of a string: 1 <= u <= v+1 <= n + long int v; //The length of the LRS. 1 <= v <= n-1 + long int c; //contains a count from A + long int j; //0 <= j <= v+1 <= n + saidx64_t t; //Takes values from LCP array. 0 <= t < n + + long double Pmax; + long double pu; + + assert(n>0); + assert(k>0); + assert(n <= SAINDEX64_MAX - 1); + assert((UINT128_MAX / (uint128_t)n) >= ((uint128_t)n+1U)); // (mult assert) + + calcSALCP64(text, n, sa, L); + + //to conform with Kaufer's conventions + L.erase(L.begin()); + L[n] = 0; + assert(L[0] == 0); + + //Find the length of the LRS, v + + v=0; + for(long int i=0; iv) v = L[i]; + } + + assert((v>0) && (v < n)); + //v is now set correctly + + vector Q(v+1, 1); //Contains an accumulation of positive counts 1 <= Q[i] <= n + vector A(v+2, 0); //Contains an accumulation of positive counts 0 <= A[i] <= n + //I is set from L + //Note that I is indexed by at most j+1. + // j takes the value 0 to v+1 (so I[v+2] should work) + //(I stores indices of A, and there are only v+2 of these) + vector I(v+3, 0); //each value is most 0 <= I[i] <= v+2 <= n+1 + + j = 0; + for(long int i = 1; i <= n; i++) { + c = 0; + //Note L[0] is already verified to be 0 + assert(L[i] >= 0); + + if(L[i] < L[i-1]) { + t = L[i-1]; + assert(j>0); + j--; + assert(j<=v); + + while(t > L[i]) { + assert((t>0) && (t <= v)); + if((j > 0) && (I[j] == t)) { + /* update count for non-zero entry of A */ + A[I[j]] += A[I[j+1]]; + A[I[j+1]] = 0; + j--; + } + + if(Q[t] >= A[I[j+1]]+1) { + /* + * Q[t] is at least as large as current count, + * and since Q[t] <= Q[t-1] <= ... <= Q[1], + * there is no need to check zero entries of A + * until next non-zero entry + */ + if(j > 0) { + /* skip to next non-zero entry of A */ + t = I[j]; + } else { + /* + * no more non-zero entries of A, + * so skip to L[i] (terminate while loop) + */ + t = L[i]; + } + } else { + /* update Q[t] with new maximum count */ + Q[t--] = A[I[j+1]]+1; + } + } + + c = A[I[j+1]]; /* store carry over count */ + A[I[j+1]] = 0; + } + + if(L[i] > 0) { + if((j < 1) || (I[j] < L[i])) { + /* insert index of next non-zero entry of A */ + assert(j= 35); u++); + + assert(u > 0); + assert(((u == v+1) || ((u <= v) && (Q[u] < 35)))); + assert(((u == 1) || (Q[u-1] >= 35))); + //u is now correctly set. + + //at this point, Q is completely calculated. + /*Calculate the various Pmax[i] values. We need not save the actual values, only the largest*/ + Pmax = -1.0; + for(long int i=1; i Pmax) { + Pmax=curPMax; + } + } + + //finalize the t-tuple estimate + if(Pmax > 0.0) { + //We encountered a valid t, so we can run the test + pu = Pmax + ZALPHA_L*sqrtl(Pmax*(1.0L - Pmax)/((long double)(n - 1))); + if(pu > 1.0) { + pu = 1.0; + } + + t_tuple_res = (double)-log2l(pu); + + if(verbose == 2) printf("%s t-Tuple Estimate: t = %ld, p-hat_max = %.22Lg, p_u = %.22Lg\n", label, u-1, Pmax, pu); + else if(verbose == 3) { + printf("%s t-Tuple Estimate: t = %ld\n", label, u-1); + printf("%s t-Tuple Estimate: p-hat_max = %.22Lg\n", label, Pmax); + printf("%s t-Tuple Estimate: p_u = %.22Lg\n", label, pu); + printf("%s t-Tuple Estimate: min entropy = %.17g\n", label, t_tuple_res); + } + + } else { + if(verbose > 1) printf("t-Tuple Estimate: No strings are repeated 35 times. t-Tuple estimate failed.\n"); + t_tuple_res = -1.0; + } + + //calculate the LRS estimate + if(v>=u) { + vector S(v+1, 0); + memset(A.data(), 0, sizeof(saidx64_t)*((size_t)v+2)); + + for(long int i = 1; i <= n; i++) { + if((L[i-1] >= u) && (L[i] < L[i-1])) { + saidx64_t b = L[i]; + + //A[u] stores the number of u-length tuples. We need to eventually clear down to A[u]=A[b+1]. + if(b < u) b = u-1; + + for(t = L[i-1]; t > b; t--) { + uint128_t priorS; + uint128_t choices; + A[t] += A[t+1]; + A[t+1] = 0; + + assert(A[t] >= 0); + // update sum + // Note that (c choose 2) is just (c)(c-1)/2. + // The numerator of this expression is necessarily even + // Dividing an even quantity by 2 is the same as right shifting by 1. + // Check for overflows when adding to S[t] (unsigned 64 bit integers) + // Note, A[t] <= n, so the assert marked "(mult assert)" tells us that the multiplication won't rollover. + + priorS = S[t]; + choices = ((((uint128_t)(A[t]+1) * (uint128_t)(A[t]))))>>1; + S[t] = priorS + choices; + assert(S[t] >= priorS); + } + + if(b >= u) A[b] += A[b+1]; /* carry over count for t = L[i] */ + A[b+1] = 0; + } + + if(L[i] >= u) A[L[i]]++; /* update count for t = L[i] */ + } + + //We now have a complete set of numerators in S + Pmax = 0.0; + for(long int i=u; i<=v; i++) { + // Note, the assert marked "(mult assert)" tells us that the multiplication won't rollover. + uint128_t choices = (((uint128_t)n-(uint128_t)i)*((uint128_t)n-(uint128_t)i+1U))>>1; + long double curP = ((long double)S[i]) / (long double)choices; + long double curPMax = powl(curP, 1.0L/((long double)i)); if(Pmax < curPMax) { @@ -264,19 +533,19 @@ void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double } } - pu = Pmax + ZALPHA*sqrt(Pmax*(1.0 - Pmax)/((double)(n - 1))); + pu = Pmax + ZALPHA_L*sqrtl(Pmax*(1.0L - Pmax)/((long double)(n - 1))); if(pu > 1.0) { pu = 1.0; } lrs_res = -log2(pu); - if(verbose == 2) printf("%s LRS Estimate: u = %ld, v = %ld, p-hat = %.17g, p_u = %.17g\n", label, u, v, Pmax, pu); + if(verbose == 2) printf("%s LRS Estimate: u = %ld, v = %ld, p-hat = %.22Lg, p_u = %.22Lg\n", label, u, v, Pmax, pu); else if(verbose == 3) { printf("%s LRS Estimate: u = %ld\n", label, u); printf("%s LRS Estimate: v = %ld\n", label, v); - printf("%s LRS Estimate: p-hat = %.17g\n", label, Pmax); - printf("%s LRS Estimate: p_u = %.17g\n", label, pu); + printf("%s LRS Estimate: p-hat = %.22Lg\n", label, Pmax); + printf("%s LRS Estimate: p_u = %.22Lg\n", label, pu); printf("%s LRS Estimate: min entropy = %.17g\n", label, lrs_res); } @@ -289,12 +558,20 @@ void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double return; } -int len_LRS(const uint8_t text[], const int sample_size){ +void SAalgs(const uint8_t text[], long int n, int k, double &t_tuple_res, double &lrs_res, const int verbose, const char *label) { + if(n sa(sample_size+1, -1); vector lcp(sample_size+1, -1); saidx_t lrs_len = -1; - calcSALCP(text, sample_size, sa, lcp); + calcSALCP32(text, sample_size, sa, lcp); for(saidx_t j = 0; j <= sample_size; j++) { if(lcp[j] > lrs_len) lrs_len = lcp[j]; @@ -303,6 +580,19 @@ int len_LRS(const uint8_t text[], const int sample_size){ return(lrs_len); } +long int len_LRS64(const uint8_t text[], const int sample_size){ + vector sa(sample_size+1, -1); + vector lcp(sample_size+1, -1); + saidx64_t lrs_len = -1; + + calcSALCP64(text, sample_size, sa, lcp); + + for(saidx64_t j = 0; j <= sample_size; j++) { + if(lcp[j] > lrs_len) lrs_len = lcp[j]; + } + + return(lrs_len); +} /* * --------------------------------------------- * HELPER FUNCTIONS @@ -327,7 +617,7 @@ void calc_collision_proportion(const vector &p, long double &p_col){ bool len_LRS_test(const uint8_t data[], const int L, const int k, const int verbose, const char *label) { // p_col is the probability of collision on a per-symbol basis under an IID assumption (this is related to the collision entropy). // p_col >= 1/k, which bounds this. - // Note, for SP 800-90B k<=256, so we can bound p_col >= 2^-8. + // Note, for SP 800-90B k<=256, so we can bound p_col >= 2^-8. vector p(k, 0.0); calc_proportions(data, p, L); long double p_col = 0.0; @@ -350,7 +640,12 @@ bool len_LRS_test(const uint8_t data[], const int L, const int k, const int verb assert(p_col < 1.0L); // The length of the longest repeated substring (LRS) for the supplied data is W. - int W = len_LRS(data, L); + long int W; + if(L 1) { if(verbose > 2) { printf("%s Longest Repeated Substring results: P_col = %.22Lg\n", label, p_col); - printf("%s Longest Repeated Substring results: W = %d\n", label, W); + printf("%s Longest Repeated Substring results: W = %ld\n", label, W); } else { printf("%s Longest Repeated Substring results\n", label); printf("\tP_col: %Lf\n", p_col); - printf("\tLength of LRS: %d\n", W); + printf("\tLength of LRS: %ld\n", W); } // Calculate the probability of not encountering a collision after N sets of independent pairs; diff --git a/cpp/shared/utils.h b/cpp/shared/utils.h index 1f1f6f3..d0ff98f 100644 --- a/cpp/shared/utils.h +++ b/cpp/shared/utils.h @@ -34,6 +34,18 @@ #define DBL_INFINITY __builtin_inf () #define ITERMAX 1076 #define ZALPHA 2.5758293035489008 +#define ZALPHA_L 2.575829303548900384158L + +//Make uint128_t a supported type (standard as of C23) +#ifdef __SIZEOF_INT128__ +typedef unsigned __int128 uint128_t; +typedef unsigned __int128 uint_least128_t; +# define UINT128_MAX ((uint128_t)-1) +# define UINT128_WIDTH 128 +# define UINT_LEAST128_WIDTH 128 +# define UINT_LEAST128_MAX UINT128_MAX +# define UINT128_C(N) ((uint_least128_t)+N ## WBU) +#endif //Version of the tool #define VERSION "1.1.6" @@ -51,6 +63,8 @@ struct data_t{ long blen; // number of bits in data }; + + using namespace std; //This generally performs a check for relative closeness, but (if that check would be nonsense) @@ -593,7 +607,7 @@ void seed(uint64_t *xoshiro256starstarState){ */ uint64_t randomRange64(uint64_t s, uint64_t *xoshiro256starstarState){ uint64_t x; - __uint128_t m; + uint128_t m; uint64_t l; x = xoshiro256starstar(xoshiro256starstarState); @@ -602,14 +616,14 @@ uint64_t randomRange64(uint64_t s, uint64_t *xoshiro256starstarState){ return x; } else { s++; // We want an integer in the range [0,s], not [0,s) - m = (__uint128_t)x * (__uint128_t)s; + m = (uint128_t)x * (uint128_t)s; l = (uint64_t)m; //This is m mod 2^64 if(l