flatironinstitute · DiamonDinoia · Jul 23, 2024 · Jul 16, 2024 · Jul 17, 2024 · Jul 17, 2024
diff --git a/CHANGELOG b/CHANGELOG
@@ -1,8 +1,10 @@
 List of features / changes made / release notes, in reverse chronological order.
 If not stated, FINUFFT is assumed (cuFINUFFT <=1.3 is listed separately).
 
-V 2.3.0beta (6/21/24)
+V 2.3.0beta (7/21/24)
 
+* ES kernel rescaled to max value 1, reduced horner degrees for upsampfac=1.25
+  (fixes fp32 overflow issue #454).
 * Major acceleration of spread/interp kernels using XSIMD header-only lib,
   kernel evaluation, templating by ns with AVX-width-dependent decisions.
   Up to 80% faster, dep on compiler. (Marco Barbone with help from Libin Lu).

diff --git a/devel/README b/devel/README
@@ -0,0 +1,17 @@
+Developer and experimental codes for FINUFFT
+--------------------------------------------
+
+For generating kernel coefficient codes in ../src,
+the developer must run from MATLAB the following:
+
+gen_all_horner_C_code.m          : writes C-style Horner coeffs (pre-2024)
+  (you must rerun with upsampfac=2 and 1.25)
+gen_all_horner_cpp_header.m      : writes C++ header Horner coeffs (7/2024 on)
+  (you must rerun with upsampfac=2 and 1.25)
+
+Both these call for the solve of the coeffs for each w:
+ker_ppval_coeff_mat.m
+  (which has the kernel definition in it, which must match spreadinterp.cpp)
+  The self-test for this last code is a good place to tweak the degree vs w.
+
+Barnett 7/21/24
diff --git a/devel/gen_all_horner_C_code.m b/devel/gen_all_horner_C_code.m
@@ -5,15 +5,16 @@
 % Barnett 4/23/18; now calling Ludvig's loop version from 4/25/18.
 % version including low upsampfac, 6/17/18.
 % Ludvig put in w=4n padding, 1/31/20. Mystery about why d was bigger 2/6/20.
+% lowered d for each w to match expts, esp for sig=1.25, Barnett 7/21/24
 clear
 opts = struct();
 
 ws = 2:16;
-upsampfac = 2;       % sigma (upsampling): either 2 (default) or low (eg 5/4).
+upsampfac = 1.25;     % sigma (upsampling): either 2 (default) or low (eg 5/4)
 opts.wpad = true;    % pad kernel eval to multiple of 4
 
-if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop.c','w');
-else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop.c','w');
+if upsampfac==2, fid = fopen('../src/ker_horner_allw_loop_constexpr.c','w');
+else, fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.c','w');
 end
 fwrite(fid,sprintf('// Code generated by gen_all_horner_C_code.m in finufft/devel\n'));
 fwrite(fid,sprintf('// Authors: Alex Barnett & Ludvig af Klinteberg.\n// (C) The Simons Foundation, Inc.\n'));
@@ -22,18 +23,18 @@
   if upsampfac==2    % hardwire the betas for this default case
     betaoverws = [2.20 2.26 2.38 2.30];   % matches setup_spreader
     beta = betaoverws(min(4,w-1)) * w;    % uses last entry for w>=5
-    d = w + 2 + (w<=8);                   % between 2-3 more degree than w
+    d = w + 1 + (w<=8);                   % between 1-2 more degree than w
   else               % use formulae, must match params in setup_spreader...
     gamma=0.97;                           % safety factor
     betaoverws = gamma*pi*(1-1/(2*upsampfac));  % from cutoff freq formula
     beta = betaoverws * w;
-    d = w + 1 + (w<=8);                   % less, since beta smaller, smoother
+    d = ceil(0.6*w+2.2);                 % from ker_ppval_coeff_mat expts
   end
   str = gen_ker_horner_loop_C_code(w,d,beta,opts);
   if j==1                                % write switch statement
-    fwrite(fid,sprintf('  if (w==%d) {\n',w));
+    fwrite(fid,sprintf('  if constexpr(w==%d) {\n',w));
   else
-    fwrite(fid,sprintf('  } else if (w==%d) {\n',w));
+    fwrite(fid,sprintf('  } else if constexpr(w==%d) {\n',w));
   end
   for i=1:numel(str); fwrite(fid,['    ',str{i}]); end
 end

diff --git a/devel/gen_all_horner_cpp_header.m b/devel/gen_all_horner_cpp_header.m
@@ -0,0 +1,60 @@
+% Script to make C++ header for looped Horner eval of kernels of all widths,
+% for a particular opts.upsampfac (user to hand-choose below).
+% C++ version (now writes .h), uses constexpr to switch by width w.
+
+% Barnett 4/23/18; now calling Ludvig's loop version from 4/25/18.
+% version including low upsampfac, 6/17/18.
+% Ludvig put in w=4n padding, 1/31/20. Mystery about why d was bigger 2/6/20.
+% C++ header using constexpr of Barbone, replacing *_C_code.m. Barnett 7/16/24.
+clear
+opts = struct();
+
+ws = 2:16;           % list of widths (the driver, rather than toloerances)
+upsampfac = 1.25;    % sigma (upsampling): either 2 (default) or low (eg 5/4)
+
+if upsampfac==2
+  fid = fopen('../src/ker_horner_allw_loop_constexpr.h','w');
+  get_nc_code = 'w + 2 + (w <= 8)';   % must be C++ expression for nc=d+1, not d
+elseif upsampfac==1.25
+  fid = fopen('../src/ker_lowupsampfac_horner_allw_loop_constexpr.h','w');
+  get_nc_code = 'std::ceil(0.6*w + 3.2)';  % must be C++ code for nc=d+1, not d
+end
+fwrite(fid,sprintf('// Header of static arrays of monomial coeffs of spreading kernel function in each\n'));
+fwrite(fid,sprintf('// fine-grid interval. Generated by gen_all_horner_cpp_header.m in finufft/devel\n'));
+fwrite(fid,sprintf('// Authors: Alex Barnett, Ludvig af Klinteberg, Marco Barbone & Libin Lu.\n// (C) 2018--2024 The Simons Foundation, Inc.\n'));
+fwrite(fid,sprintf('#include <array>\n\n'));
+
+usf_tag = sprintf('%d',100*upsampfac);  % follow Barbone convention: 200 or 125
+fwrite(fid,sprintf('template<uint8_t w> constexpr auto nc%s() noexcept { return %s; }\n\n', usf_tag, get_nc_code));
+fwrite(fid,sprintf('template<class T, uint8_t w>\nconstexpr std::array<std::array<T, w>, nc%s<w>()> get_horner_coeffs_%s() noexcept {\n',usf_tag,usf_tag));
+fwrite(fid,sprintf('    constexpr auto nc = nc%s<w>();\n',usf_tag));
+
+for j=1:numel(ws)
+  w = ws(j)
+  if upsampfac==2    % hardwire the betas for this default case
+    betaoverws = [2.20 2.26 2.38 2.30];   % must match setup_spreader
+    beta = betaoverws(min(4,w-1)) * w;    % uses last entry for w>=5
+    d = w + 1 + (w<=8);                   % between 2-3 more degree than w
+  elseif upsampfac==1.25  % use formulae, must match params in setup_spreader
+    gamma=0.97;                           % safety factor
+    betaoverws = gamma*pi*(1-1/(2*upsampfac));  % from cutoff freq formula
+    beta = betaoverws * w;
+    d = ceil(0.6*w+2.2);                  % less, since beta smaller, smoother
+  end
+
+  str = gen_ker_horner_loop_cpp_code(w,d,beta,opts);  % code strings for this w
+
+  if j==1                                % write switch statement
+    fwrite(fid,sprintf('    if constexpr (w==%d) {\n',w));
+  else
+    fwrite(fid,sprintf('    } else if constexpr (w==%d) {\n',w));
+  end
+  for i=1:numel(str); fwrite(fid,['    ',str{i}]); end   % format 4 extra spaces
+end
+
+% handle bad w at compile time...
+fwrite(fid,sprintf('    } else {\n'));
+fwrite(fid,sprintf('      static_assert(w >= %d, "w must be >= %d");\n',ws(1),ws(1)));
+fwrite(fid,sprintf('      static_assert(w <= %d, "w must be <= %d");\n',ws(end),ws(end)));
+fwrite(fid,sprintf('      return {};\n    }\n};\n'));    % close all brackets
+fclose(fid);
diff --git a/devel/gen_ker_horner_loop_C_code.m b/devel/gen_ker_horner_loop_C_code.m
@@ -6,7 +6,7 @@
 % Inputs:
 %  w = integer kernel width in grid points, eg 10
 %  d = poly degree to keep, eg 13
-%  beta = kernel parameter, around 2.3*w
+%  beta = kernel parameter, around 2.3*w (for upsampfac=2)
 %  opts - optional struct, with fields:
 %         wpad - if true, pad the number of kernel eval (segments) to w=4n
 %                for SIMD speed, esp. w/ GCC<=5.4
@@ -23,18 +23,19 @@
 % (Horner can't be vectorized in the degree direction; Estrin was no faster.)
 
 % Ludvig af Klinteberg 4/25/18, based on Barnett 4/23/18. Ludvig wpad 1/31/20.
+% Barnett fixed bug where degree was d-1 not d throughout, 7/21/24.
 if nargin==0, test_gen_ker_horner_loop_C_code; return; end
 if nargin<4, o=[]; end
 
 C = ker_ppval_coeff_mat(w,d,be,o);
-str = cell(d+1,1);
+str = cell(d+2,1);
 if isfield(o,'wpad') && o.wpad
   width = 4*ceil(w/4);
   C = [C zeros(size(C,1),width-w)];    % pad coeffs w/ 0, up to multiple of 4
 else
   width = w;
 end
-for n=1:d                  % loop over poly coeff powers
+for n=1:d+1                 % loop over poly coeff powers
   s = sprintf('FLT c%d[] = {%.16E',n-1, C(n,1));
   for i=2:width            % loop over segments
     s = sprintf('%s, %.16E', s, C(n,i));      
@@ -43,17 +44,17 @@
 end
 
 s = sprintf('for (int i=0; i<%d; i++) ker[i] = ',width);
-for n=1:d-1
+for n=1:d
   s = [s sprintf('c%d[i] + z*(',n-1)];   % (n-1)th coeff for i'th segment
 end
-s = [s sprintf('c%d[i]',d-1)];
-for n=1:d-1, s = [s sprintf(')')]; end  % close all parens
+s = [s sprintf('c%d[i]',d)];
+for n=1:d, s = [s sprintf(')')]; end  % close all parens
 s = [s sprintf(';\n')];          % terminate the C line, CR
-str{d+1} = s;
+str{d+2} = s;
 
 %%%%%%%%
 function test_gen_ker_horner_loop_C_code  % writes C code to file, doesn't test
-w=13; d=16;           % pick a single kernel width and degree to write code for
+w=13; d=15;           % pick a single kernel width and degree to write code for
 %w=7; d=11;
 %w=2; d=5;
 beta=2.3*w;

diff --git a/devel/gen_ker_horner_loop_cpp_code.m b/devel/gen_ker_horner_loop_cpp_code.m
@@ -0,0 +1,67 @@
+function str = gen_ker_horner_loop_cpp_code(w,d,be,o)
+% GEN_KER_HORNER_LOOP_CPP_CODE  Write C++ for piecewise poly coeffs of kernel
+%
+% str = gen_ker_horner_loop_cpp_code(w,d,be,o)
+%
+% Inputs:
+%  w = integer kernel width in grid points, eg 10
+%  d = poly degree to keep, eg 13.  (will give # coeffs nc=d+1)
+%  beta = kernel parameter, around 2.3*w
+%  opts - optional struct, with fields: [none for now].
+%
+% Outputs:
+%  str = cell array of C++ code strings to define (d+1)*w static coeffs array,
+%        as in each w-specific block in ../src/ker_horner_allw_loop_constexpr.h
+%
+% Also see: KER_PPVAL_COEFF_MAT, FIG_SPEED_KER_PPVAL (which tests acc too)
+%           GEN_KER_HORNER_CPP_HEADER
+%
+% Notes:
+%
+% It exploits that there are w calls to different poly's w/ *same* z arg, writing
+% this as a loop. This allows the kernel evals at one z to be a w*nc mat-vec.
+% The xsimd code which uses this (see spreadinterp.cpp) is now too elaborate to
+% explain here.
+%
+% Changes from gen_ker_horner_loop_C_code.m:
+% i) a simple C++ style 2D array is written, with the w-direction fast, poly coeff
+%    direction slow. (It used to be a list of 1D arrays plus Horner eval code.)
+% ii) coeffs are now ordered c_d,c_{d-1},...,c_0, where degree d=nc-1. (Used
+%    to be reversed.)
+%
+% Ideas: could try PSWF, cosh kernel ES variant, Reinecke tweaked-power ES
+%  variant, etc..
+
+% Ludvig af Klinteberg 4/25/18, based on Barnett 4/23/18. Ludvig wpad 1/31/20.
+% Barnett redo for Barbone templated arrays, no wpad, 7/26/24.
+
+if nargin==0, test_gen_ker_horner_loop_cpp_code; return; end
+if nargin<4, o=[]; end
+
+C = ker_ppval_coeff_mat(w,d,be,o);
+str = cell(d+3,1);     % nc = d+1, plus one start and one close-paren line
+% code to open the templated array...   *** why two {{?
+str{1} = sprintf('  return std::array<std::array<T, w>, nc> {{\n');
+for n=1:d+1                  % loop over poly coeff powers 0,1,..,d
+  % implicitly loops over fine-grid interpolation intervals 1:w...
+  coeffrow = sprintf('%.16E, ', C(n,:));
+  coeffrow = coeffrow(1:end-2);   % kill trailing comma even though allowed in C++
+  str{d+3-n} = sprintf('      {%s},\n', coeffrow);    % leaves outer trailing comma
+end
+str{d+3} = sprintf('  }};\n');     % terminate the array   *** why two }}?
+
+
+%%%%%%%%
+function test_gen_ker_horner_loop_cpp_code  % writes code to file, doesn't test
+w=13; d=w+1;           % pick a single kernel width and degree to write code for
+%w=7; d=11;
+%w=2; d=5;
+beta=2.3*w;
+str = gen_ker_horner_loop_cpp_code(w,d,beta);
+% str{:}
+% check write and read to file...
+fnam = sprintf('ker_horner_w%d.cpp',w);
+fid = fopen(fnam,'w');
+for i=1:numel(str); fwrite(fid,str{i}); end
+fclose(fid);
+system(['more ' fnam])
diff --git a/devel/ker_ppval_coeff_mat.m b/devel/ker_ppval_coeff_mat.m
@@ -7,20 +7,23 @@
 %  w = integer kernel width in grid points, eg 10
 %  d = poly degree to keep, eg 13
 %  beta = kernel parameter, around 2.3*w
-%  opts - optional struct
+%  opts - optional struct [unused for now]
 % Outputs:
 %  C = (d+1)*w double-precision matrix of coeffs for ppval
 %      Each col is (c_0...c_d) in c_0 + c_1z + ... + c_dz^d where |z|<=1
 %      is the local variable in 1/w units about each grid pt.
+%
+% Notes: the self-test of this function is useful to tweak d (degree) for
+%  each w.  Also once FINUFFT CPU compiled, run test/checkallaccs.sh
 
-% Barnett 4/23/18
+% Barnett 4/23/18. Test all w, and rescale to 1 max val, 7/21/24
 if nargin==0, test_ker_ppval_coeff_mat; return; end
 if nargin<4, o=[]; end
 
-f = @(z) exp(be*sqrt(1-z.^2));    % ES kernel on [-1,1], handles complex
+f = @(z) exp(be*(sqrt(1-z.^2)-1));    % ES kernel on [-1,1], handles complex
 
 fitd = 20; % fit degree
-m = 7;     % colloc pts per wall (4m>deg)
+m = 7;     % colloc pts per wall (4m>deg). Odd is better, hits Re axis
 h = 1/w;   % size of half a grid spacing, in units where [-1,1] is kernel supp.
 C = nan(fitd+1,w);  % alloc output
 % set up collocation linear sys on a box...
@@ -38,21 +41,33 @@
 
 %%%%%%
 function test_ker_ppval_coeff_mat
-w=7; d=11;
-%w=13; d=15;
-beta=2.3*w;   % sigma=2
-%beta=1.83*w; w=7; d=9;  % sigma=5/4
-f = @(z) exp(beta*sqrt(1-z.^2));  % must match the above
+sigma = 1.25;  % upsampfac
+for w=2:16
+  if sigma==2.0          % we test the rules used in setup_spreader...
+    betaoverws = [2.20 2.26 2.38 2.30];
+    beta = betaoverws(min(4,w-1)) * w;
+    tol = 10^(1-w);    % desired tol, in setup_spreader
+    d = w+1 + (w<=8);  % hacking the degree rule so error << tol
+  elseif sigma==1.25
+    gamma = 0.97;
+    betaoverns = gamma*pi*(1-1/(2*sigma));   % actually good for any sigma
+    beta=betaoverns*w;
+    tol = exp(-pi*w*sqrt(1-1/sigma));
+    d = ceil(0.6*w+2.2);   % hacking the degree rule so error << tol
+                           % (and needs a little more acc at large w)
+  end
+  f = @(z) exp(beta*(sqrt(1-z.^2)-1));  % must match the above, to test
 
-C = ker_ppval_coeff_mat(w,d,beta);
-%C/exp(beta)  % shows that no advantage to truncating any tails...
-t = linspace(-1,1,30);   % list of offsets in 1/w units
-h = 1/w;
-maxerr = 0;   % track max error of evaluation in pts in all intervals
-for i=1:w
-  xi = -1+h*(2*i-1);      % center of the i'th expansion, in [-1,1] supp.
-  erri = max(abs(f(xi+t*h) - polyval(C(end:-1:1,i),t)));
-  erri = erri / exp(beta);     % scale to rel err to kernel peak
-  maxerr = max(erri,maxerr);
+  C = ker_ppval_coeff_mat(w,d,beta);
+  %C/exp(beta)  % shows that no advantage to truncating any tails...
+  t = linspace(-1,1,30);   % list of offsets in 1/w units
+  h = 1/w;
+  maxerr = 0;   % track max error of evaluation in pts in all intervals
+  for i=1:w
+    xi = -1+h*(2*i-1);      % center of the i'th expansion, in [-1,1] supp.
+    erri = max(abs(f(xi+t*h) - polyval(C(end:-1:1,i),t)));
+    % erri = erri / exp(beta);     % scale to rel err to kernel peak (now 1)
+    maxerr = max(erri,maxerr);
+  end
+  fprintf('w=%d\t d=%d\t tol=%.g\t maxerr=%.3g   \tmaxerr/tol=%.3g\n',w,d,tol,maxerr,maxerr/tol)
 end
-maxerr
diff --git a/matlab/test/fig_accuracy.m b/matlab/test/fig_accuracy.m
@@ -8,6 +8,8 @@
 
 tols = 10.^(-1:-1:-14);
 %tols = 1e-6;
+%tols = 10.^(-1:-1:-10); o.upsampfac=1.25;    % for lowupsampfac
+
 errs = nan*tols;
 for t=1:numel(tols)
   x = pi*(2*rand(1,M)-1);