|
| 1 | +import numpy as np |
| 2 | +from scipy.interpolate import interp1d |
| 3 | +from scipy.integrate import simps |
| 4 | +from scipy.optimize import minimize |
| 5 | +import time |
| 6 | +from CheckData import CheckData |
| 7 | +from CheckOptions import check_options |
| 8 | +from List2Mat import list_to_mat |
| 9 | +from HandleNumericsAndNan import handle_numerics_and_nan |
| 10 | +from GetMeanDense import get_mean_dense |
| 11 | +from SetOptions import set_options |
| 12 | +import sys |
| 13 | +import os |
| 14 | +sys.path.append(os.path.abspath('src')) |
| 15 | +from trapzRcpp import trapz |
| 16 | +from RcppPseudoApprox import pseudo_approx |
| 17 | +from Rcppsort import pybind_sort |
| 18 | + |
| 19 | +def WFDA(Ly, Lt, optns=None): |
| 20 | + if optns is None: |
| 21 | + optns = {} |
| 22 | + |
| 23 | + # Set default options |
| 24 | + optns.setdefault('isPWL', True) |
| 25 | + optns.setdefault('verbose', False) |
| 26 | + optns.setdefault('nknots', 2 if optns['isPWL'] else None) |
| 27 | + optns.setdefault('subsetProp', 0.50) |
| 28 | + optns.setdefault('choice', 'truncated') |
| 29 | + optns.setdefault('seed', 666) |
| 30 | + |
| 31 | + # Validate options |
| 32 | + if optns['isPWL'] and (not (1 <= optns['nknots'] <= 7)): |
| 33 | + raise ValueError("Number of knots should be between 1 and 7.") |
| 34 | + if not (0 < optns['subsetProp'] <= 1): |
| 35 | + raise ValueError("Number of proportion used should be above 0 and at most 1.") |
| 36 | + if optns['choice'] not in ['truncated', 'weighted']: |
| 37 | + raise ValueError("The estimation of warping functions can only be done by 'truncated' or 'weighted' average.") |
| 38 | + |
| 39 | + # Handle randomness |
| 40 | + np.random.seed(optns['seed']) |
| 41 | + |
| 42 | + # Assume CheckData and HandleNumericsAndNAN functions process data |
| 43 | + CheckData(Ly,Lt) |
| 44 | + inputData = handle_numerics_and_nan(Ly, Lt) |
| 45 | + Ly = inputData['Ly'] |
| 46 | + Lt = inputData['Lt'] |
| 47 | + |
| 48 | + # Check data is dense |
| 49 | + optnsFPCA = set_options(Ly, Lt, {}) |
| 50 | + if optnsFPCA['dataType'] != 'Dense': |
| 51 | + raise ValueError(f"The data has to be 'Dense' for WFDA to be relevant; the current dataType is : '{optnsFPCA['dataType']}'!") |
| 52 | + |
| 53 | + numOfCurves = len(Ly) |
| 54 | + check_options(Lt, optnsFPCA, numOfCurves) |
| 55 | + |
| 56 | + ymat = list_to_mat(Ly, Lt) |
| 57 | + N = ymat.shape[0] |
| 58 | + |
| 59 | + obsGrid = np.sort(np.unique(np.hstack(Lt))) |
| 60 | + workGrid = (obsGrid - obsGrid.min()) / (obsGrid.max() - obsGrid.min()) |
| 61 | + M = len(workGrid) |
| 62 | + |
| 63 | + # Super-norm normalization |
| 64 | + maxAtTimeT = np.max(np.abs(ymat), axis=1) |
| 65 | + ymatNormalised = ymat / maxAtTimeT[:, None] |
| 66 | + |
| 67 | + # Mean function |
| 68 | + smcObj = get_mean_dense(ymatNormalised, obsGrid, optnsFPCA) |
| 69 | + mu = smcObj['mu'] |
| 70 | + |
| 71 | + # Penalty parameter |
| 72 | + if 'lambda' not in optns: |
| 73 | + Vy = np.sqrt(np.sum([simps((ymatNormalised[i] - mu) ** 2, obsGrid) for i in range(N)]) / (N - 1)) |
| 74 | + lambda_ = Vy * 1e-4 |
| 75 | + else: |
| 76 | + lambda_ = optns['lambda'] |
| 77 | + |
| 78 | + if optns['lambda'] is None: |
| 79 | + Vy = np.sqrt(np.sum(np.apply_along_axis(lambda u: trapz(obsGrid, (u - mu) ** 2), axis=1, arr=ymatNormalised)) / (N - 1)) |
| 80 | + lambda_ = Vy * 10 ** -4 |
| 81 | + else: |
| 82 | + lambda_ = optns['lambda'] |
| 83 | + |
| 84 | + numOfKcurves = min(round(optns['subsetProp'] * (N - 1))) |
| 85 | + hikMat = np.empty((numOfKcurves, M, N)) |
| 86 | + distMat = np.empty((N, numOfKcurves)) |
| 87 | + hMat = np.empty((N, M)) |
| 88 | + hInvMat = np.empty((N, M)) |
| 89 | + alignedMat = np.empty((N, M)) |
| 90 | + |
| 91 | + def getcurveJ(tj, curvej): |
| 92 | + return pseudo_approx(workGrid, curvej, tj) |
| 93 | + |
| 94 | + def theCost(curvei, curvek, lambda_, ti, tk): |
| 95 | + return np.sum((getcurveJ(tk, curvek) - curvei) ** 2) + lambda_ * np.sum((tk - ti) ** 2) |
| 96 | + |
| 97 | + def get_hikrs(curvei, curvek, lambda_param): |
| 98 | + my_costs = [] |
| 99 | + |
| 100 | + for u in range(1, numOfKcurves ** 2 + 1): |
| 101 | + np.random.seed(u) # Set seed for reproducibility |
| 102 | + random_values = np.random.rand(m - 2) |
| 103 | + cost = theCost(curvei, curvek, lambda_, workGrid, np.concatenate(([0], pybind_sort(random_values), [1]))) |
| 104 | + my_costs.append(cost) |
| 105 | + |
| 106 | + np.random.seed(np.argmin(my_costs) + 1) # Set seed based on minimum cost index |
| 107 | + # min_cost = np.min(my_costs) |
| 108 | + |
| 109 | + return np.concatenate(([0], pybind_sort(np.random.rand(m - 2)), [1])) |
| 110 | + |
| 111 | + def get_sol(x): |
| 112 | + sorted_x = pybind_sort(x) # Sort using rcppsort |
| 113 | + y = np.concatenate(([0], sorted_x, [1])) |
| 114 | + x_seq = np.linspace(0, 1, 2 + optns['nknots']) |
| 115 | + return pseudo_approx(x_seq, y, np.linspace(0, 1, M)) |
| 116 | + |
| 117 | + def theCostOptim(x, curvei, curvek, lambda_, ti): |
| 118 | + tk = get_sol(x) |
| 119 | + return np.sum((getcurveJ(tk, curvek) - curvei) ** 2) + lambda_ * np.sum((tk - ti) ** 2) |
| 120 | + |
| 121 | + def get_hik_optim(curvei, curvek, lambda_param, minqa_avail, optns): |
| 122 | + s0 = np.linspace(0, 1, 2 + optns['nknots'])[1:(1 + optns['nknots'])] # Initial parameters |
| 123 | + bounds = [(1e-6, 1 - 1e-6)] * optns['nknots'] # Bounds for the parameters |
| 124 | + |
| 125 | + if not minqa_avail: |
| 126 | + optim_res = minimize( |
| 127 | + theCostOptim, |
| 128 | + s0, |
| 129 | + args=(curvei, curvek, lambda_param, workGrid), |
| 130 | + method='L-BFGS-B', |
| 131 | + bounds=bounds |
| 132 | + ) |
| 133 | + else: |
| 134 | + # Using scipy's optimization, adjust this if you have a specific Bobyqa implementation |
| 135 | + # You may need to find an alternative for the Bobyqa method |
| 136 | + optim_res = minimize( |
| 137 | + theCostOptim, |
| 138 | + s0, |
| 139 | + args=(curvei, curvek, lambda_param, workGrid), |
| 140 | + method='trust-constr', # Example; adjust as necessary for equivalent behavior |
| 141 | + bounds=bounds |
| 142 | + ) |
| 143 | + |
| 144 | + best_sol = get_sol(optim_res.x) # Adjust M as needed or pass it directly |
| 145 | + return best_sol |
| 146 | + |
| 147 | + start_time = time.time() |
| 148 | + |
| 149 | + # Check for minqa availability |
| 150 | + minqa_avail = 'minqa' in sys.modules or optns['isPWL'] # Placeholder for minqa check |
| 151 | + if not minqa_avail: |
| 152 | + print("Warning: Cannot use 'minqa::bobyqa' to find the optimal knot locations as 'minqa' is not installed. We will do an 'L-BFGS-B' search.") |
| 153 | + |
| 154 | + N = ymatNormalised.shape[0] # Number of curves |
| 155 | + # hik_mat = np.zeros((numOfKcurves, M, N)) # Replace M with the appropriate dimension |
| 156 | + # dist_mat = np.zeros((N, numOfKcurves)) |
| 157 | + # h_inv_mat = np.zeros((N, M)) # Adjust dimensions as necessary |
| 158 | + # h_mat = np.zeros((N, M)) # Adjust dimensions as necessary |
| 159 | + # aligned_mat = np.zeros((N, M)) # Adjust dimensions as necessary |
| 160 | + |
| 161 | + for i in range(N): # For each curve |
| 162 | + if optns['verbose']: |
| 163 | + print(f'Computing pairwise warping for curve #: {i + 1} out of {N} curves.') |
| 164 | + |
| 165 | + np.random.seed(i + optns['seed']) |
| 166 | + curvei = ymatNormalised[i, :] |
| 167 | + candidate_kcurves = np.random.choice(np.delete(np.arange(N), i), numOfKcurves, replace=False) |
| 168 | + |
| 169 | + for k in range(numOfKcurves): # For each candidate curve |
| 170 | + if not optns['isPWL']: |
| 171 | + hikMat[k, :, i] = get_hikrs(curvei, ymatNormalised[candidate_kcurves[k], :], lambda_) |
| 172 | + else: |
| 173 | + hikMat[k, :, i] = get_hik_optim(curvei, ymatNormalised[candidate_kcurves[k], :], lambda_, minqa_avail, optns) |
| 174 | + |
| 175 | + # Calculate the distance |
| 176 | + distMat[i, k] = np.mean((getcurveJ(tj=hikMat[k, :, i], curvej=curvei) - ymatNormalised[candidate_kcurves[k]]) ** 2) |
| 177 | + |
| 178 | + if optns['choice'] == 'weighted': |
| 179 | + hInvMat[i, :] = np.average(hikMat[:, :, i], axis=0, weights=1/distMat[i, :]) |
| 180 | + else: |
| 181 | + hInvMat[i, :] = np.mean(hikMat[distMat[i, :] <= np.quantile(distMat[i, :], 0.90), :, i], axis=0) |
| 182 | + |
| 183 | + # Interpolate h_mat and aligned_mat |
| 184 | + hMat[i, :] = interp1d(workGrid, hInvMat[i, :], bounds_error=False)(workGrid) |
| 185 | + alignedMat[i, :] = interp1d(workGrid, ymatNormalised[i, :], bounds_error=False)(hMat[i, :]) |
| 186 | + |
| 187 | + timing = time.time() - start_time |
| 188 | + ret = { |
| 189 | + 'optns': optns, |
| 190 | + 'lambda': lambda_, |
| 191 | + 'h': hMat, |
| 192 | + 'hInv': hInvMat, |
| 193 | + 'aligned': alignedMat, |
| 194 | + 'costs': np.mean(distMat, axis=1), |
| 195 | + 'timing': timing |
| 196 | + } |
| 197 | + |
| 198 | + # Set the class, if needed |
| 199 | + # If you want to handle class-like structures, consider using a custom class |
| 200 | + return ret |
0 commit comments