Skip to content

Commit 207f3fc

Browse files
author
Rithvik Donnipadu
committed
Added Rcppsort.cpp, WFDA.py and modified setup.py
1 parent e4d24a8 commit 207f3fc

File tree

3 files changed

+239
-0
lines changed

3 files changed

+239
-0
lines changed

src/Rcppsort.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#include <pybind11/pybind11.h>
2+
#include <pybind11/numpy.h>
3+
#include <algorithm>
4+
#include <vector>
5+
6+
namespace py = pybind11;
7+
8+
// Function to sort a numpy array
9+
py::array_t<double> pybind_sort(py::array_t<double> arr) {
10+
// Request a buffer descriptor from NumPy array
11+
py::buffer_info buf = arr.request();
12+
13+
// Pointer to the data as a double array
14+
double *ptr = static_cast<double *>(buf.ptr);
15+
size_t size = buf.size;
16+
17+
// Create a vector and copy data from NumPy array
18+
std::vector<double> vec(ptr, ptr + size);
19+
20+
// Sort the vector
21+
std::sort(vec.begin(), vec.end());
22+
23+
// Return a new NumPy array with sorted values
24+
return py::array_t<double>(vec.size(), vec.data());
25+
}
26+
27+
// Binding the C++ function to Python
28+
PYBIND11_MODULE(Rcppsort, m) {
29+
m.def("pybind_sort", &pybind_sort, "Sort a numpy array using C++");
30+
}

src/WFDA.py

Lines changed: 200 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,200 @@
1+
import numpy as np
2+
from scipy.interpolate import interp1d
3+
from scipy.integrate import simps
4+
from scipy.optimize import minimize
5+
import time
6+
from CheckData import CheckData
7+
from CheckOptions import check_options
8+
from List2Mat import list_to_mat
9+
from HandleNumericsAndNan import handle_numerics_and_nan
10+
from GetMeanDense import get_mean_dense
11+
from SetOptions import set_options
12+
import sys
13+
import os
14+
sys.path.append(os.path.abspath('src'))
15+
from trapzRcpp import trapz
16+
from RcppPseudoApprox import pseudo_approx
17+
from Rcppsort import pybind_sort
18+
19+
def WFDA(Ly, Lt, optns=None):
20+
if optns is None:
21+
optns = {}
22+
23+
# Set default options
24+
optns.setdefault('isPWL', True)
25+
optns.setdefault('verbose', False)
26+
optns.setdefault('nknots', 2 if optns['isPWL'] else None)
27+
optns.setdefault('subsetProp', 0.50)
28+
optns.setdefault('choice', 'truncated')
29+
optns.setdefault('seed', 666)
30+
31+
# Validate options
32+
if optns['isPWL'] and (not (1 <= optns['nknots'] <= 7)):
33+
raise ValueError("Number of knots should be between 1 and 7.")
34+
if not (0 < optns['subsetProp'] <= 1):
35+
raise ValueError("Number of proportion used should be above 0 and at most 1.")
36+
if optns['choice'] not in ['truncated', 'weighted']:
37+
raise ValueError("The estimation of warping functions can only be done by 'truncated' or 'weighted' average.")
38+
39+
# Handle randomness
40+
np.random.seed(optns['seed'])
41+
42+
# Assume CheckData and HandleNumericsAndNAN functions process data
43+
CheckData(Ly,Lt)
44+
inputData = handle_numerics_and_nan(Ly, Lt)
45+
Ly = inputData['Ly']
46+
Lt = inputData['Lt']
47+
48+
# Check data is dense
49+
optnsFPCA = set_options(Ly, Lt, {})
50+
if optnsFPCA['dataType'] != 'Dense':
51+
raise ValueError(f"The data has to be 'Dense' for WFDA to be relevant; the current dataType is : '{optnsFPCA['dataType']}'!")
52+
53+
numOfCurves = len(Ly)
54+
check_options(Lt, optnsFPCA, numOfCurves)
55+
56+
ymat = list_to_mat(Ly, Lt)
57+
N = ymat.shape[0]
58+
59+
obsGrid = np.sort(np.unique(np.hstack(Lt)))
60+
workGrid = (obsGrid - obsGrid.min()) / (obsGrid.max() - obsGrid.min())
61+
M = len(workGrid)
62+
63+
# Super-norm normalization
64+
maxAtTimeT = np.max(np.abs(ymat), axis=1)
65+
ymatNormalised = ymat / maxAtTimeT[:, None]
66+
67+
# Mean function
68+
smcObj = get_mean_dense(ymatNormalised, obsGrid, optnsFPCA)
69+
mu = smcObj['mu']
70+
71+
# Penalty parameter
72+
if 'lambda' not in optns:
73+
Vy = np.sqrt(np.sum([simps((ymatNormalised[i] - mu) ** 2, obsGrid) for i in range(N)]) / (N - 1))
74+
lambda_ = Vy * 1e-4
75+
else:
76+
lambda_ = optns['lambda']
77+
78+
if optns['lambda'] is None:
79+
Vy = np.sqrt(np.sum(np.apply_along_axis(lambda u: trapz(obsGrid, (u - mu) ** 2), axis=1, arr=ymatNormalised)) / (N - 1))
80+
lambda_ = Vy * 10 ** -4
81+
else:
82+
lambda_ = optns['lambda']
83+
84+
numOfKcurves = min(round(optns['subsetProp'] * (N - 1)))
85+
hikMat = np.empty((numOfKcurves, M, N))
86+
distMat = np.empty((N, numOfKcurves))
87+
hMat = np.empty((N, M))
88+
hInvMat = np.empty((N, M))
89+
alignedMat = np.empty((N, M))
90+
91+
def getcurveJ(tj, curvej):
92+
return pseudo_approx(workGrid, curvej, tj)
93+
94+
def theCost(curvei, curvek, lambda_, ti, tk):
95+
return np.sum((getcurveJ(tk, curvek) - curvei) ** 2) + lambda_ * np.sum((tk - ti) ** 2)
96+
97+
def get_hikrs(curvei, curvek, lambda_param):
98+
my_costs = []
99+
100+
for u in range(1, numOfKcurves ** 2 + 1):
101+
np.random.seed(u) # Set seed for reproducibility
102+
random_values = np.random.rand(m - 2)
103+
cost = theCost(curvei, curvek, lambda_, workGrid, np.concatenate(([0], pybind_sort(random_values), [1])))
104+
my_costs.append(cost)
105+
106+
np.random.seed(np.argmin(my_costs) + 1) # Set seed based on minimum cost index
107+
# min_cost = np.min(my_costs)
108+
109+
return np.concatenate(([0], pybind_sort(np.random.rand(m - 2)), [1]))
110+
111+
def get_sol(x):
112+
sorted_x = pybind_sort(x) # Sort using rcppsort
113+
y = np.concatenate(([0], sorted_x, [1]))
114+
x_seq = np.linspace(0, 1, 2 + optns['nknots'])
115+
return pseudo_approx(x_seq, y, np.linspace(0, 1, M))
116+
117+
def theCostOptim(x, curvei, curvek, lambda_, ti):
118+
tk = get_sol(x)
119+
return np.sum((getcurveJ(tk, curvek) - curvei) ** 2) + lambda_ * np.sum((tk - ti) ** 2)
120+
121+
def get_hik_optim(curvei, curvek, lambda_param, minqa_avail, optns):
122+
s0 = np.linspace(0, 1, 2 + optns['nknots'])[1:(1 + optns['nknots'])] # Initial parameters
123+
bounds = [(1e-6, 1 - 1e-6)] * optns['nknots'] # Bounds for the parameters
124+
125+
if not minqa_avail:
126+
optim_res = minimize(
127+
theCostOptim,
128+
s0,
129+
args=(curvei, curvek, lambda_param, workGrid),
130+
method='L-BFGS-B',
131+
bounds=bounds
132+
)
133+
else:
134+
# Using scipy's optimization, adjust this if you have a specific Bobyqa implementation
135+
# You may need to find an alternative for the Bobyqa method
136+
optim_res = minimize(
137+
theCostOptim,
138+
s0,
139+
args=(curvei, curvek, lambda_param, workGrid),
140+
method='trust-constr', # Example; adjust as necessary for equivalent behavior
141+
bounds=bounds
142+
)
143+
144+
best_sol = get_sol(optim_res.x) # Adjust M as needed or pass it directly
145+
return best_sol
146+
147+
start_time = time.time()
148+
149+
# Check for minqa availability
150+
minqa_avail = 'minqa' in sys.modules or optns['isPWL'] # Placeholder for minqa check
151+
if not minqa_avail:
152+
print("Warning: Cannot use 'minqa::bobyqa' to find the optimal knot locations as 'minqa' is not installed. We will do an 'L-BFGS-B' search.")
153+
154+
N = ymatNormalised.shape[0] # Number of curves
155+
# hik_mat = np.zeros((numOfKcurves, M, N)) # Replace M with the appropriate dimension
156+
# dist_mat = np.zeros((N, numOfKcurves))
157+
# h_inv_mat = np.zeros((N, M)) # Adjust dimensions as necessary
158+
# h_mat = np.zeros((N, M)) # Adjust dimensions as necessary
159+
# aligned_mat = np.zeros((N, M)) # Adjust dimensions as necessary
160+
161+
for i in range(N): # For each curve
162+
if optns['verbose']:
163+
print(f'Computing pairwise warping for curve #: {i + 1} out of {N} curves.')
164+
165+
np.random.seed(i + optns['seed'])
166+
curvei = ymatNormalised[i, :]
167+
candidate_kcurves = np.random.choice(np.delete(np.arange(N), i), numOfKcurves, replace=False)
168+
169+
for k in range(numOfKcurves): # For each candidate curve
170+
if not optns['isPWL']:
171+
hikMat[k, :, i] = get_hikrs(curvei, ymatNormalised[candidate_kcurves[k], :], lambda_)
172+
else:
173+
hikMat[k, :, i] = get_hik_optim(curvei, ymatNormalised[candidate_kcurves[k], :], lambda_, minqa_avail, optns)
174+
175+
# Calculate the distance
176+
distMat[i, k] = np.mean((getcurveJ(tj=hikMat[k, :, i], curvej=curvei) - ymatNormalised[candidate_kcurves[k]]) ** 2)
177+
178+
if optns['choice'] == 'weighted':
179+
hInvMat[i, :] = np.average(hikMat[:, :, i], axis=0, weights=1/distMat[i, :])
180+
else:
181+
hInvMat[i, :] = np.mean(hikMat[distMat[i, :] <= np.quantile(distMat[i, :], 0.90), :, i], axis=0)
182+
183+
# Interpolate h_mat and aligned_mat
184+
hMat[i, :] = interp1d(workGrid, hInvMat[i, :], bounds_error=False)(workGrid)
185+
alignedMat[i, :] = interp1d(workGrid, ymatNormalised[i, :], bounds_error=False)(hMat[i, :])
186+
187+
timing = time.time() - start_time
188+
ret = {
189+
'optns': optns,
190+
'lambda': lambda_,
191+
'h': hMat,
192+
'hInv': hInvMat,
193+
'aligned': alignedMat,
194+
'costs': np.mean(distMat, axis=1),
195+
'timing': timing
196+
}
197+
198+
# Set the class, if needed
199+
# If you want to handle class-like structures, consider using a custom class
200+
return ret

src/setup.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,15 @@ def __str__(self):
4545
],
4646
language='c++',
4747
),
48+
Extension(
49+
'Rcppsort',
50+
sources=['Rcppsort.cpp'],
51+
include_dirs=[
52+
# Path to pybind11 headers
53+
get_pybind_include(),
54+
],
55+
language='c++',
56+
),
4857
]
4958

5059
setup(

0 commit comments

Comments
 (0)