Skip to content

Commit

Permalink
Merge pull request #1 from PedroSeber/ALVEN_variable_selection
Browse files Browse the repository at this point in the history
New variable selection algorithm
  • Loading branch information
PedroSeber authored Dec 26, 2023
2 parents c6d4dfa + 10e9192 commit 9b10bec
Show file tree
Hide file tree
Showing 15 changed files with 1,485 additions and 987 deletions.
185 changes: 81 additions & 104 deletions Code-SPA/SPA.py

Large diffs are not rendered by default.

21 changes: 0 additions & 21 deletions Code-SPA/SPLS_Python.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
from sklearn.cross_decomposition import PLSRegression
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pdb

def SPLS(X, y, K, eta, kappa = 0.5, select = 'pls2', eps = 1e-4, max_steps = 200):
"""
Expand All @@ -20,7 +19,6 @@ def SPLS(X, y, K, eta, kappa = 0.5, select = 'pls2', eps = 1e-4, max_steps = 200
if len(y.shape) == 1:
y = y.reshape(-1, 1)
potential_idx = np.arange(X.shape[1])
#betahat = np.zeros((X.shape[0], y.shape[0]))
betahat = np.zeros((X.shape[1]))
betahat_list = []
new_selected_list = []
Expand All @@ -37,7 +35,6 @@ def SPLS(X, y, K, eta, kappa = 0.5, select = 'pls2', eps = 1e-4, max_steps = 200
my_PLS = PLSRegression(k+1, scale = False, tol = eps).fit(X, y)
# Updating the beta
betahat = np.zeros((X.shape[1]))
#pdb.set_trace()
betahat[all_selected] = my_PLS.coef_.squeeze()[all_selected]
betahat_list.append(betahat)
if select.casefold() == 'pls2':
Expand Down Expand Up @@ -97,21 +94,3 @@ def _hfunction(lambda_val, M, circle, kappa2):
alpha = np.linalg.inv(M + lambda_val*np.identity(M.shape[0])) @ M @ circle
obj = alpha.T @ alpha - 1/kappa2**2
return obj

""" Temp to run R online
library(spls)
X = rbind(c(1, 3, 0), c(0, 1, 1), c(2, 1, -1))
print(X)
y = c(7, 1, 5)
oi <- spls(X, y, 2, 0.9, scale.y = TRUE)
print(attributes(oi))
print("This is betahat")
print(oi$betahat)
print("This is A")
print(oi$A)
print("This is betamat")
print(oi$betamat)
print("This is new2As")
print(oi$new2As)
print(oi)
"""
469 changes: 155 additions & 314 deletions Code-SPA/cv_final.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions Code-SPA/dataset_property_new.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,8 @@ def nonlinearity_assess(X, y, plot = True, cat = None, alpha = 0.01, difference
Output:
int, whether there is nonlinearity in dataset
"""
poly = PolynomialFeatures(2, include_bias = False, interaction_only = True)
Bi = poly.fit_transform(X)[:, X.shape[1]:] # Just the interactions and not the x0, x1, ..., xN terms
poly, _, _ = rm._feature_trans(X, degree = 2, interaction = True, trans_type = 'simple_interaction')
Bi = poly[:, X.shape[1]+1:] # Just the interaction terms and not the intercept or x0, x1, ..., xN terms

# Nonlinearity by linear correlation, quadratic test, and maximal correlation
m = np.shape(X)[1]
Expand Down
514 changes: 86 additions & 428 deletions Code-SPA/regression_models.py

Large diffs are not rendered by default.

59 changes: 0 additions & 59 deletions Example/Example_1.py

This file was deleted.

Binary file removed Example/Printer.xlsx
Binary file not shown.
10 changes: 0 additions & 10 deletions Example/Printer_test.csv

This file was deleted.

40 changes: 0 additions & 40 deletions Example/Printer_training.csv

This file was deleted.

6 changes: 6 additions & 0 deletions Examples/Kepler_3rd_law.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
0.388,87.58
0.724,224.42
1,365.15
1.524,686.59
5.2,4332.37
9.51,10759.12
8 changes: 8 additions & 0 deletions Examples/Kepler_3rd_law_modern.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
0.3871,87.9693
0.72333,224.7008
1,365.2564
1.52366,686.9796
5.20336,4332.8201
9.53707,10775.599
19.1913,30687.153
30.069,60190.03
31 changes: 31 additions & 0 deletions Examples/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Examples

There are many ways to call SPA for automatic model training and testing. The simplest is to just input the path to training data (or perhaps two paths, one to training and the other to test data). SPA will automatically select a suitable cross-validation method and model architecture, then automatically cross-validate a combination of standard hyperparameters for that model architecture. It is possible to customize the process; some examples are shown below, but checking the documentation of `main_SPA()` (in the [SPA](../Code-SPA/SPA.py) file) is a good idea.

## Simplest use case; no testing data
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv')`
## Manually select a CV method
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold')`
## Manually select a model (or models)
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['ALVEN'])`<br>
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['ALVEN', 'EN', 'PLS'])`<br>
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['MLP'])`
## Asking SPA to use dynamic models
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', dynamic_model = True)`
## Restrict what values will be tested for some hyperparameter(s)
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['ALVEN'], l1_ratio = [0, 0.5, 0.99])`<br>
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold', model_name = ['ALVEN'], degree = list(range(1, 6)))`<br>
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold', model_name = ['MLP'], activation = ['relu', 'tanh'], weight_decay = 1e-2)`<br>
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold', lag = [0, 1, 2], model_name = ['DALVEN'])`
## Plotting the data interrogation results (relevant only when `model_name` is not passed to SPA)
`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', plot_interrogation = True)`

Note that all of the examples above do not pass a path to testing data (using the `test_data` keyword). In reality, it is essential to have independent testing data to properly evaluate a model. SPA properly isolates the testing set from the models while cross-validation occurs, and reports the testing results in the output files.

# Sample datasets

| Dataset name | Source |
| :----------: | :----: |
| [poly_1000x5-data_1to10-range_1-degree_123456789-seed_\(0,0\)-noise.csv](poly_1000x5-data_1to10-range_1-degree_123456789-seed_\(0,0\)-noise.csv) | Artificial data generated by [create_random_data.py](create_random_data.py) |
| [Kepler_3rd_law.csv](Kepler_3rd_law.csv) | [J. Kepler, "The Harmony of the World", 1619](https://books.google.com/books?id=rEkLAAAAIAAJ) |
| [Kepler_3rd_law_modern.csv](Kepler_3rd_law_modern.csv) | [Wolfram Alpha](https://www.wolframalpha.com/) |
107 changes: 107 additions & 0 deletions Examples/create_random_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
import numpy as np
import pandas as pd
from time import localtime
import pdb

def create_random_data(data_size, mode, data_range = (-10, 10), degrees = [1], seed = 123456789, coefficients = None, noise = (0, 0)):
"""
Creates "data_size" random points with values from data_range[0] to data_range[1], then use them and a functional form ...
(defined by "mode" and "degrees") to generate a y response.
"""
# Creating the X_data for the dataset
rng = np.random.default_rng(seed)
if mode.casefold() in {'multicollinear', 'multicollinearity'}:
X_0 = rng.integers(data_range[0]*10, data_range[1]*10, (data_size[0], 1)) / 10
X_noise_sigma = 0.05 # Edit as needed. New variable so that this can be added to the file_name below
noise_MC = rng.normal(0, X_noise_sigma, X_0.shape)
X_1 = X_0 + noise_MC
X_data = np.concatenate((X_0, X_1), axis = 1)
print(f'The X_1 noise level for the multicollinear model is {np.mean(np.abs(noise_MC/X_0))*100:.1f}%')
else: # General case
X_data = rng.integers(data_range[0]*10, data_range[1]*10, data_size) / 10 # Numbers between data_range[0] and data_range[1] with one decimal place
# Setting up the other parameters for the function
noise = tuple(int(elem) if elem == int(elem) else elem for elem in noise) # Changes integer noise parameters to actual ints for display convenience
noise_vals = rng.normal(noise[0], noise[1], X_data.shape[0]) # Always run to "burn" X_data.shape[0] values from rng to ensure reproducibility between noisy and noiseless runs
###noise_vals = rng.normal(noise[0], noise[1], X_data.shape)
if noise[1]:
noise_str = f' + ϵ{noise}'
else:
noise_str = ''
noise_vals = 0
if not coefficients:
coefficients = rng.integers(-100, 100, X_data.shape[1]) / 10
else:
_ = rng.integers(-100, 100, X_data.shape[1]) / 10 # Ran to "burn" X_data.shape[1] values from the rng to ensure reproducibility
degrees = [int(elem) if elem == int(elem) else elem for elem in degrees] # Changes integer degrees to actual ints for display convenience
powers = rng.choice(degrees, X_data.shape[1])
# Creating the y_data based on the input parameters
if mode.casefold() == 'poly':
is_ln = [False]*X_data.shape[1]
y_data = (coefficients * X_data**powers).sum(axis = 1)
###y_data_noisy = (coefficients * (X_data+noise_vals)**powers).sum(axis = 1)
elif mode.casefold() == 'log':
is_ln = [True]*X_data.shape[1]
y_data = (coefficients * np.log(np.abs(X_data))**powers).sum(axis = 1)
elif mode.casefold() in {'stefan-boltzmann', 'stefanboltzmann', 'stefan_boltzmann'}:
mode = 'stefanboltzmann' # For standardization
degrees = [1] # For standardization since we have a fixed eqn
y_data = 3.828e26 * X_data[:, 0]**2 * X_data[:, 1]**4 # L_{Sun} * R^2 * T^4 [In units of R_{Sun} and T_{Sun}]
functional_form_str = f'L = 3.828e26 * R^2 * T^4{noise_str}'
elif mode.casefold() == 'energy': # NOTE: if irrelevant variables are not needed, set data_size to (K, 1), as V always gets added later
degrees = [1] # For standardization since we have a fixed eqn, even though there's a degree 4 interaction with m^2*v^2
V = rng.integers(5e8, 2.5e9, data_size) / 10 # Numbers between 50 000 000 (5e7) and 250 000 000 (2.5e8) with one decimal place
X_data = np.hstack((X_data, V))
c = 299792458 # m / s
y_data = c**2 * X_data[:, 0]**2 * X_data[:, 1]**2 + c**4 * X_data[:, 0]**2 # E^2
functional_form_str = f'E^2 = 8.98755e16*m^2*v^2 + 8.07761e33*m^2{noise_str}'
print(f'Relativistic effects account for {np.mean(1 - c**4 * X_data[:, 0]**2/y_data)*100:.2f}% of this system\'s energy')
elif mode.casefold() in {'multicollinear', 'multicollinearity'}:
mode = 'multicollinear'
y_data = (2*X_data).sum(axis = 1)
functional_form_str = f'y = 2*X_0 + 2*X_1{noise_str}'
file_name = f'{mode}_{data_size[0]}x2-data_{data_range[0]}to{data_range[1]}-range_{seed}-seed_({noise[0]},{noise[1]})-ynoise_(0,{X_noise_sigma})-Xnoise.csv'
else: # TODO: mixture of log and linear, maybe other functional forms
raise NotImplementedError
# Saving the data using SPA's preferred format
y_data_noisy = y_data + noise_vals
all_data = np.column_stack((X_data, y_data_noisy))
degree_str = str(degrees)[1:-1]
coeff_str = str(coefficients)[1:-1].replace(",", "")
if 'file_name' not in locals(): # Specific equations already have the file_name built-in within their if-statement
file_name = f'{mode}_{data_size[0]}x{data_size[1]}-data_{data_range[0]}to{data_range[1]}-range_{degree_str.replace(" ", "")}-degree_{seed}-seed_({noise[0]},{noise[1]})-noise.csv'
np.savetxt(file_name, all_data, delimiter = ',', comments = '')
# Printing the final functional form
if 'functional_form_str' not in locals(): # Specific equations already have the functional_form_str built-in within their if-statement
functional_form_str = [f'{coefficients[idx]}*ln(x{idx})^{powers[idx]}' if is_ln[idx] else f'{coefficients[idx]}*(x{idx})^{powers[idx]}' for idx in range(X_data.shape[1])] # coefficients*log(x)^power or coeffiicents*x^power
functional_form_str = ' + '.join([elem[:-2] + elem[-2:].replace('^1', '') for elem in functional_form_str]) # Removing ^1. String addition to avoid removing ^10, ^11, ^12, ...
functional_form_str = f'y = {functional_form_str}{noise_str}'.replace('+ -', '- ')
print('The final functional form for this dataset is:')
print(functional_form_str)
print(f'The noise level is {np.mean(np.abs(noise_vals/y_data))*100:.1f}%')
# Writing a log file
time_now = '-'.join([str(elem) for elem in localtime()[:6]]) # YYYY-MM-DD-hh-mm-ss
log_text = f'Log for create_random_data()\n' + (
f'Function call: python create_random_data.py {data_size[0]} {data_size[1]} {mode} -dr {data_range[0]} {data_range[1]} -d {degree_str.replace(",", "")} -s {seed} -c {coeff_str} -n {noise[0]} {noise[1]} \n') + (
f'Functional form: {functional_form_str}\n') + (
f'Noise level: {np.mean(np.abs(noise_vals/y_data))*100:.1f}%\n') + (
f'Completion time: {time_now}\n')
with open(file_name[:-3] + 'log', 'w') as f:
f.write(log_text)

if __name__ == '__main__':
# Input setup
import argparse
parser = argparse.ArgumentParser(description = 'Creates "data_size" random points, then use them and a functional form (defined by "mode" and "degrees") to generate a y response.')
parser.add_argument('data_size', type = int, nargs = 2, help = 'The size of the X data that will be randomly created')
parser.add_argument('mode', type = str, nargs = 1, help = 'The functional form of the dataset response. Must be in {"poly", "log"}')
parser.add_argument('-dr', '--datarange', type = int, nargs = 2, metavar = (-10, 10), default = (-10, 10), help = 'Integers representing the minimum and maximum values of the X data. ' +
'The limits are multiplied by 10, used to generate random integers, then divided by 10 to turn the X data into floats with one decimal digit.')
parser.add_argument('-d', '--degrees', type = float, nargs = '+', metavar = 1, default = [1], help = 'A list of valid degrees to which the X data is randomly raised.' +
'For example, degrees = [1, 2] will include linear and quadratic terms at random, degrees = [2] will include only quadratic terms.')
parser.add_argument('-s', '--seed', type = int, nargs = 1, metavar = 'int', default = [123456789],
help = 'The seed that is passed to np.random.default_rng() for reproducibility.')
parser.add_argument('-c', '--coefficients', type = float, nargs = '+', metavar = 'None or list', default = None,
help='The coefficients that multiply each feature of the X data. If None, they are generated at random between -10 and 10.')
parser.add_argument('-n', '--noise', type = float, nargs = 2, metavar = (0, 0), default = (0, 0), help='The mean and stdev of the Gaussian noise added to the y data. By default, no noise is added.')
myargs = parser.parse_args()
create_random_data(myargs.data_size, myargs.mode[0], tuple(myargs.datarange), myargs.degrees, myargs.seed[0], myargs.coefficients, tuple(myargs.noise))
Loading

0 comments on commit 9b10bec

Please sign in to comment.