Merge pull request #1 from PedroSeber/ALVEN_variable_selection

New variable selection algorithm
PedroSeber · Dec 26, 2023 · 9b10bec · 9b10bec
2 parents c6d4dfa + 10e9192
commit 9b10bec
Show file tree

Hide file tree

Showing 15 changed files with 1,485 additions and 987 deletions.
diff --git a/Code-SPA/SPA.py b/Code-SPA/SPA.py
diff --git a/Code-SPA/SPLS_Python.py b/Code-SPA/SPLS_Python.py
@@ -3,7 +3,6 @@
 from sklearn.cross_decomposition import PLSRegression
 from warnings import simplefilter
 simplefilter(action='ignore', category=FutureWarning)
-import pdb
 
 def SPLS(X, y, K, eta, kappa = 0.5, select = 'pls2', eps = 1e-4, max_steps = 200):
     """
@@ -20,7 +19,6 @@ def SPLS(X, y, K, eta, kappa = 0.5, select = 'pls2', eps = 1e-4, max_steps = 200
     if len(y.shape) == 1:
         y = y.reshape(-1, 1)
     potential_idx = np.arange(X.shape[1])
-    #betahat = np.zeros((X.shape[0], y.shape[0]))
     betahat = np.zeros((X.shape[1]))
     betahat_list = []
     new_selected_list = []
@@ -37,7 +35,6 @@ def SPLS(X, y, K, eta, kappa = 0.5, select = 'pls2', eps = 1e-4, max_steps = 200
         my_PLS = PLSRegression(k+1, scale = False, tol = eps).fit(X, y)
         # Updating the beta
         betahat = np.zeros((X.shape[1]))
-        #pdb.set_trace()
         betahat[all_selected] = my_PLS.coef_.squeeze()[all_selected]
         betahat_list.append(betahat)
         if select.casefold() == 'pls2':
@@ -97,21 +94,3 @@ def _hfunction(lambda_val, M, circle, kappa2):
     alpha = np.linalg.inv(M + lambda_val*np.identity(M.shape[0])) @ M @ circle
     obj = alpha.T @ alpha - 1/kappa2**2
     return obj
-
-""" Temp to run R online
-library(spls)
-X = rbind(c(1, 3, 0), c(0, 1, 1), c(2, 1, -1))
-print(X)
-y = c(7, 1, 5)
-oi <- spls(X, y, 2, 0.9, scale.y = TRUE)
-print(attributes(oi))
-print("This is betahat")
-print(oi$betahat)
-print("This is A")
-print(oi$A)
-print("This is betamat")
-print(oi$betamat)
-print("This is new2As")
-print(oi$new2As)
-print(oi)
-"""
diff --git a/Code-SPA/cv_final.py b/Code-SPA/cv_final.py
diff --git a/Code-SPA/dataset_property_new.py b/Code-SPA/dataset_property_new.py
@@ -37,8 +37,8 @@ def nonlinearity_assess(X, y, plot = True, cat = None, alpha = 0.01, difference
     Output:
         int, whether there is nonlinearity in dataset
     """
-    poly = PolynomialFeatures(2, include_bias = False, interaction_only = True)
-    Bi = poly.fit_transform(X)[:, X.shape[1]:] # Just the interactions and not the x0, x1, ..., xN terms
+    poly, _, _ = rm._feature_trans(X, degree = 2, interaction = True, trans_type = 'simple_interaction')
+    Bi = poly[:, X.shape[1]+1:] # Just the interaction terms and not the intercept or x0, x1, ..., xN terms
 
     # Nonlinearity by linear correlation, quadratic test, and maximal correlation
     m = np.shape(X)[1]

diff --git a/Code-SPA/regression_models.py b/Code-SPA/regression_models.py
diff --git a/Example/Example_1.py b/Example/Example_1.py
diff --git a/Example/Printer.xlsx b/Example/Printer.xlsx
diff --git a/Example/Printer_test.csv b/Example/Printer_test.csv
diff --git a/Example/Printer_training.csv b/Example/Printer_training.csv
diff --git a/Examples/Kepler_3rd_law.csv b/Examples/Kepler_3rd_law.csv
@@ -0,0 +1,6 @@
+0.388,87.58
+0.724,224.42
+1,365.15
+1.524,686.59
+5.2,4332.37
+9.51,10759.12
diff --git a/Examples/Kepler_3rd_law_modern.csv b/Examples/Kepler_3rd_law_modern.csv
@@ -0,0 +1,8 @@
+0.3871,87.9693
+0.72333,224.7008
+1,365.2564
+1.52366,686.9796
+5.20336,4332.8201
+9.53707,10775.599
+19.1913,30687.153
+30.069,60190.03
diff --git a/Examples/README.md b/Examples/README.md
@@ -0,0 +1,31 @@
+# Examples
+
+There are many ways to call SPA for automatic model training and testing. The simplest is to just input the path to training data (or perhaps two paths, one to training and the other to test data). SPA will automatically select a suitable cross-validation method and model architecture, then automatically cross-validate a combination of standard hyperparameters for that model architecture. It is possible to customize the process; some examples are shown below, but checking the documentation of `main_SPA()` (in the [SPA](../Code-SPA/SPA.py) file) is a good idea.
+
+## Simplest use case; no testing data
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv')`
+## Manually select a CV method
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold')`
+## Manually select a model (or models)
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['ALVEN'])`<br>
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['ALVEN', 'EN', 'PLS'])`<br>
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['MLP'])`
+## Asking SPA to use dynamic models
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', dynamic_model = True)`
+## Restrict what values will be tested for some hyperparameter(s)
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', model_name = ['ALVEN'], l1_ratio = [0, 0.5, 0.99])`<br>
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold', model_name = ['ALVEN'], degree = list(range(1, 6)))`<br>
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold', model_name = ['MLP'], activation = ['relu', 'tanh'], weight_decay = 1e-2)`<br>
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', cv_method = 'KFold', lag = [0, 1, 2], model_name = ['DALVEN'])`
+## Plotting the data interrogation results (relevant only when `model_name` is not passed to SPA)
+`import SPA; _ = SPA.main_SPA('poly_1000x5-data_1to10-range_1-degree_123456789-seed_(0,0)-noise.csv', plot_interrogation = True)`
+
+Note that all of the examples above do not pass a path to testing data (using the `test_data` keyword). In reality, it is essential to have independent testing data to properly evaluate a model. SPA properly isolates the testing set from the models while cross-validation occurs, and reports the testing results in the output files.
+
+# Sample datasets
+
+| Dataset name | Source |
+| :----------: | :----: |
+| [poly_1000x5-data_1to10-range_1-degree_123456789-seed_\(0,0\)-noise.csv](poly_1000x5-data_1to10-range_1-degree_123456789-seed_\(0,0\)-noise.csv) | Artificial data generated by [create_random_data.py](create_random_data.py) |
+| [Kepler_3rd_law.csv](Kepler_3rd_law.csv) | [J. Kepler, "The Harmony of the World", 1619](https://books.google.com/books?id=rEkLAAAAIAAJ) |
+| [Kepler_3rd_law_modern.csv](Kepler_3rd_law_modern.csv) | [Wolfram Alpha](https://www.wolframalpha.com/) |
diff --git a/Examples/create_random_data.py b/Examples/create_random_data.py
@@ -0,0 +1,107 @@
+import numpy as np
+import pandas as pd
+from time import localtime
+import pdb
+
+def create_random_data(data_size, mode, data_range = (-10, 10), degrees = [1], seed = 123456789, coefficients = None, noise = (0, 0)):
+    """
+    Creates "data_size" random points with values from data_range[0] to data_range[1], then use them and a functional form ...
+        (defined by "mode" and "degrees") to generate a y response.
+    """
+    # Creating the X_data for the dataset
+    rng = np.random.default_rng(seed)
+    if mode.casefold() in {'multicollinear', 'multicollinearity'}:
+        X_0 = rng.integers(data_range[0]*10, data_range[1]*10, (data_size[0], 1)) / 10
+        X_noise_sigma = 0.05 # Edit as needed. New variable so that this can be added to the file_name below
+        noise_MC = rng.normal(0, X_noise_sigma, X_0.shape)
+        X_1 = X_0 + noise_MC
+        X_data = np.concatenate((X_0, X_1), axis = 1)
+        print(f'The X_1 noise level for the multicollinear model is {np.mean(np.abs(noise_MC/X_0))*100:.1f}%')
+    else: # General case
+        X_data = rng.integers(data_range[0]*10, data_range[1]*10, data_size) / 10 # Numbers between data_range[0] and data_range[1] with one decimal place
+    # Setting up the other parameters for the function
+    noise = tuple(int(elem) if elem == int(elem) else elem for elem in noise) # Changes integer noise parameters to actual ints for display convenience
+    noise_vals = rng.normal(noise[0], noise[1], X_data.shape[0]) # Always run to "burn" X_data.shape[0] values from rng to ensure reproducibility between noisy and noiseless runs
+    ###noise_vals = rng.normal(noise[0], noise[1], X_data.shape)
+    if noise[1]:
+        noise_str = f' + ϵ{noise}'
+    else:
+        noise_str = ''
+        noise_vals = 0
+    if not coefficients:
+        coefficients = rng.integers(-100, 100, X_data.shape[1]) / 10
+    else:
+        _ = rng.integers(-100, 100, X_data.shape[1]) / 10 # Ran to "burn" X_data.shape[1] values from the rng to ensure reproducibility
+    degrees = [int(elem) if elem == int(elem) else elem for elem in degrees] # Changes integer degrees to actual ints for display convenience
+    powers = rng.choice(degrees, X_data.shape[1])
+    # Creating the y_data based on the input parameters
+    if mode.casefold() == 'poly':
+        is_ln = [False]*X_data.shape[1]
+        y_data = (coefficients * X_data**powers).sum(axis = 1)
+        ###y_data_noisy = (coefficients * (X_data+noise_vals)**powers).sum(axis = 1)
+    elif mode.casefold() == 'log':
+        is_ln = [True]*X_data.shape[1]
+        y_data = (coefficients * np.log(np.abs(X_data))**powers).sum(axis = 1)
+    elif mode.casefold() in {'stefan-boltzmann', 'stefanboltzmann', 'stefan_boltzmann'}:
+        mode = 'stefanboltzmann' # For standardization
+        degrees = [1] # For standardization since we have a fixed eqn
+        y_data = 3.828e26 * X_data[:, 0]**2 * X_data[:, 1]**4 # L_{Sun} * R^2 * T^4 [In units of R_{Sun} and T_{Sun}]
+        functional_form_str = f'L = 3.828e26 * R^2 * T^4{noise_str}'
+    elif mode.casefold() == 'energy': # NOTE: if irrelevant variables are not needed, set data_size to (K, 1), as V always gets added later
+        degrees = [1] # For standardization since we have a fixed eqn, even though there's a degree 4 interaction with m^2*v^2
+        V = rng.integers(5e8, 2.5e9, data_size) / 10 # Numbers between 50 000 000 (5e7) and 250 000 000 (2.5e8) with one decimal place
+        X_data = np.hstack((X_data, V))
+        c = 299792458 # m / s
+        y_data = c**2 * X_data[:, 0]**2 * X_data[:, 1]**2 + c**4 * X_data[:, 0]**2 # E^2
+        functional_form_str = f'E^2 = 8.98755e16*m^2*v^2 + 8.07761e33*m^2{noise_str}'
+        print(f'Relativistic effects account for {np.mean(1 - c**4 * X_data[:, 0]**2/y_data)*100:.2f}% of this system\'s energy')
+    elif mode.casefold() in {'multicollinear', 'multicollinearity'}:
+        mode = 'multicollinear'
+        y_data = (2*X_data).sum(axis = 1)
+        functional_form_str = f'y = 2*X_0 + 2*X_1{noise_str}'
+        file_name = f'{mode}_{data_size[0]}x2-data_{data_range[0]}to{data_range[1]}-range_{seed}-seed_({noise[0]},{noise[1]})-ynoise_(0,{X_noise_sigma})-Xnoise.csv'
+    else: # TODO: mixture of log and linear, maybe other functional forms
+        raise NotImplementedError
+    # Saving the data using SPA's preferred format
+    y_data_noisy = y_data + noise_vals
+    all_data = np.column_stack((X_data, y_data_noisy))
+    degree_str = str(degrees)[1:-1]
+    coeff_str = str(coefficients)[1:-1].replace(",", "")
+    if 'file_name' not in locals(): # Specific equations already have the file_name built-in within their if-statement
+        file_name = f'{mode}_{data_size[0]}x{data_size[1]}-data_{data_range[0]}to{data_range[1]}-range_{degree_str.replace(" ", "")}-degree_{seed}-seed_({noise[0]},{noise[1]})-noise.csv'
+    np.savetxt(file_name, all_data, delimiter = ',', comments = '')
+    # Printing the final functional form
+    if 'functional_form_str' not in locals(): # Specific equations already have the functional_form_str built-in within their if-statement
+        functional_form_str = [f'{coefficients[idx]}*ln(x{idx})^{powers[idx]}' if is_ln[idx] else f'{coefficients[idx]}*(x{idx})^{powers[idx]}' for idx in range(X_data.shape[1])] # coefficients*log(x)^power or coeffiicents*x^power
+        functional_form_str = ' + '.join([elem[:-2] + elem[-2:].replace('^1', '') for elem in functional_form_str]) # Removing ^1. String addition to avoid removing ^10, ^11, ^12, ...
+        functional_form_str = f'y = {functional_form_str}{noise_str}'.replace('+ -', '- ')
+    print('The final functional form for this dataset is:')
+    print(functional_form_str)
+    print(f'The noise level is {np.mean(np.abs(noise_vals/y_data))*100:.1f}%')
+    # Writing a log file
+    time_now = '-'.join([str(elem) for elem in localtime()[:6]]) # YYYY-MM-DD-hh-mm-ss
+    log_text = f'Log for create_random_data()\n' + (
+               f'Function call: python create_random_data.py {data_size[0]} {data_size[1]} {mode} -dr {data_range[0]} {data_range[1]} -d {degree_str.replace(",", "")} -s {seed} -c {coeff_str} -n {noise[0]} {noise[1]} \n') + (
+               f'Functional form: {functional_form_str}\n') + (
+               f'Noise level: {np.mean(np.abs(noise_vals/y_data))*100:.1f}%\n') + (
+               f'Completion time: {time_now}\n')
+    with open(file_name[:-3] + 'log', 'w') as f:
+        f.write(log_text)
+
+if __name__ == '__main__':
+    # Input setup
+    import argparse
+    parser = argparse.ArgumentParser(description = 'Creates "data_size" random points, then use them and a functional form (defined by "mode" and "degrees") to generate a y response.')
+    parser.add_argument('data_size', type = int, nargs = 2, help = 'The size of the X data that will be randomly created')
+    parser.add_argument('mode', type = str, nargs = 1, help = 'The functional form of the dataset response. Must be in {"poly", "log"}')
+    parser.add_argument('-dr', '--datarange', type = int, nargs = 2, metavar = (-10, 10), default = (-10, 10), help = 'Integers representing the minimum and maximum values of the X data. ' +
+                        'The limits are multiplied by 10, used to generate random integers, then divided by 10 to turn the X data into floats with one decimal digit.')
+    parser.add_argument('-d', '--degrees', type = float, nargs = '+', metavar = 1, default = [1], help = 'A list of valid degrees to which the X data is randomly raised.' +
+                        'For example, degrees = [1, 2] will include linear and quadratic terms at random, degrees = [2] will include only quadratic terms.')
+    parser.add_argument('-s', '--seed', type = int, nargs = 1, metavar = 'int', default = [123456789],
+        help = 'The seed that is passed to np.random.default_rng() for reproducibility.')
+    parser.add_argument('-c', '--coefficients', type = float, nargs = '+', metavar = 'None or list', default = None,
+        help='The coefficients that multiply each feature of the X data. If None, they are generated at random between -10 and 10.')
+    parser.add_argument('-n', '--noise', type = float, nargs = 2, metavar = (0, 0), default = (0, 0), help='The mean and stdev of the Gaussian noise added to the y data. By default, no noise is added.')
+    myargs = parser.parse_args()
+    create_random_data(myargs.data_size, myargs.mode[0], tuple(myargs.datarange), myargs.degrees, myargs.seed[0], myargs.coefficients, tuple(myargs.noise))