11#!/usr/bin/env python
22
3- import pydra
4- import typing as ty
5- import numpy as np
63
4+ def read_file (filename , x_indices = None , target_vars = None , group = None ):
5+ """Read a CSV data file
76
8- @pydra .mark .task
9- @pydra .mark .annotate (
10- {"return" : {"X" : ty .Any , "Y" : ty .Any , "groups" : ty .Any , "feature_names" : ty .Any }}
11- )
12- def read_file (filename , x_indices = None , target_vars = None , group = "groups" ):
7+ :param filename: CSV filename containing a column header
8+ :param x_indices: list of integer or string indices
9+ :param target_vars: Target variables to use
10+ :param group: CSV column name containing grouping information
11+ :return: Tuple containing feature data, target data, groups, feature names
12+ """
1313 import pandas as pd
1414
1515 data = pd .read_csv (filename )
@@ -20,17 +20,27 @@ def read_file(filename, x_indices=None, target_vars=None, group="groups"):
2020 else :
2121 raise ValueError (f"{ x_indices } is not a list of string or ints" )
2222 Y = data [target_vars ]
23- if group in data .keys ():
24- groups = data [:, [group ]]
25- else :
23+ if group is None :
2624 groups = list (range (X .shape [0 ]))
25+ else :
26+ groups = data [:, [group ]]
2727 feature_names = list (X .columns )
2828 return X .values , Y .values , groups , feature_names
2929
3030
31- @pydra .mark .task
32- @pydra .mark .annotate ({"return" : {"splits" : ty .Any , "split_indices" : ty .Any }})
3331def gen_splits (n_splits , test_size , X , Y , groups = None , random_state = 0 ):
32+ """Generate train-test splits for the data.
33+
34+ Uses GroupShuffleSplit from scikit-learn
35+
36+ :param n_splits: Number of splits
37+ :param test_size: fractional test size
38+ :param X: Sample feature data
39+ :param Y: Sample target data
40+ :param groups: Group labels of the sample data, used by GroupShuffleSplit
41+ :param random_state: randomization for shuffling (default 0)
42+ :return: splits and indices to splits
43+ """
3444 from sklearn .model_selection import GroupShuffleSplit
3545
3646 gss = GroupShuffleSplit (
@@ -41,9 +51,17 @@ def gen_splits(n_splits, test_size, X, Y, groups=None, random_state=0):
4151 return train_test_splits , split_indices
4252
4353
44- @pydra .mark .task
45- @pydra .mark .annotate ({"return" : {"output" : ty .Any , "model" : ty .Any }})
46- def train_test_kernel (X , y , train_test_split , split_index , clf_info , permute , metrics ):
54+ def train_test_kernel (X , y , train_test_split , split_index , clf_info , permute ):
55+ """Core model fitting and predicting function
56+
57+ :param X: Input features
58+ :param y: Target variables
59+ :param train_test_split: split indices
60+ :param split_index: which index to use
61+ :param clf_info: how to construct the classifier
62+ :param permute: whether to run it in permuted mode or not
63+ :return: (true, predicted) outputs for the test samples, and (pipeline, train indices, test indices)
64+ """
4765 from sklearn .preprocessing import StandardScaler
4866 from sklearn .pipeline import Pipeline
4967 import numpy as np
@@ -68,9 +86,13 @@ def train_test_kernel(X, y, train_test_split, split_index, clf_info, permute, me
6886 return (y [test_index ], predicted ), (pipe , train_index , test_index )
6987
7088
71- @pydra .mark .task
72- @pydra .mark .annotate ({"return" : {"score" : ty .Any , "output" : ty .Any }})
7389def calc_metric (output , metrics ):
90+ """Calculate the scores for the predicted outputs
91+
92+ :param output: true, predicted output
93+ :param metrics: list of metrics to evaluate
94+ :return: list of scores, and the output passed through unchanged
95+ """
7496 score = []
7597 for metric in metrics :
7698 metric_mod = __import__ ("sklearn.metrics" , fromlist = [metric ])
@@ -79,9 +101,17 @@ def calc_metric(output, metrics):
79101 return score , output
80102
81103
82- @pydra .mark .task
83- @pydra .mark .annotate ({"return" : {"shaps" : ty .Any }})
84104def get_shap (X , permute , model , gen_shap = False , nsamples = "auto" , l1_reg = "aic" ):
105+ """Compute shap information for the test data
106+
107+ :param X: sample data
108+ :param permute: whether model was permuted or not
109+ :param model: model containing trained classifier and train/test index
110+ :param gen_shap: whether to generate shap features
111+ :param nsamples: number of samples for shap evaluation
112+ :param l1_reg: L1 regularization for shap evaluation
113+ :return: shap values for each test sample
114+ """
85115 if permute or not gen_shap :
86116 return []
87117 pipe , train_index , test_index = model
0 commit comments