Merge pull request #24 from satra/enh-split-tasks

satra · web-flow · commit d224cf6c1854 · 2020-06-13T11:25:33.000-04:00
Several pydra-related updates to reflect better usage
diff --git a/pydra_ml/classifier.py b/pydra_ml/classifier.py
@@ -1,10 +1,41 @@
 #!/usr/bin/env python
 
 import pydra
+from pydra.mark import task, annotate
+from pydra.utils.messenger import AuditFlag, FileMessenger
+import typing as ty
 import os
 from .tasks import read_file, gen_splits, train_test_kernel, calc_metric, get_shap
 from .report import gen_report
 
+# Wrap the plain functions from .tasks as pydra tasks with named outputs
+read_file_pdt = task(
+    annotate(
+        {
+            "return": {
+                "X": ty.Any,
+                "Y": ty.Any,
+                "groups": ty.Any,
+                "feature_names": ty.Any,
+            }
+        }
+    )(read_file)
+)
+
+gen_splits_pdt = task(
+    annotate({"return": {"splits": ty.Any, "split_indices": ty.Any}})(gen_splits)
+)
+
+train_test_kernel_pdt = task(
+    annotate({"return": {"output": ty.Any, "model": ty.Any}})(train_test_kernel)
+)
+
+calc_metric_pdt = task(
+    annotate({"return": {"score": ty.Any, "output": ty.Any}})(calc_metric)
+)
+
+get_shap_pdt = task(annotate({"return": {"shaps": ty.Any}})(get_shap))
+
 
 def gen_workflow(inputs, cache_dir=None, cache_locations=None):
     wf = pydra.Workflow(
@@ -13,18 +44,21 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
         **inputs,
         cache_dir=cache_dir,
         cache_locations=cache_locations,
+        audit_flags=AuditFlag.ALL,
+        messengers=FileMessenger(),
+        messenger_args={"message_dir": os.path.join(os.getcwd(), "messages")},
     )
     wf.split(["clf_info", "permute"])
     wf.add(
-        read_file(
+        read_file_pdt(
             name="readcsv",
             filename=wf.lzin.filename,
             x_indices=wf.lzin.x_indices,
             target_vars=wf.lzin.target_vars,
         )
     )
     wf.add(
-        gen_splits(
+        gen_splits_pdt(
             name="gensplit",
             n_splits=wf.lzin.n_splits,
             test_size=wf.lzin.test_size,
@@ -34,26 +68,25 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
         )
     )
     wf.add(
-        train_test_kernel(
+        train_test_kernel_pdt(
             name="fit_clf",
             X=wf.readcsv.lzout.X,
             y=wf.readcsv.lzout.Y,
             train_test_split=wf.gensplit.lzout.splits,
             split_index=wf.gensplit.lzout.split_indices,
             clf_info=wf.lzin.clf_info,
             permute=wf.lzin.permute,
-            metrics=wf.lzin.metrics,
         )
     )
     wf.fit_clf.split("split_index")
     wf.add(
-        calc_metric(
+        calc_metric_pdt(
             name="metric", output=wf.fit_clf.lzout.output, metrics=wf.lzin.metrics
         )
     )
     wf.metric.combine("fit_clf.split_index")
     wf.add(
-        get_shap(
+        get_shap_pdt(
             name="shap",
             X=wf.readcsv.lzout.X,
             permute=wf.lzin.permute,
@@ -75,16 +108,21 @@ def gen_workflow(inputs, cache_dir=None, cache_locations=None):
     return wf
 
 
-def run_workflow(wf, plugin, plugin_args):
+def run_workflow(wf, plugin, plugin_args, specfile="localspec"):
     cwd = os.getcwd()
     with pydra.Submitter(plugin=plugin, **plugin_args) as sub:
         sub(runnable=wf)
     results = wf.result(return_inputs=True)
     os.chdir(cwd)
+
     import pickle as pk
     import datetime
 
     timestamp = datetime.datetime.utcnow().isoformat()
+    timestamp = timestamp.replace(":", "").replace("-", "")
+    result_dir = f"out-{os.path.basename(specfile)}-{timestamp}"
+    os.makedirs(result_dir)
+    os.chdir(result_dir)
     with open(f"results-{timestamp}.pkl", "wb") as fp:
         pk.dump(results, fp)
 
@@ -95,4 +133,5 @@ def run_workflow(wf, plugin, plugin_args):
         gen_shap=wf.inputs.gen_shap,
         plot_top_n_shap=wf.inputs.plot_top_n_shap,
     )
+    os.chdir(cwd)
     return results
diff --git a/pydra_ml/cli.py b/pydra_ml/cli.py
@@ -45,4 +45,4 @@ def main(specfile, plugin, cache):
         if plugin[0] == "cf" and key == "n_procs":
             value = int(value)
         plugin_args[key] = value
-    run_workflow(wf, plugin[0], plugin_args)
+    run_workflow(wf, plugin[0], plugin_args, specfile)
diff --git a/pydra_ml/report.py b/pydra_ml/report.py
@@ -14,12 +14,7 @@ def save_obj(obj, path):
         pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
 
 
-def plot_summary(
-    summary,
-    output_dir=None,
-    filename="shap_LogisticRegression_all_predictions",
-    plot_top_n_shap=16,
-):
+def plot_summary(summary, output_dir=None, filename="shap_plot", plot_top_n_shap=16):
     plt.clf()
     plt.figure(figsize=(8, 12))
     # plot without all bootstrapping values
@@ -54,7 +49,7 @@ def shaps_to_summary(
     shaps_n_splits,
     feature_names=None,
     output_dir=None,
-    filename="shap_LogisticRegression_all_predictions",
+    filename="shap_summary",
     plot_top_n_shap=16,
 ):
     shaps_n_splits.columns = [
@@ -84,6 +79,7 @@ def shaps_to_summary(
 def gen_report_shap(results, output_dir="./", plot_top_n_shap=16):
     # Create shap_dir
     timestamp = datetime.datetime.utcnow().isoformat()
+    timestamp = timestamp.replace(":", "").replace("-", "")
     shap_dir = output_dir + f"shap-{timestamp}/"
     os.mkdir(shap_dir)
 
@@ -213,6 +209,7 @@ def gen_report(
         import datetime
 
         timestamp = datetime.datetime.utcnow().isoformat()
+        timestamp = timestamp.replace(":", "").replace("-", "")
         plt.savefig(f"test-{name}-{timestamp}.png")
 
     # create SHAP summary csv and figures
diff --git a/pydra_ml/tasks.py b/pydra_ml/tasks.py
@@ -1,15 +1,15 @@
 #!/usr/bin/env python
 
-import pydra
-import typing as ty
-import numpy as np
 
+def read_file(filename, x_indices=None, target_vars=None, group=None):
+    """Read a CSV data file
 
-@pydra.mark.task
-@pydra.mark.annotate(
-    {"return": {"X": ty.Any, "Y": ty.Any, "groups": ty.Any, "feature_names": ty.Any}}
-)
-def read_file(filename, x_indices=None, target_vars=None, group="groups"):
+    :param filename: CSV filename containing a column header
+    :param x_indices: integer or string indices
+    :param target_vars: Target variables to use
+    :param group: CSV column name containing grouping information
+    :return: Tuple containing feature data, target data, groups, feature names
+    """
     import pandas as pd
 
     data = pd.read_csv(filename)
@@ -20,17 +20,27 @@ def read_file(filename, x_indices=None, target_vars=None, group="groups"):
     else:
         raise ValueError(f"{x_indices} is not a list of string or ints")
     Y = data[target_vars]
-    if group in data.keys():
-        groups = data[:, [group]]
-    else:
+    if group is None:
         groups = list(range(X.shape[0]))
+    else:
+        groups = data[:, [group]]
     feature_names = list(X.columns)
     return X.values, Y.values, groups, feature_names
 
 
-@pydra.mark.task
-@pydra.mark.annotate({"return": {"splits": ty.Any, "split_indices": ty.Any}})
 def gen_splits(n_splits, test_size, X, Y, groups=None, random_state=0):
+    """Generate train-test splits for the data.
+
+    Uses GroupShuffleSplit from scikit-learn
+
+    :param n_splits: Number of splits
+    :param test_size: fractional test size
+    :param X: Sample feature data
+    :param Y: Sample target data
+    :param groups: Grouping of sample data for shufflesplit
+    :param random_state: randomization for shuffling (default 0)
+    :return: splits and indices to splits
+    """
     from sklearn.model_selection import GroupShuffleSplit
 
     gss = GroupShuffleSplit(
@@ -41,9 +51,17 @@ def gen_splits(n_splits, test_size, X, Y, groups=None, random_state=0):
     return train_test_splits, split_indices
 
 
-@pydra.mark.task
-@pydra.mark.annotate({"return": {"output": ty.Any, "model": ty.Any}})
-def train_test_kernel(X, y, train_test_split, split_index, clf_info, permute, metrics):
+def train_test_kernel(X, y, train_test_split, split_index, clf_info, permute):
+    """Core model fitting and predicting function
+
+    :param X: Input features
+    :param y: Target variables
+    :param train_test_split: split indices
+    :param split_index: which index to use
+    :param clf_info: how to construct the classifier
+    :param permute: whether to run it in permuted mode or not
+    :return: (true, predicted) test outputs and (pipeline, train indices, test indices)
+    """
     from sklearn.preprocessing import StandardScaler
     from sklearn.pipeline import Pipeline
     import numpy as np
@@ -68,9 +86,13 @@ def train_test_kernel(X, y, train_test_split, split_index, clf_info, permute, me
     return (y[test_index], predicted), (pipe, train_index, test_index)
 
 
-@pydra.mark.task
-@pydra.mark.annotate({"return": {"score": ty.Any, "output": ty.Any}})
 def calc_metric(output, metrics):
+    """Calculate the scores for the predicted outputs
+
+    :param output: true, predicted output
+    :param metrics: list of metrics to evaluate
+    :return: list of scores, and the input ``output`` returned alongside them
+    """
     score = []
     for metric in metrics:
         metric_mod = __import__("sklearn.metrics", fromlist=[metric])
@@ -79,9 +101,17 @@ def calc_metric(output, metrics):
     return score, output
 
 
-@pydra.mark.task
-@pydra.mark.annotate({"return": {"shaps": ty.Any}})
 def get_shap(X, permute, model, gen_shap=False, nsamples="auto", l1_reg="aic"):
+    """Compute shap information for the test data
+
+    :param X: sample data
+    :param permute: whether model was permuted or not
+    :param model: model containing trained classifier and train/test index
+    :param gen_shap: whether to generate shap features
+    :param nsamples: number of samples for shap evaluation
+    :param l1_reg: L1 regularization for shap evaluation
+    :return: shap values for each test sample
+    """
     if permute or not gen_shap:
         return []
     pipe, train_index, test_index = model
diff --git a/setup.cfg b/setup.cfg
@@ -24,6 +24,7 @@ classifiers =
 python_requires = >= 3.7
 install_requires =
     pydra >= 0.6
+    psutil
     scikit-learn
     seaborn
     click