Changes from 29 commits
Commits (33)
- 51fdb46 First commit (shihkual, Oct 25, 2022)
- fd68dfa Create README.md (shihkual, Oct 25, 2022)
- e7886c0 Update README.md (shihkual, Oct 25, 2022)
- ad97f5e Update README.md (shihkual, Oct 25, 2022)
- ecf11b6 Update README.md (shihkual, Oct 25, 2022)
- 7d5b80d Update to allow running from clean environment (cbkerr, Nov 10, 2022)
- e4d1608 Add UMAP dimension reduction on latent space in vae.plot_latent (shihkual, Nov 10, 2022)
- 5ae22fd Add Notes and image labels to dashboard (cbkerr, Nov 11, 2022)
- 4e9393a Update readme with umap installation info (cbkerr, Nov 11, 2022)
- f3189e4 Correct typo in folder name (cbkerr, Nov 11, 2022)
- cec7662 Update gitignores (cbkerr, Nov 11, 2022)
- 9d5a380 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Nov 11, 2022)
- 9150012 refactor: Simplify the logic. (b-butler, Oct 11, 2023)
- 572d3eb [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 11, 2023)
- 6003b46 refactor: Simplify the logic. (b-butler, Oct 11, 2023)
- 9a0dc2b Merge branch 'main' into pytorch (b-butler, Oct 11, 2023)
- 071102f refactor: Move to matplotlib object style. (b-butler, Oct 11, 2023)
- 65e9917 fix: pre-commit errors (b-butler, Oct 11, 2023)
- c34ea12 fix: pre-commit styling (b-butler, Oct 11, 2023)
- 66138f1 refactor: Move label.py and workflow.py to project.py (b-butler, Oct 11, 2023)
- f720727 refactor: Simplify/shorten VAE logic (b-butler, Oct 11, 2023)
- 3bb456c fix: Correct logic and syntax. (b-butler, Oct 12, 2023)
- d0779f7 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 12, 2023)
- d613f2b refactor: Simplify logic further (b-butler, Oct 12, 2023)
- 1852039 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 12, 2023)
- 4b2224b fix: post condition on evaluation (b-butler, Oct 13, 2023)
- 7b05cee refactor: Use a.b.c over from a import b (b-butler, Oct 13, 2023)
- 3d63f9e [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Oct 13, 2023)
- a4dd55d Merge branch 'main' into pytorch (b-butler, Nov 14, 2023)
- b91f776 fix: Remove unused imports (b-butler, Nov 14, 2023)
- b7ccecd ci: Add pytorch dependencies to conda environment. (b-butler, Nov 14, 2023)
- 1a6b0c3 ci: fix name of PyTorch's conda-forge package. (b-butler, Nov 15, 2023)
- 2a39172 Merge branch 'main' into pytorch (b-butler, Jan 9, 2024)
1 change: 1 addition & 0 deletions .gitignore
@@ -6,6 +6,7 @@
.signac
.signac_shell_history
.signac_sp_cache.json.gz
*__pycache__
notebooks/projects
notebooks/static
signac.rc
1 change: 1 addition & 0 deletions projects/flow.pytorch.HyperparameterOptimize/.gitignore
@@ -0,0 +1 @@
source/data
80 changes: 80 additions & 0 deletions projects/flow.pytorch.HyperparameterOptimize/README.md
@@ -0,0 +1,80 @@
An example of using signac with [PyTorch] to train a [Variational Autoencoder] (VAE).

This example project demonstrates how to use signac to optimize the hyperparameters of a VAE trained on the popular [MNIST] dataset.

[Variational Autoencoder]: https://arxiv.org/pdf/1312.6114.pdf
[PyTorch]: https://pytorch.org/
[MNIST]: https://pytorch.org/vision/main/generated/torchvision.datasets.MNIST.html

## Prerequisites

This example uses the following Python packages:

* [NumPy](https://github.com/numpy/numpy)
* [matplotlib](https://github.com/matplotlib/matplotlib)
* [signac](https://github.com/glotzerlab/signac)
* [signac-flow](https://github.com/glotzerlab/signac-flow)
* [signac-dashboard](https://github.com/glotzerlab/signac-dashboard)
* [h5py](https://github.com/h5py/h5py)
* [umap-learn](https://github.com/lmcinnes/umap)
* [PyTorch](https://github.com/pytorch/pytorch)
* [torchvision](https://github.com/pytorch/vision)

Conda users can install these from [conda-forge](https://conda-forge.org/) and the `pytorch` channel:

```
conda create --name pytorch python=3.10 matplotlib numpy signac-flow signac-dashboard h5py umap-learn -c conda-forge
conda activate pytorch
conda install pytorch torchvision==0.13.0 -c pytorch
```

## Usage

1. Initialize the project with

```
python init.py
```

In `init.py`, you can define the VAE hyperparameters you would like to try.

- This checks whether the [MNIST] dataset has already been downloaded. If not, it downloads it automatically and stores it in `source/data/MNIST`.
- This creates the `workspace/` directory, which holds all of our `jobs`. Each `job` has its own directory, named by the job's unique `id` (something like `87c7fccdea3531da704bbae95e95e914`).
- If you look in these directories, you'll see `signac_statepoint.json`, a JSON file that contains the statepoint parameters (the VAE hyperparameters) for that job. You can inspect the jobs and their statepoints from Python, as sketched below.
- NOTE: The job's `id` is generated from the dict of statepoint parameters, so do not edit the directory name or `signac_statepoint.json`.
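
A minimal sketch of how to query the initialized workspace with the standard signac API (the `lr` value in the filter is one of the values defined in `init.py`):

```
import signac

project = signac.get_project()  # locates the signac project in the current directory
print(f"{len(project)} jobs in the workspace")

for job in project:
    # job.sp holds exactly the statepoint stored in signac_statepoint.json
    print(job.id, dict(job.sp))

# Select only the jobs with a particular learning rate
for job in project.find_jobs({"lr": 0.001}):
    print("lr = 0.001:", job.id)
```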

2. All of the operations we will perform on the [MNIST] dataset are defined in `project.py`. An operation is a function with `job` as its only argument that signac-flow recognizes as part of your workflow.
- You can tell which functions are operations because they carry the `@Project.operation` decorator, which tells signac-flow that the function should be treated as an operation associated with `Project`.
- Operations typically have pre- and post-conditions; this is how signac-flow decides when an operation is eligible to run. For example, `train()` has no pre-condition because it must run first, but it has the post-condition `@Project.post.true("train_done")` so that it is not re-run once it has completed. A minimal sketch of this pattern follows.
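
A stripped-down illustration of the decorator pattern used in `project.py` (`example_op` and the `example_op_done` document key are placeholders, not part of this project):

```
from flow import FlowProject


class Project(FlowProject):
    pass


# Post-condition: the operation counts as complete once the job document
# contains {"example_op_done": True}, so signac-flow stops scheduling it.
@Project.post.true("example_op_done")
@Project.operation
def example_op(job):
    # ... do some work for this job's statepoint ...
    job.doc["example_op_done"] = True  # satisfy the post-condition


if __name__ == "__main__":
    Project().main()
```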

3. The machine-learning helpers used by `project.py` are defined in `source/vae.py`, while the workflow logic itself lives in `project.py`.
- `project.py` defines the status label, the pre-/post-conditions, and the operation group that tie the training and evaluation steps together.
- `source/vae.py` defines everything related to training and evaluating the VAE with PyTorch, e.g., loading the [MNIST] data, fitting the model, and plotting the learning curve, the latent space, and the reconstructed digits. A sketch of the interface that `project.py` expects from it follows.
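
The contents of `source/vae.py` are not reproduced here; the stub below only records the interface that `project.py` assumes from it (function names and arguments are taken from the calls in `project.py`, the docstrings are informal descriptions, and the bodies are omitted):

```
# Interface sketch only -- the real implementations live in source/vae.py.


def load_data(job):
    """Return (train_loader, val_loader) for MNIST, built with job.sp.batch_size."""
    ...


def fit(job, train_loader, val_loader, device):
    """Train the VAE described by job.sp (epochs, lr, hidden_dim, latent_dim) on device."""
    ...


def plot_loss(job):
    """Save the learning curve as Loss.jpg in the job directory."""
    ...


def plot_reconstruction(job, dataset, plot_arrangement, device):
    """Save digits_orig.jpg and digits_recon.jpg (original vs. reconstructed digits)."""
    ...


def plot_latent(job, dataset, device):
    """Save latent.jpg, a UMAP projection of the latent space."""
    ...
```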

4. Now let's run the operations:

- First run a status check:

```
python project.py status -d
```

- Now run eligible jobs:

```
python project.py run -o train
```

- After the training is done, you can run post-evaluation:

```
python project.py run -o evaluation
```

5. To visualize the post-evaluation results, launch [signac-dashboard](https://github.com/glotzerlab/signac-dashboard):

```
python dashboard.py run
```

And open `http://localhost:8888/` in your web browser.

**NOTE**: If you want to run this tutorial from scratch, just run `rm -rf workspace/` to delete the workspace.
34 changes: 34 additions & 0 deletions projects/flow.pytorch.HyperparameterOptimize/dashboard.py
@@ -0,0 +1,34 @@
from signac_dashboard import Dashboard
from signac_dashboard.modules import (
DocumentList,
ImageViewer,
Notes,
StatepointList,
VideoViewer,
)


class MyDashboard(Dashboard):
def job_title(self, job):
return (
f"Total epochs={job.sp.epochs}, Learning rate={job.sp.lr}, "
f"Latent space dimension={job.sp.latent_dim}"
)

def job_sorter(self, job):
return (-job.sp.lr, job.sp.hidden_dim, job.sp.epochs)


if __name__ == "__main__":
MyDashboard(
modules=[
ImageViewer(img_globs=["Loss.jpg"], name="Learning Curve"),
ImageViewer(img_globs=["latent.jpg"], name="Latent Space"),
ImageViewer(img_globs=["digits_recon.jpg"], name="Reconstructed"),
ImageViewer(img_globs=["digits_orig.jpg"], name="Original"),
StatepointList(),
VideoViewer(),
DocumentList(),
Notes(),
]
).main()
48 changes: 48 additions & 0 deletions projects/flow.pytorch.HyperparameterOptimize/init.py
@@ -0,0 +1,48 @@
import itertools
import os

import signac
import torchvision.transforms as transforms
from torchvision import datasets
from tqdm import tqdm

PR = signac.init_project()
HYPER_PARAMS = {
"seed": [1],
"epochs": [10, 20, 30, 40],
"batch_size": [64],
"lr": [0.0001, 0.0005, 0.001, 0.005, 0.01],
"hidden_dim": [128, 256, 512],
"latent_dim": [16],
}


def download_MNIST():
# transforms
transform = transforms.Compose(
[
transforms.ToTensor(),
]
)

# train and validation data
datasets.MNIST(root="./source/data", train=True, download=True, transform=transform)
datasets.MNIST(
root="./source/data", train=False, download=True, transform=transform
)


def generate_workspace(pr, hyper_params):
for sp in cartesian(**hyper_params):
pr.open_job(sp).init()


def cartesian(**kwargs):
    """Yield one statepoint dict for every combination of the hyperparameter values."""
    for combo in tqdm(itertools.product(*kwargs.values())):
        yield dict(zip(kwargs.keys(), combo))


if __name__ == "__main__":
generate_workspace(PR, HYPER_PARAMS)
if not os.path.exists("./source/data/MNIST"):
download_MNIST()
69 changes: 69 additions & 0 deletions projects/flow.pytorch.HyperparameterOptimize/project.py
@@ -0,0 +1,69 @@
import torch
from flow import FlowProject
from source import vae


class Project(FlowProject):
pass


@Project.label
def status_label(job):
return ", ".join(
[
f"{check_point}_completed"
for check_point in ("train", "evaluation")
if job.doc.get(check_point + "_done", False)
]
)


def gpu_directives(walltime: float = 0.5, n_gpu: int = 1):
return {"nranks": n_gpu, "ngpu": n_gpu, "walltime": walltime}


def cpu_directives(walltime: float = 0.5, n_cpu: int = 1):
return {"nranks": n_cpu, "walltime": walltime}


def store_success_to_doc(operation_name, job):
job.doc.update({f"{operation_name}_done": True})


training_group = Project.make_group(name="trainings")

TRAIN_WALLTIME = 1
EVAL_WALLTIME = 0.5


@training_group
@Project.operation_hooks.on_success(store_success_to_doc)
@Project.post.true("train_done")
@Project.operation(directives=gpu_directives(walltime=TRAIN_WALLTIME))
def train(job):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_loader, val_loader = vae.load_data(job)
vae.fit(job=job, train_loader=train_loader, val_loader=val_loader, device=device)


@training_group
@Project.operation_hooks.on_success(store_success_to_doc)
@Project.post.true("evaluation_done")
@Project.pre.after(train)
@Project.operation(directives=gpu_directives(walltime=EVAL_WALLTIME))
def evaluation(job):
vae.plot_loss(job)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
_, val_loader = vae.load_data(job)
dataset = val_loader.dataset
vae.plot_reconstruction(
job=job, dataset=dataset, plot_arrangement=(3, 3), device=device
)
vae.plot_latent(job=job, dataset=dataset, device=device)


if __name__ == "__main__":
Project().main()