aistairc · KanaiYuma-aist · Jun 18, 2025 · Jun 19, 2025 · Jun 19, 2025 · Jun 20, 2025
diff --git a/examples/integration/README.md b/examples/integration/README.md
@@ -0,0 +1,35 @@
+# Example of Black Box Optimization on ABCI 3.0
+
+This is an example of performing black-box optimization of the learning rate for a ResNet50 model on the MNIST dataset.
+
+## Getting started
+
+In an environment where aiaccel is already installed, additionally install the package defined by the `pyproject.toml` in this directory.
+
+```bash
+pip install .
+```
+
+In job_config.yaml, replace PATH_TO_ENV with the path of the activation script of the environment prepared above; it is passed to `source` in `script_prologue`.
+
+```yaml
+    source PATH_TO_ENV
+```
+
+Run the following command to perform black-box optimization.
+
+```bash
+aiaccel-hpo optimize --config hpo_config.yaml -- \
+    aiaccel-job pbs --config job_config.yaml train --n_gpus=1 jobs/{job_name}.log -- \
+        aiaccel-torch train resnet50/config.yaml \
+            working_directory=jobs/{job_name}/ \
+            task.optimizer_config.optimizer_generator.lr={lr} \
+            out_filename={out_filename}
+```
+
+## Detailed Descriptions
+
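+The target function for optimization using aiaccel.hpo.apps.optimize is the training command shown above (`aiaccel-torch train`), which is launched once per trial with the sampled learning rate `{lr}`.
+During training, the `SaveMetricCallback` configured in common_config.yaml writes `validation/loss` to `{out_filename}`; this validation loss, not the learning rate, is the objective value reported back to the optimizer.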
+
+Detailed descriptions of torch and optimize are available in the [aiaccel documentation (torch)](https://aistairc.github.io/aiaccel/user_guide/torch.html) and the [aiaccel documentation (optimize)](https://aistairc.github.io/aiaccel/user_guide/hpo.html).
diff --git a/examples/integration/common_config.yaml b/examples/integration/common_config.yaml
@@ -0,0 +1,62 @@
+trainer:
+  max_epochs: 10
+
+  callbacks:
+    - _target_: lightning.pytorch.callbacks.ModelCheckpoint
+      filename: "{epoch:04d}"
+      save_last: True
+      save_top_k: -1
+    - _target_: aiaccel.torch.lightning.callback.SaveMetricCallback
+      metric_name: "validation/loss"
+      output_path: ${out_filename}
+
+
+datamodule:
+  _target_: aiaccel.torch.lightning.datamodules.single_datamodule.SingleDataModule
+
+  train_dataset_fn:
+    _partial_: true
+    _target_: torchvision.datasets.MNIST
+    train: True
+
+  val_dataset_fn:
+    _partial_: true
+    _target_: torchvision.datasets.MNIST
+    train: False
+
+  common_args:
+    root: "./dataset"
+    download: True
+    transform: ${transform}
+
+  batch_size: 128
+  use_scatter: False
+
+
+# Preprocessing pipeline; referenced as ${transform} from datamodule.common_args.
+transform:
+  _target_: torchvision.transforms.Compose
+  transforms:
+    - _target_: torchvision.transforms.Resize
+      size: [256, 256]
+    # MNIST images are single-channel; 3 output channels are requested here,
+    # presumably to match the 3-channel input of torchvision's resnet50 — confirm.
+    - _target_: torchvision.transforms.Grayscale
+      num_output_channels: 3
+    - _target_: torchvision.transforms.ToTensor
+    # NOTE(review): a single mean/std value is given for a 3-channel tensor;
+    # presumably this relies on broadcasting across channels — confirm.
+    - _target_: torchvision.transforms.Normalize
+      mean: [0.5]
+      std: [0.5]
+
+task:
+  _target_: torchvision_task_integration.ImageClassificationTask
+  num_classes: 10
+
+  model:
+    _target_: torchvision.models.resnet50
+    weights:
+      _target_: hydra.utils.get_object
+      path: torchvision.models.ResNet50_Weights.DEFAULT
+
+  optimizer_config:
+    _target_: aiaccel.torch.lightning.OptimizerConfig
+    optimizer_generator:
+      _partial_: True
+      _target_: torch.optim.Adam
diff --git a/examples/integration/hpo_config.yaml b/examples/integration/hpo_config.yaml
@@ -0,0 +1,12 @@
+# Search space for the optimization: a single float hyperparameter named `lr`.
+params:
+  _convert_: partial
+  _target_: aiaccel.hpo.apps.optimize.HparamsManager
+  # Presumably the sampled value replaces the {lr} placeholder in the command
+  # given to `aiaccel-hpo optimize` (see README.md) — confirm.
+  lr:
+    _target_: aiaccel.hpo.optuna.suggest_wrapper.SuggestFloat
+    name: lr
+    low: 1.e-6
+    high: 1.e-2
+    # Presumably requests log-scale sampling, as in Optuna's suggest_float — confirm.
+    log: true
+
+# NOTE(review): with n_trials: 1 only one learning rate is evaluated;
+# increase this value to perform an actual search.
+n_trials: 1
+n_max_jobs: 1
diff --git a/examples/integration/job_config.yaml b/examples/integration/job_config.yaml
@@ -0,0 +1,60 @@
+walltime: "1:0:0"
+
+script_prologue: |
+    echo Job ID: $PBS_JOBID
+    echo Hostname: $(hostname)
+
+    export NVIDIA_VISIBLE_DEVICES=all
+    module load cuda/12.6/12.6.1 python/3.13/3.13.2
+    source PATH_TO_ENV
+
+qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1"
+
+cpu:
+    qsub_args: "-q rt_HF -l select=1"
+    job: "{command}"
+
+cpu-array:
+    n_tasks_per_proc: 128
+    n_procs: 24
+    qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))"
+    job: "{command}"
+
+gpu:
+    qsub_args: "-q rt_HF -l select=1"
+    job: "{command}"
+
+gpu-array:
+    n_tasks_per_proc: 128
+    n_procs: 8
+    qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))"
+    job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}"
+
+mpi:
+    n_nodes: 1
+    qsub_args: >-
+        -q rt_HF
+        -l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} ))
+    job: |
+        source /etc/profile.d/modules.sh
+        module load hpcx
+
+        mpirun -np {args.n_procs} -bind-to none -map-by slot \
+            -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \
+            {command}
+
+# `train` job type: runs {command} under mpirun with -np {args.n_gpus}.
+train:
+    # Resource request derived from {args.n_gpus} by the shell expressions below:
+    #   queue (-q):  rt_HG when n_gpus == 1, otherwise rt_HF
+    #   select:      (n_gpus + 7) / 8 in integer arithmetic, i.e. n_gpus / 8 rounded up
+    #   mpiprocs:    1 when n_gpus == 1, otherwise 8
+    #   ompthreads:  8 when n_gpus == 1, otherwise 12
+    qsub_args: >-
+        -q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF )
+        -l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 )
+    # The -x options export MAIN_ADDR (output of `hostname -i` on the launching
+    # host), MAIN_PORT, COLUMNS and PYTHONUNBUFFERED to the launched processes.
+    # MAIN_ADDR/MAIN_PORT are presumably used for distributed rendezvous — confirm.
+    job: |
+        source /etc/profile.d/modules.sh
+        module load hpcx
+
+        mpirun -np {args.n_gpus} -bind-to none -map-by slot \
+            -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \
+            -x MAIN_ADDR=$(hostname -i) \
+            -x MAIN_PORT=3000 \
+            -x COLUMNS=120 \
+            -x PYTHONUNBUFFERED=true \
+            {command}
diff --git a/examples/integration/pyproject.toml b/examples/integration/pyproject.toml
@@ -0,0 +1,19 @@
+[build-system]
+requires = ["setuptools>=61.0", "wheel"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "task"
+version = "0.1.0"
+description = "Package for integration example."
+authors = [
+    {name = "AIST", email = "[email protected]"}
+]
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "torchvision"
+]
+
+[tool.setuptools]
+package-dir = {"" = "task"}
diff --git a/examples/integration/resnet50/config.yaml b/examples/integration/resnet50/config.yaml
@@ -0,0 +1,3 @@
+_base_:
+  - ${base_config_path}/train_base.yaml
+  - ../common_config.yaml
diff --git a/examples/integration/task/torchvision_task_integration.py b/examples/integration/task/torchvision_task_integration.py
@@ -0,0 +1,56 @@
+import torch
+from torch import nn
+from torch.nn import functional as func
+
+from torchmetrics.classification import MulticlassAccuracy
+
+from aiaccel.torch.lightning import OptimizerConfig, OptimizerLightningModule
+
+
+class ImageClassificationTask(OptimizerLightningModule):
+    """Image-classification task that wraps a model and logs loss and accuracy.
+
+    The wrapped model's ``fc`` layer is replaced by a new ``nn.Linear`` with
+    ``num_classes`` outputs when ``fc`` exposes an integer ``in_features``.
+    Cross-entropy loss and multiclass accuracy are logged for both the
+    training and the validation step.
+    """
+
+    def __init__(self, model: nn.Module, optimizer_config: OptimizerConfig, num_classes: int = 10):
+        """Store the model, adapt its ``fc`` head and create the accuracy metrics.
+
+        Args:
+            model: Network to train; its ``fc`` attribute is read below.
+            optimizer_config: Forwarded unchanged to the parent constructor.
+            num_classes: Output size of the replacement ``fc`` layer and the
+                ``num_classes`` given to both accuracy metrics.
+        """
+        super().__init__(optimizer_config)
+
+        self.model = model
+        # NOTE(review): ``self.model.fc`` is accessed unconditionally, so a model
+        # without an ``fc`` attribute raises AttributeError here. If ``fc`` has no
+        # integer ``in_features`` the original head is kept as-is.
+        if hasattr(self.model.fc, "in_features") and isinstance(self.model.fc.in_features, int):
+            self.model.fc = nn.Linear(self.model.fc.in_features, num_classes)
+
+        # Separate metric objects for the training and validation steps.
+        self.train_accuracy = MulticlassAccuracy(num_classes=num_classes)
+        self.val_accuracy = MulticlassAccuracy(num_classes=num_classes)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Return the wrapped model's output for ``x``."""
+        return self.model(x)  # type: ignore
+
+    def training_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> torch.Tensor:
+        """Compute and log the training loss and accuracy for one batch.
+
+        Args:
+            batch: ``(x, y)`` pair of inputs and targets.
+            batch_idx: Index of the batch; not used here.
+
+        Returns:
+            The cross-entropy loss between the model output and ``y``.
+        """
+        x, y = batch
+
+        logits = self(x)
+
+        loss = func.cross_entropy(logits, y)
+
+        acc = self.train_accuracy(logits, y)
+        self.log_dict(
+            {
+                "training/loss": loss,
+                "training/acc": acc,
+            },
+            prog_bar=True,
+        )
+
+        return loss
+
+    def validation_step(self, batch: tuple[torch.Tensor, torch.Tensor], batch_idx: int) -> None:
+        """Compute and log the validation loss and accuracy for one batch.
+
+        Args:
+            batch: ``(x, y)`` pair of inputs and targets.
+            batch_idx: Index of the batch; not used here.
+        """
+        x, y = batch
+
+        logits = self(x)
+
+        loss = func.cross_entropy(logits, y)
+
+        acc = self.val_accuracy(logits, y)
+        # "validation/loss" is the metric name that SaveMetricCallback is
+        # configured with in common_config.yaml.
+        self.log_dict(
+            {
+                "validation/loss": loss,
+                "validation/acc": acc,
+            },
+            prog_bar=True,
+        )