diff --git a/examples/hpo/resnet50/README.md b/examples/hpo/resnet50/README.md new file mode 100644 index 00000000..83fb02f7 --- /dev/null +++ b/examples/hpo/resnet50/README.md @@ -0,0 +1,37 @@ +# Example of Black Box Optimization on ABCI 3.0 + +This is an example of performing black-box optimization of the learning rate for a ResNet50 model on the MNIST dataset. + +## Getting started + +In an environment where aiaccel is installed, additionally install the extra dependencies declared in this example's pyproject.toml. + +```bash +pip install . +``` + +PATH_TO_ENV in config_job.yaml should be changed to the path of the environment prepared above. + +```yaml + source PATH_TO_ENV +``` + +Run the following command to perform black-box optimization. + +```bash +cmd="aiaccel-job pbs --config config_job.yaml" + +aiaccel-hpo optimize --config config_hpo.yaml -- \ + $cmd train --n_gpus=1 {config.working_directory}/{job_name}.log -- \ + aiaccel-torch train config_torch.yaml \ + working_directory={config.working_directory}/{job_name}/ \ + task.optimizer_config.optimizer_generator.lr={lr} \ + out_filename={out_filename} +``` + +## Detailed Descriptions + +The target function for optimization using aiaccel.hpo.apps.optimize is objective_integration.main. +Within objective_integration.main, aiaccel.torch.apps.train is called with the suggested learning rate, and the resulting validation loss is returned as the objective value. 
+ +Detailed descriptions of torch and optimize are available in the aiaccel documentation: [torch](https://aistairc.github.io/aiaccel/user_guide/torch.html) and [optimize](https://aistairc.github.io/aiaccel/user_guide/hpo.html). diff --git a/examples/hpo/resnet50/config_hpo.yaml b/examples/hpo/resnet50/config_hpo.yaml new file mode 100644 index 00000000..3d7c8714 --- /dev/null +++ b/examples/hpo/resnet50/config_hpo.yaml @@ -0,0 +1,12 @@ +params: + _convert_: partial + _target_: aiaccel.hpo.apps.optimize.HparamsManager + lr: + _target_: aiaccel.hpo.optuna.suggest_wrapper.SuggestFloat + name: lr + low: 1.e-6 + high: 1.e-2 + log: true + +n_trials: 1 +n_max_jobs: 1 diff --git a/examples/hpo/resnet50/config_job.yaml b/examples/hpo/resnet50/config_job.yaml new file mode 100644 index 00000000..01727539 --- /dev/null +++ b/examples/hpo/resnet50/config_job.yaml @@ -0,0 +1,59 @@ +walltime: "1:0:0" + +script_prologue: | + echo Job ID: $PBS_JOBID + echo Hostname: $(hostname) + + export NVIDIA_VISIBLE_DEVICES=all + +qsub: "qsub -P $JOB_GROUP -l walltime={args.walltime} -v USE_SSH=1" + +cpu: + qsub_args: "-q rt_HF -l select=1" + job: "{command}" + +cpu-array: + n_tasks_per_proc: 128 + n_procs: 24 + qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" + job: "{command}" + +gpu: + qsub_args: "-q rt_HF -l select=1" + job: "{command}" + +gpu-array: + n_tasks_per_proc: 128 + n_procs: 8 + qsub_args: "-q rt_HF -l select=1 -J 1-{args.n_tasks}:$(( {args.n_tasks_per_proc} * {args.n_procs} ))" + job: "CUDA_VISIBLE_DEVICES=$(( LOCAL_PROC_INDEX % 8 )) {command}" + +mpi: + n_nodes: 1 + qsub_args: >- + -q rt_HF + -l select={args.n_nodes}:mpiprocs=$(( {args.n_procs} / {args.n_nodes} )):ompthreads=$(( {args.n_nodes} * 96 / {args.n_procs} )) + job: | + source /etc/profile.d/modules.sh + module load hpcx + + mpirun -np {args.n_procs} -bind-to none -map-by slot \ + -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \ + {command} + 
+train: + qsub_args: >- + -q $( (({args.n_gpus}==1)) && printf rt_HG || printf rt_HF ) + -l select=$(( ({args.n_gpus} + 7) / 8 )):mpiprocs=$( (({args.n_gpus}==1)) && printf 1 || printf 8 ):ompthreads=$( (({args.n_gpus}==1)) && printf 8 || printf 12 ) + job: | + source /etc/profile.d/modules.sh + module load hpcx + + mpirun -np {args.n_gpus} -bind-to none -map-by slot \ + -mca pml ob1 -mca btl self,tcp -mca btl_tcp_if_include bond0 \ + -x MAIN_ADDR=$(hostname -i) \ + -x MAIN_PORT=3000 \ + -x COLUMNS=120 \ + -x PYTHONUNBUFFERED=true \ + {command} + diff --git a/examples/hpo/resnet50/config_torch.yaml b/examples/hpo/resnet50/config_torch.yaml new file mode 100644 index 00000000..fe1e5e85 --- /dev/null +++ b/examples/hpo/resnet50/config_torch.yaml @@ -0,0 +1,11 @@ +_base_: + - ${working_directory}/../../../../torch/image_classification/recipes/resnet50.cifar50/config.yaml + +trainer: + max_epochs: 10 + + callbacks: + - _target_: aiaccel.torch.lightning.callback.SaveMetricCallback + metric_name: "validation/loss" + output_path: ${out_filename} +