diff --git a/.github/utils/dryruns.py b/.github/utils/dryruns.py
index b0631cd8e..f84a89bb1 100644
--- a/.github/utils/dryruns.py
+++ b/.github/utils/dryruns.py
@@ -26,6 +26,9 @@
     # Not mpionlyexperiment
     "py-scaffold+strong",
     "py-scaffold+weak",
+    # Needs package_manager=spack-pip
+    "py-scaffold+rocm",
+    "py-scaffold+cuda",
 ]
diff --git a/.gitlab/tests/shared_flux_clusters.yml b/.gitlab/tests/shared_flux_clusters.yml
index a53f82d22..96934b09b 100644
--- a/.gitlab/tests/shared_flux_clusters.yml
+++ b/.gitlab/tests/shared_flux_clusters.yml
@@ -130,7 +130,8 @@ run_tests_flux_tuolumne:
       - HOST: tuolumne
         ARCHCONFIG: llnl-elcapitan
         BENCHMARK: [py-scaffold]
-        VARIANT: ['+rocm package_manager=spack-pip caliper=mpi,time,rocm']
+        VARIANT:
+          - +rocm package_manager=spack-pip caliper=mpi,time,rocm allocation=torchrun-hpc
       # rocm7
       - HOST: tuolumne
         ARCHCONFIG: llnl-elcapitan
@@ -152,4 +153,5 @@ run_tests_flux_tioga:
       - HOST: tioga
         ARCHCONFIG: llnl-elcapitan
         BENCHMARK: [py-scaffold]
-        VARIANT: ['+rocm package_manager=spack-pip caliper=mpi,time,rocm']
+        VARIANT:
+          - +rocm package_manager=spack-pip caliper=mpi,time,rocm allocation=torchrun-hpc
diff --git a/.gitlab/tests/shared_slurm_clusters.yml b/.gitlab/tests/shared_slurm_clusters.yml
index 5e97cf9b1..2ffcf3ead 100644
--- a/.gitlab/tests/shared_slurm_clusters.yml
+++ b/.gitlab/tests/shared_slurm_clusters.yml
@@ -151,4 +151,5 @@ run_tests_slurm_matrix:
       - HOST: matrix
         ARCHCONFIG: llnl-matrix
         BENCHMARK: [py-scaffold]
-        VARIANT: ['+cuda package_manager=spack-pip caliper=mpi,time,cuda']
+        VARIANT:
+          - +cuda package_manager=spack-pip caliper=mpi,time,cuda allocation=torchrun-hpc
diff --git a/experiments/osu-micro-benchmarks/experiment.py b/experiments/osu-micro-benchmarks/experiment.py
index 7d129effc..8e310dedb 100644
--- a/experiments/osu-micro-benchmarks/experiment.py
+++ b/experiments/osu-micro-benchmarks/experiment.py
@@ -107,7 +107,7 @@ class OsuMicroBenchmarks(
 
     def compute_applications_section(self):
-        num_nodes = {"n_nodes": 2}
+        num_nodes = {"n_nodes": 2, "n_ranks": 1}
 
         if self.spec.satisfies("exec_mode=test"):
             for pk, pv in num_nodes.items():
@@ -119,8 +119,7 @@ def compute_applications_section(self):
             self.add_experiment_variable("additional_args", " -d cuda", False)
         if self.spec.satisfies("+rocm") or self.spec.satisfies("+cuda"):
             resource = "n_gpus"
-            for pk, pv in num_nodes.items():
-                self.add_experiment_variable("n_gpus", pv, True)
+            self.add_experiment_variable("n_gpus", 1, True)
         else:
             resource = "n_nodes"
 
diff --git a/experiments/py-scaffold/experiment.py b/experiments/py-scaffold/experiment.py
index d07c3f3b1..0a2a1d287 100644
--- a/experiments/py-scaffold/experiment.py
+++ b/experiments/py-scaffold/experiment.py
@@ -68,6 +68,15 @@ def compute_applications_section(self):
         )
 
     def compute_package_section(self):
+        if self.spec.variants["package_manager"][0] != "spack-pip":
+            raise ValueError(
+                "Use the 'spack-pip' package manager for this benchmark. Set 'package_manager=spack-pip'"
+            )
+        elif self.spec.variants["allocation"][0] != "torchrun-hpc":
+            raise ValueError(
+                "Use the 'torchrun-hpc' launcher mode for this benchmark. Set 'allocation=torchrun-hpc'"
+            )
+
         # Spec that will be written into requirements.txt for pip install
         sys_name = self.system_spec._name
         if self.spec.satisfies("+rocm"):
diff --git a/lib/benchpark/experiment.py b/lib/benchpark/experiment.py
index 77f427c25..1caf40ce1 100644
--- a/lib/benchpark/experiment.py
+++ b/lib/benchpark/experiment.py
@@ -223,6 +223,13 @@ class Experiment(ExperimentSystemBase, ExecMode, Affinity, Hwloc):
         description="Number of experiment repetitions",
     )
 
+    variant(
+        "allocation",
+        default="standard",
+        values=("standard", "torchrun-hpc"),
+        description="Allocation modifier mode",
+    )
+
     def __init__(self, spec):
         self.spec: "benchpark.spec.ConcreteExperimentSpec" = spec
         # Device type must be set before super with absence of mpionly experiment type
@@ -372,7 +379,10 @@ def compute_modifiers_section(self):
 
     def compute_modifiers_section_wrapper(self):
         # by default we use the allocation modifier and no others
-        modifier_list = [{"name": "allocation"}, {"name": "exit-code"}]
+        modifier_list = [
+            {"name": "allocation", "mode": self.spec.variants["allocation"][0]},
+            {"name": "exit-code"},
+        ]
         modifier_list += self.compute_modifiers_section()
         for cls in self.helpers:
             cls_list = cls.compute_modifiers_section()
diff --git a/lib/benchpark/test/caliper.py b/lib/benchpark/test/caliper.py
index 108f3955f..7258a9972 100644
--- a/lib/benchpark/test/caliper.py
+++ b/lib/benchpark/test/caliper.py
@@ -30,6 +30,7 @@ def test_experiment_compute_variables_section_caliper(monkeypatch):
     assert vars_section == {
         "caliper_metadata": {
             "affinity": "none",
+            "allocation": "standard",
             "hwloc": "none",
             "application_name": "{application_name}",
             "experiment_name": "{experiment_name}",
@@ -89,6 +90,7 @@ def test_caliper_modifier(monkeypatch):
     # Check file
     assert data == {
         "sys_cores_per_node": 84,
+        "allocation": "standard",
         "scheduler": "flux",
         "rocm_arch": "gfx942",
         "sys_cores_os_reserved_per_node": 12,
diff --git a/lib/benchpark/test/experiment.py b/lib/benchpark/test/experiment.py
index 0c5b54353..87f71bc5b 100644
--- a/lib/benchpark/test/experiment.py
+++ b/lib/benchpark/test/experiment.py
@@ -141,7 +141,10 @@ def test_default_modifiers_section():
 
     modifiers_section = experiment.compute_modifiers_section_wrapper()
 
-    assert modifiers_section == [{"name": "allocation"}, {"name": "exit-code"}]
+    assert modifiers_section == [
+        {"name": "allocation", "mode": "standard"},
+        {"name": "exit-code"},
+    ]
 
 
 def test_multiple_models():
diff --git a/modifiers/allocation/modifier.py b/modifiers/allocation/modifier.py
index 4fb42fabf..b3649ecaa 100644
--- a/modifiers/allocation/modifier.py
+++ b/modifiers/allocation/modifier.py
@@ -227,10 +227,11 @@ class Allocation(BasicModifier):
 
     tags("infrastructure")
 
-    # Currently there is only one mode. The only behavior supported right
-    # now is to attempt to request "enough" resources for a given
-    # request (e.g. to make sure we request enough nodes, assuming we
-    # know how many CPUs we want)
+    mode(
+        name="torchrun-hpc",
+        description="Use torchrun-hpc as launcher instead of default scheduler launcher.",
+    )
     mode("standard", description="Standard execution mode for allocation")
 
     default_mode("standard")
 
@@ -294,6 +295,9 @@ def determine_allocation(self, v):
             )
             v.n_nodes = max(cores_node_request or 0, gpus_node_request or 0)
 
+        if not v.n_ranks_per_node:
+            v.n_ranks_per_node = v.n_ranks // v.n_nodes
+
         if not v.n_threads_per_proc:
             v.n_threads_per_proc = 1
 
@@ -313,10 +317,20 @@ def determine_allocation(self, v):
     def slurm_instructions(self, v):
         sbatch_opts, srun_opts = Allocation._init_batch_and_cmd_opts(v)
 
+        launch_cmd = "srun" if self._usage_mode == "standard" else self._usage_mode
+
         if v.n_ranks:
-            srun_opts.append(f"-n {v.n_ranks}")
+            if self._usage_mode == "torchrun-hpc":
+                srun_opts.append(f"-n {v.n_ranks_per_node}")
+            else:
+                srun_opts.append(f"-n {v.n_ranks}")
+            sbatch_opts.append(f"-n {v.n_ranks}")
         if v.n_gpus:
-            srun_opts.append(f"--gpus {v.n_gpus}")
+            if self._usage_mode == "torchrun-hpc":
+                srun_opts.append("--gpus-per-proc=1")
+            else:
+                srun_opts.append(f"--gpus {v.n_gpus}")
+            sbatch_opts.append(f"--gpus {v.n_gpus}")
         if v.n_nodes:
             srun_opts.append(f"-N {v.n_nodes}")
 
@@ -331,9 +345,9 @@ def slurm_instructions(self, v):
 
             sbatch_opts.append("--exclusive")
 
-        sbatch_directives = list(f"#SBATCH {x}" for x in (srun_opts + sbatch_opts))
+        sbatch_directives = list(f"#SBATCH {x}" for x in (sbatch_opts))
 
-        v.mpi_command = f"srun {' '.join(srun_opts)}"
+        v.mpi_command = f"{launch_cmd} {' '.join(srun_opts)}"
         v.batch_submit = "sbatch {execute_experiment}"
         v.allocation_directives = "\n".join(sbatch_directives)
 
@@ -384,18 +398,29 @@ def _init_batch_and_cmd_opts(v):
     def flux_instructions(self, v):
         batch_opts, cmd_opts = Allocation._init_batch_and_cmd_opts(v)
 
+        launch_cmd = "flux run" if self._usage_mode == "standard" else self._usage_mode
+
         # Always run exclusive for mpibind + flux.
         # Otherwise, binding may oversubscribe cores before all cores are allocated.
         cmd_opts.append("--exclusive")
+        batch_opts.append("--exclusive")
 
         # Required for '--exclusive'. Will be computed, if not defined, from initialization
         cmd_opts.append(f"-N {v.n_nodes}")
+        batch_opts.append(f"-N {v.n_nodes}")
 
         cmd_ranks = ""
         if v.n_ranks:
-            cmd_ranks = f"-n {v.n_ranks}"
+            if self._usage_mode == "torchrun-hpc":
+                cmd_ranks = f"-n {v.n_ranks_per_node}"
+            else:
+                cmd_ranks = f"-n {v.n_ranks}"
 
         if v.n_gpus:
             gpus_per_rank = 1  # self.gpus_as_gpus_per_rank(v)
-            cmd_opts.append(f"-g={gpus_per_rank}")
+            if self._usage_mode == "torchrun-hpc":
+                cmd_opts.append(f"--gpus-per-proc={gpus_per_rank}")
+            else:
+                cmd_opts.append(f"-g={gpus_per_rank}")
+            batch_opts.append(f"-g={gpus_per_rank}")
 
         if v.queue:
             batch_opts.append(f"-q {v.queue}")
@@ -406,9 +431,9 @@ def flux_instructions(self, v):
 
         if v.bank:
             batch_opts.append(f"-B {v.bank}")
 
-        batch_directives = list(f"# flux: {x}" for x in (cmd_opts + batch_opts))
+        batch_directives = list(f"# flux: {x}" for x in (batch_opts))
 
-        v.mpi_command = f"flux run {' '.join([cmd_ranks] + cmd_opts)}"
+        v.mpi_command = f"{launch_cmd} {' '.join([cmd_ranks] + cmd_opts)}"
         v.batch_submit = "flux batch {execute_experiment}"
         v.allocation_directives = "\n".join(batch_directives)
diff --git a/repos/ramble_applications/py_scaffold/application.py b/repos/ramble_applications/py_scaffold/application.py
index 2326c635f..ac6b79f0f 100644
--- a/repos/ramble_applications/py_scaffold/application.py
+++ b/repos/ramble_applications/py_scaffold/application.py
@@ -14,31 +14,31 @@ class PyScaffold(ExecutableApplication):
 
     tags = ["python"]
 
-    register_phase("prepend_library_path", pipeline="setup", run_before=["make_experiments"])
+    register_phase(
+        "prepend_library_path", pipeline="setup", run_before=["make_experiments"]
+    )
 
     def _prepend_library_path(self, workspace, app_inst=None):
         """Function to prepend to LD_LIBRARY_PATH, can't do in spack because
         python_platlib points to wrong site-packages dir"""
         paths = []
-        # if cuda
-        if "cuda_arch" in app_inst.variables.keys():
-            # Avoid libcudnn_graph.so error (unnecessary if cuX_full, necessary if cuX wheel)
-            paths.append("{pip_site_packages_path}/nvidia/cudnn/lib")
 
         app_inst.variables["rocm_mods"] = ""
         if "rocm_arch" in app_inst.variables.keys():
-            app_inst.variables["rocm_mods"] = "module load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric\nexport SPINDLE_FLUXOPT=off\nexport LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so\nexport MPICH_GPU_SUPPORT_ENABLED=0\nexport LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH\nexport LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH\n"
-
-        # if caliper - Avoid libcaffe2_nvrtc.so
-        paths.append("{pip_site_packages_path}/torch/lib")
+            app_inst.variables["rocm_mods"] = (
+                "module load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric\nexport SPINDLE_FLUXOPT=off\nexport LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so\nexport MPICH_GPU_SUPPORT_ENABLED=0\nexport LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH\nexport LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH\n"
+            )
 
         app_inst.variables["ld_paths"] = ":".join(paths)
 
     with when("package_manager_family=pip"):
         software_spec("scaffold", pkg_spec="py-scaffold")
 
-    input_file('config_file', url='https://github.com/LBANN/ScaFFold/blob/main/ScaFFold/configs/benchmark_default.yml',
-               md5='eb9b1337aa16dfc7feed0936e61b410c8f2caa89773fd60321254e9ba8cf3771',
-               description='')
+    input_file(
+        "config_file",
+        url="https://raw.githubusercontent.com/LBANN/ScaFFold/refs/heads/main/ScaFFold/configs/benchmark_default.yml",
+        expand=False,
+        description="",
+    )
 
     # TODO: Figure out MPICH_GPU_SUPPORT_ENABLED=0, disabling GTL otherwise linker error.
     executable(
@@ -47,13 +47,13 @@ def _prepend_library_path(self, workspace, app_inst=None):
     )
     executable(
         "generate",
-        "scaffold generate_fractals -c {config_file} --problem-scale {problem_scale}",
-        use_mpi=True,
+        "$(which scaffold) generate_fractals -c {config_file} --problem-scale {problem_scale}",
+        use_mpi=False,
     )
     executable(
         "run",
-        "scaffold benchmark -c {config_file} --problem-scale {problem_scale}",
+        "$(which scaffold) benchmark -c {config_file} --problem-scale {problem_scale}",
         use_mpi=True,
     )
 
-    workload("sweep", executables=["modules", "generate", "run"])
+    workload("sweep", executables=["modules", "generate", "run"], input="config_file")
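
Note (illustrative, not part of the patch): the sketch below is a minimal, self-contained rendering of the launcher-selection behavior the new torchrun-hpc mode gives the allocation modifier on Slurm systems. In torchrun-hpc mode the per-run command asks only for per-node ranks and one GPU per process, while the full rank and GPU totals move into the #SBATCH directives. The Alloc container and build_slurm_commands helper are hypothetical names used only for demonstration; they are not benchpark or ramble APIs.

    # Illustrative sketch only: mirrors the mode logic added to
    # modifiers/allocation/modifier.py in this patch.
    from dataclasses import dataclass


    @dataclass
    class Alloc:
        n_nodes: int
        n_ranks: int
        n_gpus: int
        n_ranks_per_node: int = 0


    def build_slurm_commands(v: Alloc, usage_mode: str = "standard"):
        # Mirror determine_allocation(): derive per-node ranks when not given.
        if not v.n_ranks_per_node:
            v.n_ranks_per_node = v.n_ranks // v.n_nodes

        # "standard" keeps srun; any other mode name becomes the launcher itself.
        launch_cmd = "srun" if usage_mode == "standard" else usage_mode

        run_opts, batch_opts = [], []
        if v.n_ranks:
            # torchrun-hpc is launched per node, so the run command only
            # carries the per-node rank count; the batch directives still
            # reserve the full rank total.
            if usage_mode == "torchrun-hpc":
                run_opts.append(f"-n {v.n_ranks_per_node}")
            else:
                run_opts.append(f"-n {v.n_ranks}")
            batch_opts.append(f"-n {v.n_ranks}")
        if v.n_gpus:
            if usage_mode == "torchrun-hpc":
                run_opts.append("--gpus-per-proc=1")
            else:
                run_opts.append(f"--gpus {v.n_gpus}")
            batch_opts.append(f"--gpus {v.n_gpus}")
        run_opts.append(f"-N {v.n_nodes}")
        batch_opts.append(f"-N {v.n_nodes}")

        mpi_command = f"{launch_cmd} {' '.join(run_opts)}"
        directives = "\n".join(f"#SBATCH {x}" for x in batch_opts)
        return mpi_command, directives


    if __name__ == "__main__":
        cmd, directives = build_slurm_commands(
            Alloc(n_nodes=2, n_ranks=8, n_gpus=8), usage_mode="torchrun-hpc"
        )
        print(cmd)         # torchrun-hpc -n 4 --gpus-per-proc=1 -N 2
        print(directives)  # #SBATCH -n 8 / #SBATCH --gpus 8 / #SBATCH -N 2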