3 changes: 3 additions & 0 deletions .github/utils/dryruns.py
@@ -26,6 +26,9 @@
# Not mpionlyexperiment
"py-scaffold+strong",
"py-scaffold+weak",
# Needs package_manager=spack-pip
"py-scaffold+rocm",
"py-scaffold+cuda",
]


6 changes: 4 additions & 2 deletions .gitlab/tests/shared_flux_clusters.yml
@@ -130,7 +130,8 @@ run_tests_flux_tuolumne:
- HOST: tuolumne
ARCHCONFIG: llnl-elcapitan
BENCHMARK: [py-scaffold]
VARIANT: ['+rocm package_manager=spack-pip caliper=mpi,time,rocm']
VARIANT:
- +rocm package_manager=spack-pip caliper=mpi,time,rocm allocation=torchrun-hpc
# rocm7
- HOST: tuolumne
ARCHCONFIG: llnl-elcapitan
@@ -152,4 +153,5 @@ run_tests_flux_tioga:
- HOST: tioga
ARCHCONFIG: llnl-elcapitan
BENCHMARK: [py-scaffold]
VARIANT: ['+rocm package_manager=spack-pip caliper=mpi,time,rocm']
VARIANT:
- +rocm package_manager=spack-pip caliper=mpi,time,rocm allocation=torchrun-hpc
3 changes: 2 additions & 1 deletion .gitlab/tests/shared_slurm_clusters.yml
@@ -151,4 +151,5 @@ run_tests_slurm_matrix:
- HOST: matrix
ARCHCONFIG: llnl-matrix
BENCHMARK: [py-scaffold]
VARIANT: ['+cuda package_manager=spack-pip caliper=mpi,time,cuda']
VARIANT:
- +cuda package_manager=spack-pip caliper=mpi,time,cuda allocation=torchrun-hpc
5 changes: 2 additions & 3 deletions experiments/osu-micro-benchmarks/experiment.py
@@ -107,7 +107,7 @@ class OsuMicroBenchmarks(

def compute_applications_section(self):

num_nodes = {"n_nodes": 2}
num_nodes = {"n_nodes": 2, "n_ranks": 1}

if self.spec.satisfies("exec_mode=test"):
for pk, pv in num_nodes.items():
@@ -119,8 +119,7 @@ def compute_applications_section(self):
self.add_experiment_variable("additional_args", " -d cuda", False)
if self.spec.satisfies("+rocm") or self.spec.satisfies("+cuda"):
resource = "n_gpus"
for pk, pv in num_nodes.items():
self.add_experiment_variable("n_gpus", pv, True)
self.add_experiment_variable("n_gpus", 1, True)
else:
resource = "n_nodes"

9 changes: 9 additions & 0 deletions experiments/py-scaffold/experiment.py
@@ -68,6 +68,15 @@ def compute_applications_section(self):
)

def compute_package_section(self):
if self.spec.variants["package_manager"][0] != "spack-pip":
raise ValueError(
"Use the 'spack-pip' package manager for this benchmark. Set 'package_manager=spack-pip'"
)
elif self.spec.variants["allocation"][0] != "torchrun-hpc":
raise ValueError(
"Use the 'torchrun-hpc' launcher mode for this benchmark. Set 'allocation=torchrun-hpc'"
)

# Spec that will be written into requirements.txt for pip install
sys_name = self.system_spec._name
if self.spec.satisfies("+rocm"):
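For reference, a minimal sketch of how these guards behave, using the variant names and error messages from this hunk; `FakeSpec` is a hypothetical stand-in for the concrete experiment spec, not the real benchpark class.

```python
# Minimal sketch (hypothetical harness) mirroring the guard logic above.
# Only the variant names, values, and messages come from this diff; FakeSpec is made up.
class FakeSpec:
    def __init__(self, package_manager, allocation):
        self.variants = {
            "package_manager": (package_manager,),
            "allocation": (allocation,),
        }


def check_py_scaffold_spec(spec):
    if spec.variants["package_manager"][0] != "spack-pip":
        raise ValueError(
            "Use the 'spack-pip' package manager for this benchmark. "
            "Set 'package_manager=spack-pip'"
        )
    if spec.variants["allocation"][0] != "torchrun-hpc":
        raise ValueError(
            "Use the 'torchrun-hpc' launcher mode for this benchmark. "
            "Set 'allocation=torchrun-hpc'"
        )


check_py_scaffold_spec(FakeSpec("spack-pip", "torchrun-hpc"))  # passes
# check_py_scaffold_spec(FakeSpec("spack", "standard"))        # raises ValueError
```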
12 changes: 11 additions & 1 deletion lib/benchpark/experiment.py
@@ -223,6 +223,13 @@ class Experiment(ExperimentSystemBase, ExecMode, Affinity, Hwloc):
description="Number of experiment repetitions",
)

variant(
"allocation",
default="standard",
values=("standard", "torchrun-hpc"),
description="Allocation modifier mode",
)

def __init__(self, spec):
self.spec: "benchpark.spec.ConcreteExperimentSpec" = spec
# Device type must be set before super with absence of mpionly experiment type
@@ -372,7 +379,10 @@ def compute_modifiers_section(self):

def compute_modifiers_section_wrapper(self):
# by default we use the allocation modifier and no others
modifier_list = [{"name": "allocation"}, {"name": "exit-code"}]
modifier_list = [
{"name": "allocation", "mode": self.spec.variants["allocation"][0]},
{"name": "exit-code"},
]
modifier_list += self.compute_modifiers_section()
for cls in self.helpers:
cls_list = cls.compute_modifiers_section()
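The new variant value is forwarded verbatim as the allocation modifier's mode. A small sketch of the expected base modifier list for each allowed value; the `standard` case matches the updated test below, while the `torchrun-hpc` case is the assumed analogue.

```python
# Sketch only: expected output of compute_modifiers_section_wrapper() before
# experiment- or helper-specific modifiers are appended.
def expected_base_modifiers(allocation_mode):
    return [
        {"name": "allocation", "mode": allocation_mode},
        {"name": "exit-code"},
    ]


assert expected_base_modifiers("standard") == [
    {"name": "allocation", "mode": "standard"},
    {"name": "exit-code"},
]
# Assumed analogue for experiments built with allocation=torchrun-hpc:
assert expected_base_modifiers("torchrun-hpc")[0]["mode"] == "torchrun-hpc"
```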
2 changes: 2 additions & 0 deletions lib/benchpark/test/caliper.py
@@ -30,6 +30,7 @@ def test_experiment_compute_variables_section_caliper(monkeypatch):
assert vars_section == {
"caliper_metadata": {
"affinity": "none",
"allocation": "standard",
"hwloc": "none",
"application_name": "{application_name}",
"experiment_name": "{experiment_name}",
@@ -89,6 +90,7 @@ def test_caliper_modifier(monkeypatch):
# Check file
assert data == {
"sys_cores_per_node": 84,
"allocation": "standard",
"scheduler": "flux",
"rocm_arch": "gfx942",
"sys_cores_os_reserved_per_node": 12,
5 changes: 4 additions & 1 deletion lib/benchpark/test/experiment.py
@@ -141,7 +141,10 @@ def test_default_modifiers_section():

modifiers_section = experiment.compute_modifiers_section_wrapper()

assert modifiers_section == [{"name": "allocation"}, {"name": "exit-code"}]
assert modifiers_section == [
{"name": "allocation", "mode": "standard"},
{"name": "exit-code"},
]


def test_multiple_models():
49 changes: 37 additions & 12 deletions modifiers/allocation/modifier.py
@@ -227,10 +227,11 @@ class Allocation(BasicModifier):

tags("infrastructure")

# Currently there is only one mode. The only behavior supported right
# now is to attempt to request "enough" resources for a given
# request (e.g. to make sure we request enough nodes, assuming we
# know how many CPUs we want)"
mode(
name="torchrun-hpc",
description="Use torchrun-hpc as launcher instead of default scheduler launcher.",
)

mode("standard", description="Standard execution mode for allocation")
default_mode("standard")

@@ -294,6 +295,9 @@ def determine_allocation(self, v):
)
v.n_nodes = max(cores_node_request or 0, gpus_node_request or 0)

if not v.n_ranks_per_node:
v.n_ranks_per_node = v.n_ranks // v.n_nodes

if not v.n_threads_per_proc:
v.n_threads_per_proc = 1

@@ -313,10 +317,20 @@ def determine_allocation(self, v):
def slurm_instructions(self, v):
sbatch_opts, srun_opts = Allocation._init_batch_and_cmd_opts(v)

launch_cmd = "srun" if self._usage_mode == "standard" else self._usage_mode

if v.n_ranks:
srun_opts.append(f"-n {v.n_ranks}")
if self._usage_mode == "torchrun-hpc":
srun_opts.append(f"-n {v.n_ranks_per_node}")
else:
srun_opts.append(f"-n {v.n_ranks}")
sbatch_opts.append(f"-n {v.n_ranks}")
if v.n_gpus:
srun_opts.append(f"--gpus {v.n_gpus}")
if self._usage_mode == "torchrun-hpc":
srun_opts.append("--gpus-per-proc=1")
else:
srun_opts.append(f"--gpus {v.n_gpus}")
sbatch_opts.append(f"--gpus {v.n_gpus}")
if v.n_nodes:
srun_opts.append(f"-N {v.n_nodes}")

@@ -331,9 +345,9 @@ def slurm_instructions(self, v):

sbatch_opts.append("--exclusive")

sbatch_directives = list(f"#SBATCH {x}" for x in (srun_opts + sbatch_opts))
sbatch_directives = list(f"#SBATCH {x}" for x in (sbatch_opts))

v.mpi_command = f"srun {' '.join(srun_opts)}"
v.mpi_command = f"{launch_cmd} {' '.join(srun_opts)}"
v.batch_submit = "sbatch {execute_experiment}"
v.allocation_directives = "\n".join(sbatch_directives)

@@ -384,18 +398,29 @@ def _init_batch_and_cmd_opts(v):
def flux_instructions(self, v):
batch_opts, cmd_opts = Allocation._init_batch_and_cmd_opts(v)

launch_cmd = "flux run" if self._usage_mode == "standard" else self._usage_mode

# Always run exclusive for mpibind + flux.
# Otherwise, binding may oversubscribe cores before all cores are allocated.
cmd_opts.append("--exclusive")
batch_opts.append("--exclusive")
# Required for '--exclusive'. Will be computed, if not defined, from initialization
cmd_opts.append(f"-N {v.n_nodes}")
batch_opts.append(f"-N {v.n_nodes}")

cmd_ranks = ""
if v.n_ranks:
cmd_ranks = f"-n {v.n_ranks}"
if self._usage_mode == "torchrun-hpc":
cmd_ranks = f"-n {v.n_ranks_per_node}"
else:
cmd_ranks = f"-n {v.n_ranks}"
if v.n_gpus:
gpus_per_rank = 1 # self.gpus_as_gpus_per_rank(v)
cmd_opts.append(f"-g={gpus_per_rank}")
if self._usage_mode == "torchrun-hpc":
cmd_opts.append(f"--gpus-per-proc={gpus_per_rank}")
else:
cmd_opts.append(f"-g={gpus_per_rank}")
batch_opts.append(f"-g={gpus_per_rank}")

if v.queue:
batch_opts.append(f"-q {v.queue}")
@@ -406,9 +431,9 @@ def flux_instructions(self, v):
if v.bank:
batch_opts.append(f"-B {v.bank}")

batch_directives = list(f"# flux: {x}" for x in (cmd_opts + batch_opts))
batch_directives = list(f"# flux: {x}" for x in (batch_opts))

v.mpi_command = f"flux run {' '.join([cmd_ranks] + cmd_opts)}"
v.mpi_command = f"{launch_cmd} {' '.join([cmd_ranks] + cmd_opts)}"
v.batch_submit = "flux batch {execute_experiment}"
v.allocation_directives = "\n".join(batch_directives)
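To make the two modes concrete, here is a rough sketch of the launcher lines this logic should now emit for an example request of 2 nodes, 8 ranks (4 per node), and 8 GPUs. Options contributed by `_init_batch_and_cmd_opts` (queue, bank, timeout) are omitted, so treat these strings as illustrative rather than exact output.

```python
# Illustrative only: approximate mpi_command strings for a 2-node, 8-rank,
# 8-GPU request; options from _init_batch_and_cmd_opts are omitted.
examples = {
    ("slurm", "standard"): "srun -n 8 --gpus 8 -N 2",
    ("slurm", "torchrun-hpc"): "torchrun-hpc -n 4 --gpus-per-proc=1 -N 2",
    ("flux", "standard"): "flux run -n 8 --exclusive -N 2 -g=1",
    ("flux", "torchrun-hpc"): "torchrun-hpc -n 4 --exclusive -N 2 --gpus-per-proc=1",
}
for (scheduler, mode), mpi_command in examples.items():
    print(f"{scheduler:>5} / {mode:<12} -> {mpi_command}")
```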

32 changes: 16 additions & 16 deletions repos/ramble_applications/py_scaffold/application.py
@@ -14,31 +14,31 @@ class PyScaffold(ExecutableApplication):

tags = ["python"]

register_phase("prepend_library_path", pipeline="setup", run_before=["make_experiments"])
register_phase(
"prepend_library_path", pipeline="setup", run_before=["make_experiments"]
)

def _prepend_library_path(self, workspace, app_inst=None):
"""Function to prepend to LD_LIBRARY_PATH, can't do in spack because python_platlib points to wrong site-packages dir"""
paths = []
# if cuda
if "cuda_arch" in app_inst.variables.keys():
# Avoid libcudnn_graph.so error (unnecessary if cuX_full, necessary if cuX wheel)
paths.append("{pip_site_packages_path}/nvidia/cudnn/lib")

app_inst.variables["rocm_mods"] = ""
if "rocm_arch" in app_inst.variables.keys():
app_inst.variables["rocm_mods"] = "module load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric\nexport SPINDLE_FLUXOPT=off\nexport LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so\nexport MPICH_GPU_SUPPORT_ENABLED=0\nexport LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH\nexport LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH\n"

# if caliper - Avoid libcaffe2_nvrtc.so
paths.append("{pip_site_packages_path}/torch/lib")
app_inst.variables["rocm_mods"] = (
"module load rocm/6.4.2 rccl/fast-env-slows-mpi libfabric\nexport SPINDLE_FLUXOPT=off\nexport LD_PRELOAD=/opt/rocm-6.4.2/llvm/lib/libomp.so\nexport MPICH_GPU_SUPPORT_ENABLED=0\nexport LD_LIBRARY_PATH=/collab/usr/global/tools/rccl/toss_4_x86_64_ib_cray/rocm-6.4.1/install/lib/:$LD_LIBRARY_PATH\nexport LD_LIBRARY_PATH=/opt/cray/pe/cce/20.0.0/cce/x86_64/lib:$LD_LIBRARY_PATH\n"
)

app_inst.variables["ld_paths"] = ":".join(paths)

with when("package_manager_family=pip"):
software_spec("scaffold", pkg_spec="py-scaffold")

input_file('config_file', url='https://github.com/LBANN/ScaFFold/blob/main/ScaFFold/configs/benchmark_default.yml',
md5='eb9b1337aa16dfc7feed0936e61b410c8f2caa89773fd60321254e9ba8cf3771',
description='')
input_file(
"config_file",
url="https://raw.githubusercontent.com/LBANN/ScaFFold/refs/heads/main/ScaFFold/configs/benchmark_default.yml",
expand=False,
description="",
)

# TODO: Figure out MPICH_GPU_SUPPORT_ENABLED=0, disabling GTL otherwise linker error.
executable(
@@ -47,13 +47,13 @@ def _prepend_library_path(self, workspace, app_inst=None):
)
executable(
"generate",
"scaffold generate_fractals -c {config_file} --problem-scale {problem_scale}",
use_mpi=True,
"$(which scaffold) generate_fractals -c {config_file} --problem-scale {problem_scale}",
use_mpi=False,
)
executable(
"run",
"scaffold benchmark -c {config_file} --problem-scale {problem_scale}",
"$(which scaffold) benchmark -c {config_file} --problem-scale {problem_scale}",
use_mpi=True,
)

workload("sweep", executables=["modules", "generate", "run"])
workload("sweep", executables=["modules", "generate", "run"], input="config_file")