From 5f79b7aa48ac61439cce41e0cd45a3b0aa7790ec Mon Sep 17 00:00:00 2001 From: davidjurado Date: Fri, 5 Jan 2024 12:05:06 -0500 Subject: [PATCH] Add CUDA_VISIBLE_DEVICES environment variable when using the --gpus flag (#345) Add CUDA_VISIBLE_DEVICES environment variable when using the `--gpus` flag. --- mlcube/mlcube/parser.py | 12 ++++++++---- runners/mlcube_docker/mlcube_docker/docker_run.py | 13 +++++++++++++ 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/mlcube/mlcube/parser.py b/mlcube/mlcube/parser.py index 17ee4269..3097bf50 100644 --- a/mlcube/mlcube/parser.py +++ b/mlcube/mlcube/parser.py @@ -126,13 +126,17 @@ def parse_extra_arg( key = "--security-opt" if platform == "docker" else "--security" runner_run_args[key] = parsed_args["security"] if parsed_args.get("gpus", None): + cuda_visible_devices = parsed_args["gpus"] + if "device" in cuda_visible_devices: + cuda_visible_devices = cuda_visible_devices.replace("device=", "") + elif str(cuda_visible_devices).isnumeric(): + cuda_visible_devices = str(list(range(int(cuda_visible_devices)))) + cuda_visible_devices = cuda_visible_devices.replace(" ", "")[1:-1] if platform == "docker": - runner_run_args["--gpus"] = parsed_args["gpus"] + runner_run_args["--gpus"] = cuda_visible_devices else: runner_run_args["--nv"] = "" - os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = parsed_args[ - "gpus" - ] + os.environ["SINGULARITYENV_CUDA_VISIBLE_DEVICES"] = cuda_visible_devices if parsed_args.get("memory", None): key = "--memory" if platform == "docker" else "--vm-ram" runner_run_args[key] = parsed_args["memory"] diff --git a/runners/mlcube_docker/mlcube_docker/docker_run.py b/runners/mlcube_docker/mlcube_docker/docker_run.py index a8f918fe..4829bc65 100644 --- a/runners/mlcube_docker/mlcube_docker/docker_run.py +++ b/runners/mlcube_docker/mlcube_docker/docker_run.py @@ -266,6 +266,19 @@ def run(self) -> None: if extra_args: run_args += " " + extra_args + valid_gpu_flag = "--gpus" in self.mlcube.runner and self.mlcube.runner["--gpus"] is not None + + if valid_gpu_flag: + cuda_visible_devices = self.mlcube.runner["--gpus"] + else: + cuda_visible_devices = num_gpus + + if str(cuda_visible_devices).isnumeric(): + cuda_visible_devices = str(list(range(int(cuda_visible_devices)))) + cuda_visible_devices = cuda_visible_devices.replace(" ", "")[1:-1] + + run_args += f" --env CUDA_VISIBLE_DEVICES={cuda_visible_devices}" + if "entrypoint" in self.mlcube.tasks[self.task]: logger.info( "Using custom task entrypoint: task=%s, entrypoint='%s'",