From f9325782396a5c5f0b89ed8966d8e2e35d6577a3 Mon Sep 17 00:00:00 2001
From: Diego Lovison
Date: Tue, 15 Oct 2024 14:30:02 -0300
Subject: [PATCH] Automated test to verify AMD GPUs can be used in Data Science Pipelines

Replace the single NVIDIA-only GPU availability sample with separate NVIDIA
and AMD pipelines, each compiled to its own YAML, and rename the Robot suite
so both accelerator types reuse a shared keyword. The components now rely on
the PyTorch bundled in the runtime images instead of installing torch through
a custom pip index, and the pipeline run timeout is raised from 240 to 600
seconds.
---
 .../pytorch/pytorch_amd_gpu_availability.py   |  54 ++++++
 ...pytorch_amd_gpu_availability_compiled.yaml | 139 +++++++++++++++
 .../pytorch_nvidia_gpu_availability.py        |  54 ++++++
 ...orch_nvidia_gpu_availability_compiled.yaml | 139 +++++++++++++++
 .../pytorch_verify_gpu_availability.py        |  65 -------
 ...orch_verify_gpu_availability_compiled.yaml | 164 ------------------
 ...=> 1105__data-science-pipelines-gpu.robot} |  41 +++--
 7 files changed, 416 insertions(+), 240 deletions(-)
 create mode 100644 ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py
 create mode 100644 ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml
 create mode 100644 ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py
 create mode 100644 ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml
 delete mode 100644 ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability.py
 delete mode 100644 ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability_compiled.yaml
 rename ods_ci/tests/Tests/1100__data_science_pipelines/{1105__data-science-pipelines-gpu-nvidia.robot => 1105__data-science-pipelines-gpu.robot} (63%)

diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py
new file mode 100644
index 000000000..71f532de3
--- /dev/null
+++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability.py
@@ -0,0 +1,54 @@
+from kfp import compiler, dsl, kubernetes
+from kfp.dsl import PipelineTask
+
+# Runtime: Pytorch with ROCm and Python 3.9 (UBI 9)
+common_base_image = (
+    "quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf"
+)
+
+
+def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
+    print(f"Adding GPU tolerations: {accelerator_type}({accelerator_limit})")
+    task.set_accelerator_type(accelerator=accelerator_type)
+    task.set_accelerator_limit(accelerator_limit)
+    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")
+
+
+@dsl.component(
+    base_image=common_base_image
+)
+def verify_gpu_availability(gpu_toleration: bool):
+    import torch
+
+    cuda_available = torch.cuda.is_available()
+    device_count = torch.cuda.device_count()
+    print("------------------------------")
+    print("GPU availability")
+    print("------------------------------")
+    print(f"cuda available: {cuda_available}")
+    print(f"device count: {device_count}")
+    if gpu_toleration:
+        assert torch.cuda.is_available()
+        assert torch.cuda.device_count() > 0
+        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
+    else:
+        assert not torch.cuda.is_available()
+        assert torch.cuda.device_count() == 0
+        t = torch.tensor([5, 5, 5], dtype=torch.int64)
+    print(f"tensor: {t}")
+    print("GPU availability test: PASS")
+
+
+@dsl.pipeline(
+    name="pytorch-amd-gpu-availability",
+    
description="Verifies pipeline tasks run on GPU nodes only when tolerations are added", +) +def pytorch_amd_gpu_availability(): + verify_gpu_availability(gpu_toleration=False).set_caching_options(False) + + task_with_toleration = verify_gpu_availability(gpu_toleration=True).set_caching_options(False) + add_gpu_toleration(task_with_toleration, "amd.com/gpu", 1) + + +if __name__ == "__main__": + compiler.Compiler().compile(pytorch_amd_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml")) diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml new file mode 100644 index 000000000..1c2b67ddc --- /dev/null +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml @@ -0,0 +1,139 @@ +# PIPELINE DEFINITION +# Name: pytorch-amd-gpu-availability +# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added +components: + comp-verify-gpu-availability: + executorLabel: exec-verify-gpu-availability + inputDefinitions: + parameters: + gpu_toleration: + parameterType: BOOLEAN + comp-verify-gpu-availability-2: + executorLabel: exec-verify-gpu-availability-2 + inputDefinitions: + parameters: + gpu_toleration: + parameterType: BOOLEAN +deploymentSpec: + executors: + exec-verify-gpu-availability: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - verify_gpu_availability + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ + \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ + \ print(\"------------------------------\")\n print(\"GPU availability\"\ + )\n print(\"------------------------------\")\n print(f\"cuda available:\ + \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ + \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ + \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ + \ availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf + exec-verify-gpu-availability-2: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - verify_gpu_availability + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ + \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ + \ print(\"------------------------------\")\n print(\"GPU availability\"\ + )\n print(\"------------------------------\")\n print(f\"cuda available:\ + \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ + \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ + \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ + \ availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf + resources: + accelerator: + count: '1' + type: amd.com/gpu +pipelineInfo: + description: Verifies pipeline tasks run on GPU nodes only when tolerations are + added + name: pytorch-amd-gpu-availability +root: + dag: + tasks: + verify-gpu-availability: + cachingOptions: {} + componentRef: + name: comp-verify-gpu-availability + inputs: + parameters: + gpu_toleration: + runtimeValue: + constant: false + taskInfo: + name: verify-gpu-availability + verify-gpu-availability-2: + cachingOptions: {} + componentRef: + name: comp-verify-gpu-availability-2 + inputs: + parameters: + gpu_toleration: + runtimeValue: + constant: true + taskInfo: + name: verify-gpu-availability-2 +schemaVersion: 2.1.0 +sdkVersion: kfp-2.9.0 +--- +platforms: + kubernetes: + deploymentSpec: + executors: + exec-verify-gpu-availability-2: + tolerations: + - effect: NoSchedule + key: amd.com/gpu + operator: Exists diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py new file mode 100644 index 000000000..575e859fc --- /dev/null +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability.py @@ -0,0 +1,54 @@ +from kfp import compiler, dsl, kubernetes +from kfp.dsl import PipelineTask + +# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9) +common_base_image = ( + "quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa" +) + + +def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int): + print(f"Adding GPU tolerations: {accelerator_type}({accelerator_limit})") + task.set_accelerator_type(accelerator=accelerator_type) + task.set_accelerator_limit(accelerator_limit) + kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule") + + +@dsl.component( + 
base_image=common_base_image +) +def verify_gpu_availability(gpu_toleration: bool): + import torch + + cuda_available = torch.cuda.is_available() + device_count = torch.cuda.device_count() + print("------------------------------") + print("GPU availability") + print("------------------------------") + print(f"cuda available: {cuda_available}") + print(f"device count: {device_count}") + if gpu_toleration: + assert torch.cuda.is_available() + assert torch.cuda.device_count() > 0 + t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda') + else: + assert not torch.cuda.is_available() + assert torch.cuda.device_count() == 0 + t = torch.tensor([5, 5, 5], dtype=torch.int64) + print(f"tensor: {t}") + print("GPU availability test: PASS") + + +@dsl.pipeline( + name="pytorch-nvidia-gpu-availability", + description="Verifies pipeline tasks run on GPU nodes only when tolerations are added", +) +def pytorch_nvidia_gpu_availability(): + verify_gpu_availability(gpu_toleration=False).set_caching_options(False) + + task_with_toleration = verify_gpu_availability(gpu_toleration=True).set_caching_options(False) + add_gpu_toleration(task_with_toleration, "nvidia.com/gpu", 1) + + +if __name__ == "__main__": + compiler.Compiler().compile(pytorch_nvidia_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml")) diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml new file mode 100644 index 000000000..7c40c99f4 --- /dev/null +++ b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml @@ -0,0 +1,139 @@ +# PIPELINE DEFINITION +# Name: pytorch-nvidia-gpu-availability +# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added +components: + comp-verify-gpu-availability: + executorLabel: exec-verify-gpu-availability + inputDefinitions: + parameters: + gpu_toleration: + parameterType: BOOLEAN + comp-verify-gpu-availability-2: + executorLabel: exec-verify-gpu-availability-2 + inputDefinitions: + parameters: + gpu_toleration: + parameterType: BOOLEAN +deploymentSpec: + executors: + exec-verify-gpu-availability: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - verify_gpu_availability + command: + - sh + - -c + - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ + \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ + \ print(\"------------------------------\")\n print(\"GPU availability\"\ + )\n print(\"------------------------------\")\n print(f\"cuda available:\ + \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ + \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ + \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ + \ availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa + exec-verify-gpu-availability-2: + container: + args: + - --executor_input + - '{{$}}' + - --function_to_execute + - verify_gpu_availability + command: + - sh + - -c + - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ + \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ + \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\ + \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\ + $0\" \"$@\"\n" + - sh + - -ec + - 'program_path=$(mktemp -d) + + + printf "%s" "$0" > "$program_path/ephemeral_component.py" + + _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" + + ' + - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ + \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n import torch\n\ + \n cuda_available = torch.cuda.is_available()\n device_count = torch.cuda.device_count()\n\ + \ print(\"------------------------------\")\n print(\"GPU availability\"\ + )\n print(\"------------------------------\")\n print(f\"cuda available:\ + \ {cuda_available}\")\n print(f\"device count: {device_count}\")\n \ + \ if gpu_toleration:\n assert torch.cuda.is_available()\n \ + \ assert torch.cuda.device_count() > 0\n t = torch.tensor([5, 5,\ + \ 5], dtype=torch.int64, device='cuda')\n else:\n assert not torch.cuda.is_available()\n\ + \ assert torch.cuda.device_count() == 0\n t = torch.tensor([5,\ + \ 5, 5], dtype=torch.int64)\n print(f\"tensor: {t}\")\n print(\"GPU\ + \ availability test: PASS\")\n\n" + image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa + resources: + accelerator: + count: '1' + type: nvidia.com/gpu +pipelineInfo: + description: Verifies pipeline tasks run on GPU nodes only when tolerations are + added + name: pytorch-nvidia-gpu-availability +root: + dag: + tasks: + verify-gpu-availability: + 
cachingOptions: {} + componentRef: + name: comp-verify-gpu-availability + inputs: + parameters: + gpu_toleration: + runtimeValue: + constant: false + taskInfo: + name: verify-gpu-availability + verify-gpu-availability-2: + cachingOptions: {} + componentRef: + name: comp-verify-gpu-availability-2 + inputs: + parameters: + gpu_toleration: + runtimeValue: + constant: true + taskInfo: + name: verify-gpu-availability-2 +schemaVersion: 2.1.0 +sdkVersion: kfp-2.9.0 +--- +platforms: + kubernetes: + deploymentSpec: + executors: + exec-verify-gpu-availability-2: + tolerations: + - effect: NoSchedule + key: nvidia.com/gpu + operator: Exists diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability.py b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability.py deleted file mode 100644 index eb824e0cd..000000000 --- a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability.py +++ /dev/null @@ -1,65 +0,0 @@ -from kfp import compiler, dsl, kubernetes -from kfp.dsl import PipelineTask - -# Runtime: Pytorch with CUDA and Python 3.9 (UBI 9) -common_base_image = ( - "quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa" -) - - -def add_pip_index_configuration(task: PipelineTask): - kubernetes.use_config_map_as_env( - task, - config_map_name="ds-pipeline-custom-env-vars", - config_map_key_to_env={"pip_index_url": "PIP_INDEX_URL", "pip_trusted_host": "PIP_TRUSTED_HOST"}, - ) - - -def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int): - print("Adding GPU tolerations") - task.set_accelerator_type(accelerator=accelerator_type) - task.set_accelerator_limit(accelerator_limit) - kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule") - - -@dsl.component( - base_image=common_base_image, - packages_to_install=["torch"], - pip_index_urls=["$PIP_INDEX_URL"], - pip_trusted_hosts=["$PIP_TRUSTED_HOST"], -) -def verify_gpu_availability(gpu_toleration_added: bool): - import torch # noqa: PLC0415 - - cuda_available = torch.cuda.is_available() - device_count = torch.cuda.device_count() - print("------------------------------") - print("GPU availability") - print("------------------------------") - print("gpu_toleration_added:" + str(gpu_toleration_added)) - print("torch.cuda.is_available():" + str(cuda_available)) - print("torch.cuda.device_count():" + str(device_count)) - if gpu_toleration_added and not torch.cuda.is_available(): - print("GPU availability test: FAIL") - raise ValueError("GPU toleration was added but there is no GPU not available for this task") - if not gpu_toleration_added and torch.cuda.is_available(): - print("GPU availability test: FAIL") - raise ValueError("GPU toleration was not added but there is a GPU available for this task") - print("GPU availability test: PASS") - - -@dsl.pipeline( - name="pytorch-verify-gpu-availability", - description="Verifies pipeline tasks run on GPU nodes only when tolerations are added", -) -def pytorch_verify_gpu_availability(): - task_without_toleration = verify_gpu_availability(gpu_toleration_added=False).set_caching_options(False) - add_pip_index_configuration(task_without_toleration) - - task_with_toleration = verify_gpu_availability(gpu_toleration_added=True).set_caching_options(False) - add_pip_index_configuration(task_with_toleration) - add_gpu_toleration(task_with_toleration, 
"nvidia.com/gpu", 1) - - -if __name__ == "__main__": - compiler.Compiler().compile(pytorch_verify_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml")) diff --git a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability_compiled.yaml b/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability_compiled.yaml deleted file mode 100644 index b2a004fcd..000000000 --- a/ods_ci/tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability_compiled.yaml +++ /dev/null @@ -1,164 +0,0 @@ -# PIPELINE DEFINITION -# Name: pytorch-verify-gpu-availability -# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added -components: - comp-verify-gpu-availability: - executorLabel: exec-verify-gpu-availability - inputDefinitions: - parameters: - gpu_toleration_added: - parameterType: BOOLEAN - comp-verify-gpu-availability-2: - executorLabel: exec-verify-gpu-availability-2 - inputDefinitions: - parameters: - gpu_toleration_added: - parameterType: BOOLEAN -deploymentSpec: - executors: - exec-verify-gpu-availability: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - verify_gpu_availability - command: - - sh - - -c - - "\nif ! [ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ - \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location --index-url $PIP_INDEX_URL\ - \ --trusted-host $PIP_TRUSTED_HOST 'kfp==2.9.0' '--no-deps' 'typing-extensions>=3.7.4,<5;\ - \ python_version<\"3.9\"' && python3 -m pip install --quiet --no-warn-script-location\ - \ --index-url $PIP_INDEX_URL --trusted-host $PIP_TRUSTED_HOST 'torch' &&\ - \ \"$0\" \"$@\"\n" - - sh - - -ec - - 'program_path=$(mktemp -d) - - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef verify_gpu_availability(gpu_toleration_added: bool):\n import\ - \ torch # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n\ - \ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\ - )\n print(\"GPU availability\")\n print(\"------------------------------\"\ - )\n print(\"gpu_toleration_added:\" + str(gpu_toleration_added))\n \ - \ print(\"torch.cuda.is_available():\" + str(cuda_available))\n print(\"\ - torch.cuda.device_count():\" + str(device_count))\n if gpu_toleration_added\ - \ and not torch.cuda.is_available():\n print(\"GPU availability test:\ - \ FAIL\")\n raise ValueError(\"GPU toleration was added but there\ - \ is no GPU not available for this task\")\n if not gpu_toleration_added\ - \ and torch.cuda.is_available():\n print(\"GPU availability test:\ - \ FAIL\")\n raise ValueError(\"GPU toleration was not added but there\ - \ is a GPU available for this task\")\n print(\"GPU availability test:\ - \ PASS\")\n\n" - image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa - exec-verify-gpu-availability-2: - container: - args: - - --executor_input - - '{{$}}' - - --function_to_execute - - verify_gpu_availability - command: - - sh - - -c - - "\nif ! 
[ -x \"$(command -v pip)\" ]; then\n python3 -m ensurepip ||\ - \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\ - \ python3 -m pip install --quiet --no-warn-script-location --index-url $PIP_INDEX_URL\ - \ --trusted-host $PIP_TRUSTED_HOST 'kfp==2.9.0' '--no-deps' 'typing-extensions>=3.7.4,<5;\ - \ python_version<\"3.9\"' && python3 -m pip install --quiet --no-warn-script-location\ - \ --index-url $PIP_INDEX_URL --trusted-host $PIP_TRUSTED_HOST 'torch' &&\ - \ \"$0\" \"$@\"\n" - - sh - - -ec - - 'program_path=$(mktemp -d) - - - printf "%s" "$0" > "$program_path/ephemeral_component.py" - - _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@" - - ' - - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\ - \ *\n\ndef verify_gpu_availability(gpu_toleration_added: bool):\n import\ - \ torch # noqa: PLC0415\n\n cuda_available = torch.cuda.is_available()\n\ - \ device_count = torch.cuda.device_count()\n print(\"------------------------------\"\ - )\n print(\"GPU availability\")\n print(\"------------------------------\"\ - )\n print(\"gpu_toleration_added:\" + str(gpu_toleration_added))\n \ - \ print(\"torch.cuda.is_available():\" + str(cuda_available))\n print(\"\ - torch.cuda.device_count():\" + str(device_count))\n if gpu_toleration_added\ - \ and not torch.cuda.is_available():\n print(\"GPU availability test:\ - \ FAIL\")\n raise ValueError(\"GPU toleration was added but there\ - \ is no GPU not available for this task\")\n if not gpu_toleration_added\ - \ and torch.cuda.is_available():\n print(\"GPU availability test:\ - \ FAIL\")\n raise ValueError(\"GPU toleration was not added but there\ - \ is a GPU available for this task\")\n print(\"GPU availability test:\ - \ PASS\")\n\n" - image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa - resources: - accelerator: - count: '1' - type: nvidia.com/gpu -pipelineInfo: - description: Verifies pipeline tasks run on GPU nodes only when tolerations are - added - name: pytorch-verify-gpu-availability -root: - dag: - tasks: - verify-gpu-availability: - cachingOptions: {} - componentRef: - name: comp-verify-gpu-availability - inputs: - parameters: - gpu_toleration_added: - runtimeValue: - constant: false - taskInfo: - name: verify-gpu-availability - verify-gpu-availability-2: - cachingOptions: {} - componentRef: - name: comp-verify-gpu-availability-2 - inputs: - parameters: - gpu_toleration_added: - runtimeValue: - constant: true - taskInfo: - name: verify-gpu-availability-2 -schemaVersion: 2.1.0 -sdkVersion: kfp-2.9.0 ---- -platforms: - kubernetes: - deploymentSpec: - executors: - exec-verify-gpu-availability: - configMapAsEnv: - - configMapName: ds-pipeline-custom-env-vars - keyToEnv: - - configMapKey: pip_index_url - envVar: PIP_INDEX_URL - - configMapKey: pip_trusted_host - envVar: PIP_TRUSTED_HOST - exec-verify-gpu-availability-2: - configMapAsEnv: - - configMapName: ds-pipeline-custom-env-vars - keyToEnv: - - configMapKey: pip_index_url - envVar: PIP_INDEX_URL - - configMapKey: pip_trusted_host - envVar: PIP_TRUSTED_HOST - tolerations: - - effect: NoSchedule - key: nvidia.com/gpu - operator: Exists diff --git a/ods_ci/tests/Tests/1100__data_science_pipelines/1105__data-science-pipelines-gpu-nvidia.robot b/ods_ci/tests/Tests/1100__data_science_pipelines/1105__data-science-pipelines-gpu.robot similarity index 63% rename from 
ods_ci/tests/Tests/1100__data_science_pipelines/1105__data-science-pipelines-gpu-nvidia.robot rename to ods_ci/tests/Tests/1100__data_science_pipelines/1105__data-science-pipelines-gpu.robot index 87526ac08..086e53afc 100644 --- a/ods_ci/tests/Tests/1100__data_science_pipelines/1105__data-science-pipelines-gpu-nvidia.robot +++ b/ods_ci/tests/Tests/1100__data_science_pipelines/1105__data-science-pipelines-gpu.robot @@ -4,46 +4,65 @@ Resource ../../Resources/RHOSi.resource Resource ../../Resources/Page/ODH/ODHDashboard/ODHDataScienceProject/Projects.resource Resource ../../Resources/CLI/DataSciencePipelines/DataSciencePipelinesBackend.resource Test Tags DataSciencePipelines-Backend -Suite Setup Dsp Nvidia Gpu Suite Setup -Suite Teardown Dsp Nvidia Gpu Suite Teardown +Suite Setup Dsp Gpu Suite Setup +Suite Teardown Dsp Gpu Suite Teardown *** Variables *** # robocop: off=line-too-long -${PROJECT}= dsp-gpu-nvidia -${PIPELINE_GPU_AVAILABILITY_FILEPATH}= tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_verify_gpu_availability_compiled.yaml +${PROJECT}= dsp-gpu +${PIPELINE_NVIDIA_GPU}= tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_nvidia_gpu_availability_compiled.yaml +${PIPELINE_AMD_GPU}= tests/Resources/Files/pipeline-samples/v2/cache-disabled/gpu/pytorch/pytorch_amd_gpu_availability_compiled.yaml # robocop: on=line-too-long *** Test Cases *** # robocop: off=too-long-test-case -Verify Pipeline Tasks Run On GPU Nodes Only When Tolerations Are Added +Verify Pipeline Tasks Run On Nvidia GPU Nodes Only When Tolerations Are Added [Documentation] Runs a pipeline that tests GPU availability according to GPU tolerations in pipeline tasks: ... - One task should not have GPUs available, as we don't add the GPU tolerations ... - Another task should have GPUs available, as we add the GPU tolerations [Tags] Tier1 Resources-GPU NVIDIA-GPUs + Verify Pipeline Tasks Run On GPU Nodes Only When Tolerations Are Added ${PIPELINE_NVIDIA_GPU} + +# robocop: off=too-long-test-case +Verify Pipeline Tasks Run On AMD GPU Nodes Only When Tolerations Are Added + [Documentation] Runs a pipeline that tests GPU availability according to GPU tolerations in pipeline tasks: + ... - One task should not have GPUs available, as we don't add the GPU tolerations + ... - Another task should have GPUs available, as we add the GPU tolerations + [Tags] Tier1 Resources-GPU AMD-GPUs + + Verify Pipeline Tasks Run On GPU Nodes Only When Tolerations Are Added ${PIPELINE_AMD_GPU} + + +*** Keywords *** +# robocop: off=too-long-test-case +Verify Pipeline Tasks Run On GPU Nodes Only When Tolerations Are Added + [Documentation] Runs a pipeline that tests GPU availability according to GPU tolerations in pipeline tasks: + ... - One task should not have GPUs available, as we don't add the GPU tolerations + ... - Another task should have GPUs available, as we add the GPU tolerations + [Arguments] ${pipeline_package_path} + # robocop: off=unused-variable ${pipeline_id} ${pipeline_version_id} ${pipeline_run_id} ${experiment_id}= ... DataSciencePipelinesBackend.Import Pipeline And Create Run ... namespace=${PROJECT} username=${TEST_USER.USERNAME} password=${TEST_USER.PASSWORD} ... pipeline_name=pytorch-verify-gpu-availability ... pipeline_description=Verifies GPU availability in tasks when using tolerations - ... pipeline_package_path=${PIPELINE_GPU_AVAILABILITY_FILEPATH} + ... pipeline_package_path=${pipeline_package_path} ... 
pipeline_run_name=pytorch-verify-gpu-availability-run DataSciencePipelinesBackend.Wait For Run Completion And Verify Status ... namespace=${PROJECT} username=${TEST_USER.USERNAME} password=${TEST_USER.PASSWORD} - ... pipeline_run_id=${pipeline_run_id} pipeline_run_timeout=240 + ... pipeline_run_id=${pipeline_run_id} pipeline_run_timeout=600 ... pipeline_run_expected_status=SUCCEEDED [Teardown] DataSciencePipelinesBackend.Delete Pipeline And Related Resources ... namespace=${PROJECT} username=${TEST_USER.USERNAME} password=${TEST_USER.PASSWORD} ... pipeline_id=${pipeline_id} - -*** Keywords *** -Dsp Nvidia Gpu Suite Setup +Dsp Gpu Suite Setup [Documentation] Dsp Acceptance Suite Setup RHOSi Setup Projects.Create Data Science Project From CLI ${PROJECT} @@ -56,7 +75,7 @@ Dsp Nvidia Gpu Suite Setup ... dsp_version=v2 DataSciencePipelinesBackend.Wait Until Pipeline Server Is Deployed namespace=${PROJECT} -Dsp Nvidia Gpu Suite Teardown +Dsp Gpu Suite Teardown [Documentation] Dsp Acceptance Suite Teardown Projects.Delete Project Via CLI By Display Name ${PROJECT} RHOSi Teardown