Automated test to verify AMD GPUs can be used in Data Science Pipelines
diegolovison committed Oct 15, 2024
1 parent 368804b commit f932578
Showing 7 changed files with 416 additions and 240 deletions.
@@ -0,0 +1,54 @@
from kfp import compiler, dsl, kubernetes
from kfp.dsl import PipelineTask

# Runtime: PyTorch with ROCm and Python 3.9 (UBI 9)
common_base_image = (
    "quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf"
)


def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
    # Request the accelerator and tolerate the GPU taint so the task can land on a GPU node.
    print(f"Adding GPU tolerations: {accelerator_type}({accelerator_limit})")
    task.set_accelerator_type(accelerator=accelerator_type)
    task.set_accelerator_limit(accelerator_limit)
    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(
    base_image=common_base_image
)
def verify_gpu_availability(gpu_toleration: bool):
    import torch

    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()
    print("------------------------------")
    print("GPU availability")
    print("------------------------------")
    print(f"cuda available: {cuda_available}")
    print(f"device count: {device_count}")
    if gpu_toleration:
        # Scheduled onto a GPU node: a device must be visible and usable.
        assert torch.cuda.is_available()
        assert torch.cuda.device_count() > 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
    else:
        # Scheduled onto a non-GPU node: no device may be visible.
        assert not torch.cuda.is_available()
        assert torch.cuda.device_count() == 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64)
    print(f"tensor: {t}")
    print("GPU availability test: PASS")


@dsl.pipeline(
    name="pytorch-amd-gpu-availability",
    description="Verifies pipeline tasks run on GPU nodes only when tolerations are added",
)
def pytorch_amd_gpu_availability():
    # Control task: no toleration, so it must not see a GPU.
    verify_gpu_availability(gpu_toleration=False).set_caching_options(False)

    # GPU task: toleration plus accelerator request, so it must see one.
    task_with_toleration = verify_gpu_availability(gpu_toleration=True).set_caching_options(False)
    add_gpu_toleration(task_with_toleration, "amd.com/gpu", 1)


if __name__ == "__main__":
    compiler.Compiler().compile(pytorch_amd_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml"))
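
Worth noting for reviewers: the AMD variant exercises the same `torch.cuda.*` calls as the NVIDIA one. ROCm builds of PyTorch expose the HIP backend through the `torch.cuda` namespace, so the assertions are vendor-neutral. A quick check of which backend a runtime image ships (illustrative only, not part of this commit):

import torch

# On ROCm builds torch.version.hip is a version string and torch.version.cuda
# is None; on CUDA builds the reverse holds.
print(f"hip: {torch.version.hip}")
print(f"cuda: {torch.version.cuda}")
print(f"visible devices: {torch.cuda.device_count()}")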
@@ -0,0 +1,139 @@
# PIPELINE DEFINITION
# Name: pytorch-amd-gpu-availability
# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added
components:
  comp-verify-gpu-availability:
    executorLabel: exec-verify-gpu-availability
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
  comp-verify-gpu-availability-2:
    executorLabel: exec-verify-gpu-availability-2
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
deploymentSpec:
  executors:
    exec-verify-gpu-availability:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
    exec-verify-gpu-availability-2:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
        resources:
          accelerator:
            count: '1'
            type: amd.com/gpu
pipelineInfo:
  description: Verifies pipeline tasks run on GPU nodes only when tolerations are
    added
  name: pytorch-amd-gpu-availability
root:
  dag:
    tasks:
      verify-gpu-availability:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: false
        taskInfo:
          name: verify-gpu-availability
      verify-gpu-availability-2:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability-2
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: true
        taskInfo:
          name: verify-gpu-availability-2
schemaVersion: 2.1.0
sdkVersion: kfp-2.9.0
---
platforms:
  kubernetes:
    deploymentSpec:
      executors:
        exec-verify-gpu-availability-2:
          tolerations:
          - effect: NoSchedule
            key: amd.com/gpu
            operator: Exists
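
The compiled package is two YAML documents: the vendor-neutral pipeline spec above the `---`, and a Kubernetes platform spec below it that attaches the toleration to the second executor only. A minimal sketch (assuming PyYAML is installed; the file name is a placeholder) showing the split:

import yaml

# The compiled package holds two documents: the pipeline spec and the
# Kubernetes platform spec carrying the per-executor toleration.
with open("pytorch_amd_gpu_availability_compiled.yaml") as f:  # placeholder name
    pipeline_spec, platform_spec = yaml.safe_load_all(f)

executors = platform_spec["platforms"]["kubernetes"]["deploymentSpec"]["executors"]
print(executors["exec-verify-gpu-availability-2"]["tolerations"])
# [{'effect': 'NoSchedule', 'key': 'amd.com/gpu', 'operator': 'Exists'}]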
@@ -0,0 +1,54 @@
from kfp import compiler, dsl, kubernetes
from kfp.dsl import PipelineTask

# Runtime: PyTorch with CUDA and Python 3.9 (UBI 9)
common_base_image = (
    "quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa"
)


def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
    # Request the accelerator and tolerate the GPU taint so the task can land on a GPU node.
    print(f"Adding GPU tolerations: {accelerator_type}({accelerator_limit})")
    task.set_accelerator_type(accelerator=accelerator_type)
    task.set_accelerator_limit(accelerator_limit)
    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(
    base_image=common_base_image
)
def verify_gpu_availability(gpu_toleration: bool):
    import torch

    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()
    print("------------------------------")
    print("GPU availability")
    print("------------------------------")
    print(f"cuda available: {cuda_available}")
    print(f"device count: {device_count}")
    if gpu_toleration:
        # Scheduled onto a GPU node: a device must be visible and usable.
        assert torch.cuda.is_available()
        assert torch.cuda.device_count() > 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
    else:
        # Scheduled onto a non-GPU node: no device may be visible.
        assert not torch.cuda.is_available()
        assert torch.cuda.device_count() == 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64)
    print(f"tensor: {t}")
    print("GPU availability test: PASS")


@dsl.pipeline(
    name="pytorch-nvidia-gpu-availability",
    description="Verifies pipeline tasks run on GPU nodes only when tolerations are added",
)
def pytorch_nvidia_gpu_availability():
    # Control task: no toleration, so it must not see a GPU.
    verify_gpu_availability(gpu_toleration=False).set_caching_options(False)

    # GPU task: toleration plus accelerator request, so it must see one.
    task_with_toleration = verify_gpu_availability(gpu_toleration=True).set_caching_options(False)
    add_gpu_toleration(task_with_toleration, "nvidia.com/gpu", 1)


if __name__ == "__main__":
    compiler.Compiler().compile(pytorch_nvidia_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml"))
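
Once compiled, a run can be started through the KFP SDK. The sketch below is illustrative only; the route URL, token, and package file name are placeholders, not values from this commit:

from kfp.client import Client

client = Client(
    host="https://ds-pipeline-ui.example.com",  # placeholder route
    existing_token="<bearer-token>",            # placeholder credential
)
run = client.create_run_from_pipeline_package(
    "pytorch_nvidia_gpu_availability_compiled.yaml",  # placeholder file name
    arguments={},
    run_name="nvidia-gpu-availability-check",
)
print(run.run_id)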
@@ -0,0 +1,139 @@
# PIPELINE DEFINITION
# Name: pytorch-nvidia-gpu-availability
# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added
components:
  comp-verify-gpu-availability:
    executorLabel: exec-verify-gpu-availability
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
  comp-verify-gpu-availability-2:
    executorLabel: exec-verify-gpu-availability-2
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
deploymentSpec:
  executors:
    exec-verify-gpu-availability:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
    exec-verify-gpu-availability-2:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
        resources:
          accelerator:
            count: '1'
            type: nvidia.com/gpu
pipelineInfo:
  description: Verifies pipeline tasks run on GPU nodes only when tolerations are
    added
  name: pytorch-nvidia-gpu-availability
root:
  dag:
    tasks:
      verify-gpu-availability:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: false
        taskInfo:
          name: verify-gpu-availability
      verify-gpu-availability-2:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability-2
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: true
        taskInfo:
          name: verify-gpu-availability-2
schemaVersion: 2.1.0
sdkVersion: kfp-2.9.0
---
platforms:
  kubernetes:
    deploymentSpec:
      executors:
        exec-verify-gpu-availability-2:
          tolerations:
          - effect: NoSchedule
            key: nvidia.com/gpu
            operator: Exists
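
For either pipeline to behave as asserted, the cluster must expose the extended GPU resources via the vendor device plugins; the negative case additionally assumes GPU nodes are tainted with the matching accelerator key, which is why the untolerated task stays off them. A minimal sketch (assuming the `kubernetes` Python client and a reachable kubeconfig) that lists nodes advertising either resource:

from kubernetes import client, config

# List nodes that advertise an AMD or NVIDIA GPU extended resource.
config.load_kube_config()
for node in client.CoreV1Api().list_node().items:
    capacity = node.status.capacity or {}
    gpus = {k: v for k, v in capacity.items() if k in ("nvidia.com/gpu", "amd.com/gpu")}
    if gpus:
        print(node.metadata.name, gpus)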