Automated test to verify AMD GPUs can be used in Data Science Pipelines
diegolovison committed Oct 15, 2024
1 parent 368804b commit f932578
Showing 7 changed files with 416 additions and 240 deletions.
@@ -0,0 +1,54 @@
from kfp import compiler, dsl, kubernetes
from kfp.dsl import PipelineTask

# Runtime: PyTorch with ROCm and Python 3.9 (UBI 9)
common_base_image = (
    "quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf"
)


def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
    # Request the accelerator and tolerate the GPU taint so the task can land on a GPU node.
    print(f"Adding GPU tolerations: {accelerator_type}({accelerator_limit})")
    task.set_accelerator_type(accelerator=accelerator_type)
    task.set_accelerator_limit(accelerator_limit)
    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(
    base_image=common_base_image
)
def verify_gpu_availability(gpu_toleration: bool):
    import torch

    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()
    print("------------------------------")
    print("GPU availability")
    print("------------------------------")
    print(f"cuda available: {cuda_available}")
    print(f"device count: {device_count}")
    if gpu_toleration:
        # Scheduled onto a GPU node: a device must be visible and usable.
        assert torch.cuda.is_available()
        assert torch.cuda.device_count() > 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
    else:
        # Scheduled onto a non-GPU node: no device may be visible.
        assert not torch.cuda.is_available()
        assert torch.cuda.device_count() == 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64)
    print(f"tensor: {t}")
    print("GPU availability test: PASS")


@dsl.pipeline(
    name="pytorch-amd-gpu-availability",
    description="Verifies pipeline tasks run on GPU nodes only when tolerations are added",
)
def pytorch_amd_gpu_availability():
    # Control task: no toleration, so it must not see a GPU.
    verify_gpu_availability(gpu_toleration=False).set_caching_options(False)

    # GPU task: toleration plus accelerator request, so it must see one.
    task_with_toleration = verify_gpu_availability(gpu_toleration=True).set_caching_options(False)
    add_gpu_toleration(task_with_toleration, "amd.com/gpu", 1)


if __name__ == "__main__":
    compiler.Compiler().compile(pytorch_amd_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml"))
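
Worth noting for reviewers: the AMD variant exercises the same `torch.cuda.*` calls as the NVIDIA one. ROCm builds of PyTorch expose the HIP backend through the `torch.cuda` namespace, so the assertions are vendor-neutral. A quick check of which backend a runtime image ships (illustrative only, not part of this commit):

import torch

# On ROCm builds torch.version.hip is a version string and torch.version.cuda
# is None; on CUDA builds the reverse holds.
print(f"hip: {torch.version.hip}")
print(f"cuda: {torch.version.cuda}")
print(f"visible devices: {torch.cuda.device_count()}")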
@@ -0,0 +1,139 @@
# PIPELINE DEFINITION
# Name: pytorch-amd-gpu-availability
# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added
components:
  comp-verify-gpu-availability:
    executorLabel: exec-verify-gpu-availability
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
  comp-verify-gpu-availability-2:
    executorLabel: exec-verify-gpu-availability-2
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
deploymentSpec:
  executors:
    exec-verify-gpu-availability:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
    exec-verify-gpu-availability-2:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:76d2a5d9ca0bb35c4d9962a7ee6798672afd3a5c7d226d87ef65f952e7cb93cf
        resources:
          accelerator:
            count: '1'
            type: amd.com/gpu
pipelineInfo:
  description: Verifies pipeline tasks run on GPU nodes only when tolerations are
    added
  name: pytorch-amd-gpu-availability
root:
  dag:
    tasks:
      verify-gpu-availability:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: false
        taskInfo:
          name: verify-gpu-availability
      verify-gpu-availability-2:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability-2
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: true
        taskInfo:
          name: verify-gpu-availability-2
schemaVersion: 2.1.0
sdkVersion: kfp-2.9.0
---
platforms:
  kubernetes:
    deploymentSpec:
      executors:
        exec-verify-gpu-availability-2:
          tolerations:
          - effect: NoSchedule
            key: amd.com/gpu
            operator: Exists
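
The compiled package is two YAML documents: the vendor-neutral pipeline spec above the `---`, and a Kubernetes platform spec below it that attaches the toleration to the second executor only. A minimal sketch (assuming PyYAML is installed; the file name is a placeholder) showing the split:

import yaml

# The compiled package holds two documents: the pipeline spec and the
# Kubernetes platform spec carrying the per-executor toleration.
with open("pytorch_amd_gpu_availability_compiled.yaml") as f:  # placeholder name
    pipeline_spec, platform_spec = yaml.safe_load_all(f)

executors = platform_spec["platforms"]["kubernetes"]["deploymentSpec"]["executors"]
print(executors["exec-verify-gpu-availability-2"]["tolerations"])
# [{'effect': 'NoSchedule', 'key': 'amd.com/gpu', 'operator': 'Exists'}]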
@@ -0,0 +1,54 @@
from kfp import compiler, dsl, kubernetes
from kfp.dsl import PipelineTask

# Runtime: PyTorch with CUDA and Python 3.9 (UBI 9)
common_base_image = (
    "quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa"
)


def add_gpu_toleration(task: PipelineTask, accelerator_type: str, accelerator_limit: int):
    # Request the accelerator and tolerate the GPU taint so the task can land on a GPU node.
    print(f"Adding GPU tolerations: {accelerator_type}({accelerator_limit})")
    task.set_accelerator_type(accelerator=accelerator_type)
    task.set_accelerator_limit(accelerator_limit)
    kubernetes.add_toleration(task, key=accelerator_type, operator="Exists", effect="NoSchedule")


@dsl.component(
    base_image=common_base_image
)
def verify_gpu_availability(gpu_toleration: bool):
    import torch

    cuda_available = torch.cuda.is_available()
    device_count = torch.cuda.device_count()
    print("------------------------------")
    print("GPU availability")
    print("------------------------------")
    print(f"cuda available: {cuda_available}")
    print(f"device count: {device_count}")
    if gpu_toleration:
        # Scheduled onto a GPU node: a device must be visible and usable.
        assert torch.cuda.is_available()
        assert torch.cuda.device_count() > 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64, device='cuda')
    else:
        # Scheduled onto a non-GPU node: no device may be visible.
        assert not torch.cuda.is_available()
        assert torch.cuda.device_count() == 0
        t = torch.tensor([5, 5, 5], dtype=torch.int64)
    print(f"tensor: {t}")
    print("GPU availability test: PASS")


@dsl.pipeline(
    name="pytorch-nvidia-gpu-availability",
    description="Verifies pipeline tasks run on GPU nodes only when tolerations are added",
)
def pytorch_nvidia_gpu_availability():
    # Control task: no toleration, so it must not see a GPU.
    verify_gpu_availability(gpu_toleration=False).set_caching_options(False)

    # GPU task: toleration plus accelerator request, so it must see one.
    task_with_toleration = verify_gpu_availability(gpu_toleration=True).set_caching_options(False)
    add_gpu_toleration(task_with_toleration, "nvidia.com/gpu", 1)


if __name__ == "__main__":
    compiler.Compiler().compile(pytorch_nvidia_gpu_availability, package_path=__file__.replace(".py", "_compiled.yaml"))
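
Once compiled, a run can be started through the KFP SDK. The sketch below is illustrative only; the route URL, token, and package file name are placeholders, not values from this commit:

from kfp.client import Client

client = Client(
    host="https://ds-pipeline-ui.example.com",  # placeholder route
    existing_token="<bearer-token>",            # placeholder credential
)
run = client.create_run_from_pipeline_package(
    "pytorch_nvidia_gpu_availability_compiled.yaml",  # placeholder file name
    arguments={},
    run_name="nvidia-gpu-availability-check",
)
print(run.run_id)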
@@ -0,0 +1,139 @@
# PIPELINE DEFINITION
# Name: pytorch-nvidia-gpu-availability
# Description: Verifies pipeline tasks run on GPU nodes only when tolerations are added
components:
  comp-verify-gpu-availability:
    executorLabel: exec-verify-gpu-availability
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
  comp-verify-gpu-availability-2:
    executorLabel: exec-verify-gpu-availability-2
    inputDefinitions:
      parameters:
        gpu_toleration:
          parameterType: BOOLEAN
deploymentSpec:
  executors:
    exec-verify-gpu-availability:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
    exec-verify-gpu-availability-2:
      container:
        args:
        - --executor_input
        - '{{$}}'
        - --function_to_execute
        - verify_gpu_availability
        command:
        - sh
        - -c
        - "\nif ! [ -x \"$(command -v pip)\" ]; then\n    python3 -m ensurepip ||\
          \ python3 -m ensurepip --user || apt-get install python3-pip\nfi\n\nPIP_DISABLE_PIP_VERSION_CHECK=1\
          \ python3 -m pip install --quiet --no-warn-script-location 'kfp==2.9.0'\
          \ '--no-deps' 'typing-extensions>=3.7.4,<5; python_version<\"3.9\"' && \"\
          $0\" \"$@\"\n"
        - sh
        - -ec
        - 'program_path=$(mktemp -d)

          printf "%s" "$0" > "$program_path/ephemeral_component.py"

          _KFP_RUNTIME=true python3 -m kfp.dsl.executor_main --component_module_path "$program_path/ephemeral_component.py" "$@"

          '
        - "\nimport kfp\nfrom kfp import dsl\nfrom kfp.dsl import *\nfrom typing import\
          \ *\n\ndef verify_gpu_availability(gpu_toleration: bool):\n    import torch\n\
          \n    cuda_available = torch.cuda.is_available()\n    device_count = torch.cuda.device_count()\n\
          \    print(\"------------------------------\")\n    print(\"GPU availability\"\
          )\n    print(\"------------------------------\")\n    print(f\"cuda available:\
          \ {cuda_available}\")\n    print(f\"device count: {device_count}\")\n\
          \    if gpu_toleration:\n        assert torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() > 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64, device='cuda')\n    else:\n        assert not torch.cuda.is_available()\n\
          \        assert torch.cuda.device_count() == 0\n        t = torch.tensor([5,\
          \ 5, 5], dtype=torch.int64)\n    print(f\"tensor: {t}\")\n    print(\"GPU\
          \ availability test: PASS\")\n\n"
        image: quay.io/modh/runtime-images@sha256:cee154f6db15de27929362f91baa128fc4f79b9c1930ab0f27561174d39aadfa
        resources:
          accelerator:
            count: '1'
            type: nvidia.com/gpu
pipelineInfo:
  description: Verifies pipeline tasks run on GPU nodes only when tolerations are
    added
  name: pytorch-nvidia-gpu-availability
root:
  dag:
    tasks:
      verify-gpu-availability:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: false
        taskInfo:
          name: verify-gpu-availability
      verify-gpu-availability-2:
        cachingOptions: {}
        componentRef:
          name: comp-verify-gpu-availability-2
        inputs:
          parameters:
            gpu_toleration:
              runtimeValue:
                constant: true
        taskInfo:
          name: verify-gpu-availability-2
schemaVersion: 2.1.0
sdkVersion: kfp-2.9.0
---
platforms:
  kubernetes:
    deploymentSpec:
      executors:
        exec-verify-gpu-availability-2:
          tolerations:
          - effect: NoSchedule
            key: nvidia.com/gpu
            operator: Exists
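
For either pipeline to behave as asserted, the cluster must expose the extended GPU resources via the vendor device plugins; the negative case additionally assumes GPU nodes are tainted with the matching accelerator key, which is why the untolerated task stays off them. A minimal sketch (assuming the `kubernetes` Python client and a reachable kubeconfig) that lists nodes advertising either resource:

from kubernetes import client, config

# List nodes that advertise an AMD or NVIDIA GPU extended resource.
config.load_kube_config()
for node in client.CoreV1Api().list_node().items:
    capacity = node.status.capacity or {}
    gpus = {k: v for k, v in capacity.items() if k in ("nvidia.com/gpu", "amd.com/gpu")}
    if gpus:
        print(node.metadata.name, gpus)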