Skip to content

Commit

Permalink
Merge branch 'master' into feature/model-registry-UI
Browse files Browse the repository at this point in the history
  • Loading branch information
lugi0 authored Oct 2, 2024
2 parents fbf8232 + cfd4616 commit 669bb90
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 5 deletions.
10 changes: 6 additions & 4 deletions ods_ci/tasks/Resources/Provisioning/GPU/NVIDIA/gpu_deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,10 +30,9 @@ oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpr
oc wait --timeout=3m --for jsonpath='{.status.components.labelSelector.matchExpressions[].operator}'=Exists operator gpu-operator-certified.nvidia-gpu-operator

function wait_until_pod_ready_status() {
local timeout_seconds=1200
local pod_label=$1
local namespace=nvidia-gpu-operator
local timeout=240
local timeout=${2:-360}
start_time=$(date +%s)
while [ $(($(date +%s) - start_time)) -lt $timeout ]; do
pod_status="$(oc get pod -l app="$pod_label" -n "$namespace" --no-headers=true 2>/dev/null)"
Expand All @@ -42,7 +41,10 @@ function wait_until_pod_ready_status() {
echo "Waiting until GPU Pods or Daemonset of '$pod_label' in namespace '$namespace' are in running state..."
echo "Pods status: '$pod_status'"
echo "Daemonset status: '$daemon_status'"
oc wait --timeout="${timeout_seconds}s" --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
oc wait --timeout=10s --for=condition=ready pod -n "$namespace" -l app="$pod_label" || \
if [ $? -ne 0 ]; then
continue
fi
oc rollout status --watch --timeout=3m daemonset -n "$namespace" -l app="$pod_label" || continue
break
fi
Expand Down Expand Up @@ -83,7 +85,7 @@ wait_until_pod_ready_status "gpu-operator"
oc apply -f "$GPU_INSTALL_DIR/../nfd_deploy.yaml"
oc get csv -n nvidia-gpu-operator "$CSVNAME" -o jsonpath='{.metadata.annotations.alm-examples}' | jq .[0] > clusterpolicy.json
oc apply -f clusterpolicy.json
wait_until_pod_ready_status "nvidia-device-plugin-daemonset"
wait_until_pod_ready_status "nvidia-device-plugin-daemonset" 600
wait_until_pod_ready_status "nvidia-container-toolkit-daemonset"
wait_until_pod_ready_status "nvidia-dcgm-exporter"
wait_until_pod_ready_status "gpu-feature-discovery"
Expand Down
9 changes: 9 additions & 0 deletions ods_ci/tasks/Resources/Provisioning/GPU/provision-gpu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ GPU_COUNT=${3:-"1"}
KUSTOMIZE_PATH="$PWD/tasks/Resources/Provisioning/Hive/GPU"
MACHINESET_PATH="$KUSTOMIZE_PATH/base/source-machineset.yaml"
PROVIDER_OVERLAY_DIR=$KUSTOMIZE_PATH/overlays/$PROVIDER
MACHINE_WAIT_TIMEOUT=10m
# Check if existing machineset GPU already exists
EXISTING_GPU_MACHINESET="$(oc get machineset -n openshift-machine-api -o jsonpath="{.items[?(@.metadata.annotations['machine\.openshift\.io/GPU']>'0')].metadata.name}")"
if [[ -n "$EXISTING_GPU_MACHINESET" ]] ; then
Expand Down Expand Up @@ -39,3 +40,11 @@ sed -i'' -e "s/INSTANCE_TYPE/$INSTANCE_TYPE/g" $PROVIDER_OVERLAY_DIR/gpu.yaml
oc apply --kustomize $PROVIDER_OVERLAY_DIR
# Add GPU label to the new machine-set
oc patch machinesets -n openshift-machine-api "$NEW_MACHINESET_NAME" -p '{"metadata":{"labels":{"gpu-machineset":"true"}}}' --type=merge
# Wait for the new GPU machineset to report exactly one ready replica
# (.status.readyReplicas == 1), giving up after $MACHINE_WAIT_TIMEOUT.
echo "Waiting for GPU Node to be Ready"
# The command is tested directly in the `if` rather than through a later
# `$?` check: the failure branch then still runs when errexit is active, and
# the exit status cannot be overwritten by a statement added in between.
# Both expansions are quoted so an empty or space-containing value cannot
# shift the arguments passed to `oc wait`.
if ! oc wait --timeout="$MACHINE_WAIT_TIMEOUT" \
    --for jsonpath='{.status.readyReplicas}'=1 \
    machineset "$NEW_MACHINESET_NAME" -n openshift-machine-api; then
  echo "Machine Set $NEW_MACHINESET_NAME does not have its Machines in Running status after $MACHINE_WAIT_TIMEOUT timeout"
  echo "Please check the cluster"
  # Abort provisioning with a non-zero status when the wait did not succeed.
  exit 1
fi
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Test Tags Dashboard
*** Variables ***
${PRJ_TITLE}= ODS-CI DS Project 2
${PRJ_TITLE_GPU}= ODS-CI DS Project GPU
${PRJ_RESOURCE_NAME}= ods-ci-ds-project-test-additional
${PRJ_RESOURCE_NAME}= ods-ci-ds-pr-test-0410
${PRJ_DESCRIPTION}= ${PRJ_TITLE} is a test project for validating DS Project feature
${TOLERATIONS}= workbench-tolerations
${TOLERATIONS_2}= workbench-tolerations-two
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Resource ../../../Resources/Page/OCPDashboard/Pods/Pods.robot
Library JupyterLibrary
Suite Setup Spawner Suite Setup
Suite Teardown End Web Test
Test Tags Resources-GPU


*** Variables ***
Expand Down

0 comments on commit 669bb90

Please sign in to comment.