diff --git a/.gitignore b/.gitignore index 258ebf6b..2f8d504e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,4 +3,7 @@ .pytest_cache .idea **/err.txt -coverage.out \ No newline at end of file +coverage.out + +# Symlinks created by Makefile build-cli +k8s-tests/chainsaw/*/skyhook-cli \ No newline at end of file diff --git a/.vscode/launch.json b/.vscode/launch.json index edf599d2..5f74014e 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -9,7 +9,7 @@ "type": "go", "request": "launch", "mode": "debug", - "program": "${workspaceRoot}/operator/cmd/main.go", + "program": "${workspaceRoot}/operator/cmd/manager/main.go", "cwd": "${workspaceRoot}/operator", "buildFlags": "--ldflags '-X github.com/NVIDIA/skyhook/operator/internal/version.GIT_SHA=foobars -X github.com/NVIDIA/skyhook/operator/internal/version.VERSION=v0.5.0'", "env": { diff --git a/k8s-tests/chainsaw/cli/lifecycle/chainsaw-test.yaml b/k8s-tests/chainsaw/cli/lifecycle/chainsaw-test.yaml index 14b9b0d7..9f815ea6 100644 --- a/k8s-tests/chainsaw/cli/lifecycle/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/cli/lifecycle/chainsaw-test.yaml @@ -30,6 +30,15 @@ spec: assert: 60s exec: 30s steps: + # Step 0: Reset state from previous runs + - name: reset-state + try: + - script: + timeout: 30s + content: | + ../skyhook-cli reset cli-lifecycle-test --confirm 2>/dev/null || true + echo "✓ State reset complete" + # Step 1: Create a Skyhook - name: setup-skyhook try: @@ -55,7 +64,7 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook pause cli-lifecycle-test --confirm + ../skyhook-cli pause cli-lifecycle-test --confirm - assert: file: assert-paused.yaml @@ -65,7 +74,7 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook resume cli-lifecycle-test + ../skyhook-cli resume cli-lifecycle-test - assert: file: assert-resumed.yaml @@ -75,7 +84,7 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook disable cli-lifecycle-test --confirm + ../skyhook-cli disable cli-lifecycle-test --confirm - assert: file: assert-disabled.yaml @@ -85,7 +94,7 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook enable cli-lifecycle-test + ../skyhook-cli enable cli-lifecycle-test - assert: file: assert-enabled.yaml @@ -95,22 +104,22 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook pause cli-lifecycle-test --confirm - ../../../../operator/bin/skyhook disable cli-lifecycle-test --confirm + ../skyhook-cli pause cli-lifecycle-test --confirm + ../skyhook-cli disable cli-lifecycle-test --confirm - assert: file: assert-paused-and-disabled.yaml - script: timeout: 30s content: | # Resume should only remove pause, not disable - ../../../../operator/bin/skyhook resume cli-lifecycle-test + ../skyhook-cli resume cli-lifecycle-test - assert: file: assert-still-disabled.yaml - script: timeout: 30s content: | # Enable should remove disable - ../../../../operator/bin/skyhook enable cli-lifecycle-test + ../skyhook-cli enable cli-lifecycle-test # Cleanup - name: cleanup diff --git a/k8s-tests/chainsaw/cli/node/chainsaw-test.yaml b/k8s-tests/chainsaw/cli/node/chainsaw-test.yaml index f4d8c481..483f22d8 100644 --- a/k8s-tests/chainsaw/cli/node/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/cli/node/chainsaw-test.yaml @@ -30,6 +30,15 @@ spec: assert: 120s exec: 90s steps: + # Step 0: Reset state from previous runs + - name: reset-state + try: + - script: + timeout: 30s + content: | + ../skyhook-cli reset cli-node-test --confirm 2>/dev/null || true + echo "✓ State reset complete" + # Step 1: Create a Skyhook - name: setup-skyhook try: @@ -42,11 +51,11 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook node list --skyhook cli-node-test + ../skyhook-cli node list --skyhook cli-node-test - script: timeout: 30s content: | - ../../../../operator/bin/skyhook node list --skyhook cli-node-test -o json + ../skyhook-cli node list --skyhook cli-node-test -o json # Step 3: Test node status - name: test-node-status @@ -55,17 +64,17 @@ spec: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') - ../../../../operator/bin/skyhook node status "$NODE" + ../skyhook-cli node status "$NODE" - script: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') - ../../../../operator/bin/skyhook node status "$NODE" --skyhook cli-node-test + ../skyhook-cli node status "$NODE" --skyhook cli-node-test - script: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') - ../../../../operator/bin/skyhook node status "$NODE" -o json + ../skyhook-cli node status "$NODE" -o json # Step 4: Test node ignore/unignore - name: test-node-ignore @@ -74,7 +83,7 @@ spec: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') - ../../../../operator/bin/skyhook node ignore "$NODE" + ../skyhook-cli node ignore "$NODE" - assert: file: assert-node-ignored.yaml - name: test-node-unignore @@ -83,7 +92,7 @@ spec: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') - ../../../../operator/bin/skyhook node unignore "$NODE" + ../skyhook-cli node unignore "$NODE" - assert: file: assert-node-unignored.yaml @@ -94,7 +103,7 @@ spec: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') - ../../../../operator/bin/skyhook node reset "$NODE" --skyhook cli-node-test --confirm + ../skyhook-cli node reset "$NODE" --skyhook cli-node-test --confirm - assert: file: assert-node-reset.yaml @@ -103,9 +112,8 @@ spec: try: - script: content: | - kubectl delete skyhook cli-node-test 2>/dev/null || true + ../skyhook-cli reset cli-node-test --confirm 2>/dev/null || true for NODE in $(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[*].metadata.name}'); do - kubectl annotate node "$NODE" skyhook.nvidia.com/nodeState_cli-node-test- 2>/dev/null || true kubectl label node "$NODE" skyhook.nvidia.com/ignore- 2>/dev/null || true done diff --git a/k8s-tests/chainsaw/cli/package/chainsaw-test.yaml b/k8s-tests/chainsaw/cli/package/chainsaw-test.yaml index 747b778a..8ca74615 100644 --- a/k8s-tests/chainsaw/cli/package/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/cli/package/chainsaw-test.yaml @@ -29,6 +29,15 @@ spec: assert: 120s exec: 90s steps: + # Step 0: Reset state from previous runs + - name: reset-state + try: + - script: + timeout: 30s + content: | + ../skyhook-cli reset cli-package-test --confirm 2>/dev/null || true + echo "✓ State reset complete" + # Step 1: Create a Skyhook - name: setup-skyhook try: @@ -41,11 +50,11 @@ spec: - script: timeout: 30s content: | - ../../../../operator/bin/skyhook package status hello-world --skyhook cli-package-test + ../skyhook-cli package status hello-world --skyhook cli-package-test - script: timeout: 30s content: | - ../../../../operator/bin/skyhook package status hello-world --skyhook cli-package-test -o json + ../skyhook-cli package status hello-world --skyhook cli-package-test -o json # Step 3: Test package logs - name: test-package-logs @@ -56,7 +65,7 @@ spec: NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') # Run logs command - it should succeed (may show waiting or actual logs) - ../../../../operator/bin/skyhook package logs hello-world --skyhook cli-package-test --node "$NODE" + ../skyhook-cli package logs hello-world --skyhook cli-package-test --node "$NODE" # Step 4: Wait for package to complete, then test rerun - name: test-package-rerun @@ -70,14 +79,14 @@ spec: NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') # Test dry-run first - ../../../../operator/bin/skyhook package rerun hello-world --skyhook cli-package-test --node "$NODE" --dry-run + ../skyhook-cli package rerun hello-world --skyhook cli-package-test --node "$NODE" --dry-run - script: timeout: 30s content: | NODE=$(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[0].metadata.name}') # Run actual rerun with confirm flag and stage - ../../../../operator/bin/skyhook package rerun hello-world --skyhook cli-package-test --node "$NODE" --stage apply --confirm + ../skyhook-cli package rerun hello-world --skyhook cli-package-test --node "$NODE" --stage apply --confirm - assert: file: assert-package-rerun.yaml @@ -86,8 +95,5 @@ spec: try: - script: content: | - kubectl delete skyhook cli-package-test 2>/dev/null || true - for NODE in $(kubectl get nodes -l skyhook.nvidia.com/test-node=skyhooke2e -o jsonpath='{.items[*].metadata.name}'); do - kubectl annotate node "$NODE" skyhook.nvidia.com/nodeState_cli-package-test- 2>/dev/null || true - done + ../skyhook-cli reset cli-package-test --confirm 2>/dev/null || true diff --git a/k8s-tests/chainsaw/cli/reset/assert-nodes-reset.yaml b/k8s-tests/chainsaw/cli/reset/assert-nodes-reset.yaml new file mode 100644 index 00000000..f59118d4 --- /dev/null +++ b/k8s-tests/chainsaw/cli/reset/assert-nodes-reset.yaml @@ -0,0 +1,32 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Node +metadata: + labels: + skyhook.nvidia.com/test-node: skyhooke2e + ## since skyhook is running it will set it to disabled after we clear it + ## should it do this, not sure i guess, but seems nice to have it that way + # (not): + # skyhook.nvidia.com/status_cli-reset-test: .* + annotations: + # Verify that all skyhook-related annotations are removed(lookup(@ || {}, 'skyhook.nvidia.com/nodeState_cli-reset-test')): null + (contains(keys(@), "skyhook.nvidia.com/nodeState_cli-reset-test")): false + (contains(keys(@), "skyhook.nvidia.com/cordon_cli-reset-test")): false + skyhook.nvidia.com/status_cli-reset-test: disabled ## disabled after clearing + # skyhook.nvidia.com/version_cli-reset-test: .* ## also rest after clearing + diff --git a/k8s-tests/chainsaw/skyhook/pause_skyhook.sh b/k8s-tests/chainsaw/cli/reset/assert-skyhook-complete.yaml old mode 100755 new mode 100644 similarity index 72% rename from k8s-tests/chainsaw/skyhook/pause_skyhook.sh rename to k8s-tests/chainsaw/cli/reset/assert-skyhook-complete.yaml index 8ad57377..bffb2b4a --- a/k8s-tests/chainsaw/skyhook/pause_skyhook.sh +++ b/k8s-tests/chainsaw/cli/reset/assert-skyhook-complete.yaml @@ -1,5 +1,3 @@ -#!/bin/bash -x - # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -16,14 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cli-reset-test +status: + status: complete -skyhook="$1" -pause="$2" - -## assert is true or false -if [[ "$pause" != "true" && "$pause" != "false" ]]; then - echo "pause must be true or false" - exit 1 -fi - -kubectl patch skyhook ${skyhook} -p '{"spec":{"pause":'$pause'}}' --type=merge diff --git a/k8s-tests/chainsaw/skyhook/rest_nodes.sh b/k8s-tests/chainsaw/cli/reset/assert-skyhook-disabled.yaml old mode 100755 new mode 100644 similarity index 50% rename from k8s-tests/chainsaw/skyhook/rest_nodes.sh rename to k8s-tests/chainsaw/cli/reset/assert-skyhook-disabled.yaml index e19e80a9..5ace1142 --- a/k8s-tests/chainsaw/skyhook/rest_nodes.sh +++ b/k8s-tests/chainsaw/cli/reset/assert-skyhook-disabled.yaml @@ -1,5 +1,3 @@ -#!/bin/bash -eox pipefail - # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # @@ -16,18 +14,10 @@ # See the License for the specific language governing permissions and # limitations under the License. +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cli-reset-test + annotations: + skyhook.nvidia.com/disable: "true" -## helper script -## clears labels and annotate from nodes with the prefix "skyhook.nvidia.com" -## note, a lot of tests have a label setup to target, so you might need to put that back -## example: -## ❯ kubectl label node/kind-worker skyhook.nvidia.com/test-node=skyhooke2e - -for node in $(kubectl get nodes -o name); do - for anno in $(kubectl annotate --list ${node}); do - [[ ${anno} =~ (^skyhook.nvidia.com\/.*)=.* ]] && kubectl annotate ${node} ${BASH_REMATCH[1]}- - done - for label in $(kubectl label --list ${node}); do - [[ ${label} =~ (^skyhook.nvidia.com\/.*)=.* ]] && kubectl label ${node} ${BASH_REMATCH[1]}- - done -done diff --git a/k8s-tests/chainsaw/cli/reset/chainsaw-test.yaml b/k8s-tests/chainsaw/cli/reset/chainsaw-test.yaml new file mode 100644 index 00000000..9927cb51 --- /dev/null +++ b/k8s-tests/chainsaw/cli/reset/chainsaw-test.yaml @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# yaml-language-server: $schema=https://raw.githubusercontent.com/kyverno/chainsaw/main/.schemas/json/test-chainsaw-v1alpha1.json +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: cli-reset +spec: + description: | + Test the skyhook reset command: + - reset: resets all nodes for a Skyhook + timeouts: + assert: 120s + exec: 90s + steps: + # Step 1: Create a Skyhook and wait for it to complete + - name: setup-skyhook + try: + - apply: + file: skyhook.yaml + - assert: + file: assert-skyhook-complete.yaml + + # Step 2: Disable the skyhook so it doesn't restart processing after reset + - name: disable-skyhook + try: + - script: + timeout: 30s + content: | + ../skyhook-cli disable cli-reset-test --confirm + - assert: + file: assert-skyhook-disabled.yaml + + # Step 3: Test reset command + - name: test-reset + try: + - script: + timeout: 30s + content: | + ../skyhook-cli reset cli-reset-test --confirm + - assert: + file: assert-nodes-reset.yaml + diff --git a/k8s-tests/chainsaw/cli/reset/skyhook.yaml b/k8s-tests/chainsaw/cli/reset/skyhook.yaml new file mode 100644 index 00000000..637d65d2 --- /dev/null +++ b/k8s-tests/chainsaw/cli/reset/skyhook.yaml @@ -0,0 +1,34 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: skyhook.nvidia.com/v1alpha1 +kind: Skyhook +metadata: + name: cli-reset-test +spec: + nodeSelectors: + matchLabels: + skyhook.nvidia.com/test-node: skyhooke2e + packages: + test-package: + version: "1.0.0" + image: ghcr.io/nvidia/skyhook-packages/shellscript + configMap: + apply.sh: | + #!/bin/bash + echo "Reset test apply" + sleep 2 + diff --git a/k8s-tests/chainsaw/deployment-policy/legacy-compatibility/chainsaw-test.yaml b/k8s-tests/chainsaw/deployment-policy/legacy-compatibility/chainsaw-test.yaml index 243d8929..84cad838 100644 --- a/k8s-tests/chainsaw/deployment-policy/legacy-compatibility/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/deployment-policy/legacy-compatibility/chainsaw-test.yaml @@ -33,6 +33,17 @@ spec: exec: 180s assert: 120s steps: + - name: reset-state + try: + - script: + content: | + # Clear node annotations from previous runs using skyhook CLI + ../skyhook-cli reset legacy-interruption-budget-test --confirm 2>/dev/null || true + # Clean up any leftover labels + chmod +x ../label-nodes.sh + ../label-nodes.sh clean-all skyhook.nvidia.com/test-node 2>/dev/null || true + echo "✓ State reset complete" + - name: setup-nodes try: - script: @@ -76,6 +87,6 @@ spec: - script: content: | # Clean up node annotations - ../../skyhook/rest_test.sh legacy-interruption-budget-test + ../skyhook-cli reset legacy-interruption-budget-test --confirm 2>/dev/null || true # Clean up labels ../label-nodes.sh clean-all skyhook.nvidia.com/test-node diff --git a/k8s-tests/chainsaw/deployment-policy/linear-strategy/chainsaw-test.yaml b/k8s-tests/chainsaw/deployment-policy/linear-strategy/chainsaw-test.yaml index d3576180..27482876 100644 --- a/k8s-tests/chainsaw/deployment-policy/linear-strategy/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/deployment-policy/linear-strategy/chainsaw-test.yaml @@ -31,6 +31,18 @@ spec: exec: 300s steps: + - name: reset-state + try: + - script: + content: | + # Clear node annotations from previous runs using skyhook CLI + ../skyhook-cli reset linear-strategy-test --confirm 2>/dev/null || true + # Clean up any leftover labels + chmod +x ../label-nodes.sh + ../label-nodes.sh clean-all tier 2>/dev/null || true + ../label-nodes.sh clean-all skyhook.nvidia.com/test-node 2>/dev/null || true + echo "✓ State reset complete" + - name: setup-nodes try: - script: @@ -107,7 +119,7 @@ spec: - script: content: | # Clean up node annotations - ../../skyhook/rest_test.sh linear-strategy-test + ../skyhook-cli reset linear-strategy-test --confirm 2>/dev/null || true # Clean up labels ../label-nodes.sh clean-all tier ../label-nodes.sh clean-all skyhook.nvidia.com/test-node diff --git a/k8s-tests/chainsaw/deployment-policy/multi-compartment/chainsaw-test.yaml b/k8s-tests/chainsaw/deployment-policy/multi-compartment/chainsaw-test.yaml index 96e05ce5..e9a40106 100644 --- a/k8s-tests/chainsaw/deployment-policy/multi-compartment/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/deployment-policy/multi-compartment/chainsaw-test.yaml @@ -31,6 +31,18 @@ spec: exec: 300s # Increased for 15-node rollout steps: + - name: reset-state + try: + - script: + content: | + # Clear node annotations from previous runs using skyhook CLI + ../skyhook-cli reset multi-compartment-test --confirm 2>/dev/null || true + # Clean up any leftover labels + chmod +x ../label-nodes.sh + ../label-nodes.sh clean-all priority 2>/dev/null || true + ../label-nodes.sh clean-all skyhook.nvidia.com/test-node 2>/dev/null || true + echo "✓ State reset complete" + - name: setup-nodes try: - script: @@ -124,7 +136,7 @@ spec: - script: content: | # Clean up node annotations - ../../skyhook/rest_test.sh multi-compartment-test + ../skyhook-cli reset multi-compartment-test --confirm 2>/dev/null || true # Clean up labels ../label-nodes.sh clean-all priority ../label-nodes.sh clean-all skyhook.nvidia.com/test-node diff --git a/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/assert-compartments.yaml b/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/assert-compartments.yaml index c8470480..510987f3 100644 --- a/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/assert-compartments.yaml +++ b/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/assert-compartments.yaml @@ -32,3 +32,15 @@ status: prod-us-west: matched: 2 ceiling: 1 + nodeStatus: + # REGRESSION TEST: Nodes assigned to compartments but not yet selected for batches should be "waiting", not "unknown" + # This verifies that nodes waiting for batch selection have the correct status + # + # With 6 nodes total (2 per compartment) and initialBatch=1 per compartment: + # - 3 nodes should be "in_progress" (1 per compartment, selected for batch 1) + # - 3 nodes should be "waiting" (1 per compartment, not yet selected) + # + # Count occurrences using JMESPath array filtering: verify exactly 3 "waiting" and 3 "in_progress" + # This verifies nodes are correctly marked as "waiting" (not "unknown") + (length(values(@)[?@ == 'waiting'])): 3 + (length(values(@)[?@ == 'in_progress'])): 3 diff --git a/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/chainsaw-test.yaml b/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/chainsaw-test.yaml index 23c26afd..f447cf3e 100644 --- a/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/deployment-policy/overlapping-selectors/chainsaw-test.yaml @@ -32,6 +32,19 @@ spec: exec: 300s steps: + - name: reset-state + try: + - script: + content: | + # Clear node annotations from previous runs using skyhook CLI + ../skyhook-cli reset overlapping-selectors-test --confirm 2>/dev/null || true + # Clean up any leftover labels + chmod +x ../label-nodes.sh + ../label-nodes.sh clean-all region 2>/dev/null || true + ../label-nodes.sh clean-all env 2>/dev/null || true + ../label-nodes.sh clean-all skyhook.nvidia.com/test-node 2>/dev/null || true + echo "✓ State reset complete" + - name: setup-nodes try: - script: @@ -131,7 +144,7 @@ spec: - script: content: | # Clean up node annotations - ../../skyhook/rest_test.sh overlapping-selectors-test + ../skyhook-cli reset overlapping-selectors-test --confirm 2>/dev/null || true # Clean up labels ../label-nodes.sh clean-all region ../label-nodes.sh clean-all env diff --git a/k8s-tests/chainsaw/helm/helm-chart-test/chainsaw-test.yaml b/k8s-tests/chainsaw/helm/helm-chart-test/chainsaw-test.yaml index bd58f73c..64cd4744 100644 --- a/k8s-tests/chainsaw/helm/helm-chart-test/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/helm/helm-chart-test/chainsaw-test.yaml @@ -27,6 +27,14 @@ spec: assert: 240s exec: 240s steps: + - name: reset-state + try: + - script: + timeout: 30s + content: | + ../skyhook-cli reset helm-test-skyhook --confirm 2>/dev/null || true + echo "✓ State reset complete" + - try: - script: content: | @@ -60,6 +68,7 @@ spec: - script: content: | ## Cleanup deployment policy test resources + ../skyhook-cli reset helm-test-skyhook --confirm 2>/dev/null || true kubectl delete skyhook helm-test-skyhook --ignore-not-found || true kubectl delete deploymentpolicy helm-test-policy -n skyhook --ignore-not-found || true - script: diff --git a/k8s-tests/chainsaw/helm/helm-webhook-test/chainsaw-test.yaml b/k8s-tests/chainsaw/helm/helm-webhook-test/chainsaw-test.yaml index 2f6a68a7..7b3a080e 100644 --- a/k8s-tests/chainsaw/helm/helm-webhook-test/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/helm/helm-webhook-test/chainsaw-test.yaml @@ -34,6 +34,16 @@ spec: assert: 180s exec: 180s steps: + - name: reset-state + try: + - script: + timeout: 30s + content: | + ../skyhook-cli reset skyhook-valid-policy --confirm 2>/dev/null || true + ../skyhook-cli reset skyhook-missing-policy --confirm 2>/dev/null || true + ../skyhook-cli reset invalid-skyhook --confirm 2>/dev/null || true + echo "✓ State reset complete" + - try: - script: content: | diff --git a/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml index fbddbe64..db701dfe 100644 --- a/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/cleanup-pods/chainsaw-test.yaml @@ -32,7 +32,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh cleanup-pods "node-role.kubernetes.io/control-plane notin ()" + ../skyhook-cli reset cleanup-pods --confirm 2>/dev/null || true - create: file: setup.yaml diff --git a/k8s-tests/chainsaw/skyhook/cleanup-webhook.sh b/k8s-tests/chainsaw/skyhook/cleanup-webhook.sh deleted file mode 100755 index cca9575c..00000000 --- a/k8s-tests/chainsaw/skyhook/cleanup-webhook.sh +++ /dev/null @@ -1,32 +0,0 @@ -#!/bin/bash - -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -## This script will cleanup the webhook secret and configurations -## Some if you want to remove -NAMESPACE=${NAMESPACE:-skyhook} -WEBHOOK_SECRET_NAME=${WEBHOOK_SECRET_NAME:-webhook-cert} -VALIDATING_WEBHOOK_CONFIGURATION_NAME=${VALIDATING_WEBHOOK_CONFIGURATION_NAME:-skyhook-operator-validating-webhook} -MUTATING_WEBHOOK_CONFIGURATION_NAME=${MUTATING_WEBHOOK_CONFIGURATION_NAME:-skyhook-operator-mutating-webhook} - -# Delete the webhook secret -kubectl delete secret -n $NAMESPACE $WEBHOOK_SECRET_NAME - -# Get the webhook configurations -kubectl delete validatingwebhookconfiguration $VALIDATING_WEBHOOK_CONFIGURATION_NAME -kubectl delete mutatingwebhookconfiguration $MUTATING_WEBHOOK_CONFIGURATION_NAME diff --git a/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml index aff4e67f..dd4cce34 100644 --- a/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/config-skyhook/chainsaw-test.yaml @@ -45,7 +45,7 @@ spec: - script: content: | ## remove annotation/labels from last run - ../rest_test.sh config-skyhook + ../skyhook-cli reset config-skyhook --confirm 2>/dev/null || true - create: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml index 5fa41ead..b1d18a2f 100644 --- a/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/delete-skyhook/chainsaw-test.yaml @@ -28,7 +28,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh delete-skyhook + ../skyhook-cli reset delete-skyhook --confirm 2>/dev/null || true - apply: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml index 15771724..9f6b646c 100644 --- a/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/depends-on/chainsaw-test.yaml @@ -30,7 +30,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh depends-on + ../skyhook-cli reset depends-on --confirm 2>/dev/null || true - create: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml index 4833de86..2b4ce149 100644 --- a/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/failure-skyhook/chainsaw-test.yaml @@ -39,7 +39,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh failure-skyhook + ../skyhook-cli reset failure-skyhook --confirm 2>/dev/null || true - create: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml index a6e03ba7..b4b2c5bc 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt-grouping/chainsaw-test.yaml @@ -43,7 +43,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh interrupt-grouping + ../skyhook-cli reset interrupt-grouping --confirm 2>/dev/null || true - create: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml index 569ce1b1..9d86858f 100644 --- a/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/interrupt/chainsaw-test.yaml @@ -45,7 +45,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh interrupt + ../skyhook-cli reset interrupt --confirm 2>/dev/null || true - create: file: pod.yaml - wait: diff --git a/k8s-tests/chainsaw/skyhook/rest_test.sh b/k8s-tests/chainsaw/skyhook/rest_test.sh deleted file mode 100755 index 8bf60c6d..00000000 --- a/k8s-tests/chainsaw/skyhook/rest_test.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash -x - -# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -## helper script -## clears labels and annotate from nodes with the prefix for tests - -## Usage: -## ./rest_test.sh [selector] -## Examples: -## ./rest_test.sh my-skyhook # uses default selector -## ./rest_test.sh my-skyhook "skyhook.nvidia.com/test-node=skyhooke2e" # label selector -## ./rest_test.sh my-skyhook "node-role.kubernetes.io/control-plane notin ()" # expression selector - -## the name of the skyhook to reset -skyhook="$1" - -## the selector to use for finding nodes (defaults to test node selector) -selector="${2:-skyhook.nvidia.com/test-node=skyhooke2e}" - -for node in $(kubectl get nodes -l "$selector" -o name); do - kubectl annotate ${node} skyhook.nvidia.com/nodeState_${skyhook}- - kubectl annotate ${node} skyhook.nvidia.com/status_${skyhook}- - kubectl annotate ${node} skyhook.nvidia.com/cordon_${skyhook}- - kubectl annotate ${node} skyhook.nvidia.com/version_${skyhook}- - kubectl label ${node} skyhook.nvidia.com/status_${skyhook}- -done diff --git a/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml index 285c27c5..d7e9f14a 100644 --- a/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/runtime-required/chainsaw-test.yaml @@ -38,7 +38,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh runtime-required + ../skyhook-cli reset runtime-required --confirm 2>/dev/null || true ## add taints to nodes ../nodes_add_taint.sh all skyhook.nvidia.com=runtime-required:NoSchedule skyhook.nvidia.com/test-node=skyhooke2e - create: diff --git a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml index 48a8a08c..cb279ba0 100644 --- a/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-skyhook/chainsaw-test.yaml @@ -28,7 +28,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh simple-skyhook + ../skyhook-cli reset simple-skyhook --confirm 2>/dev/null || true - apply: ## set a limit range in the namespace file: limitrange.yaml - apply: diff --git a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml index 9bc8349d..9f740e9a 100644 --- a/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/simple-update-skyhook/chainsaw-test.yaml @@ -38,7 +38,7 @@ spec: - script: content: | ## remove annotation/labels from last run - ../rest_test.sh simple-update-skyhook + ../skyhook-cli reset simple-update-skyhook --confirm 2>/dev/null || true - apply: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml index 45da93d9..a248e44c 100644 --- a/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/skyhook-upgrade/chainsaw-test.yaml @@ -35,7 +35,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh skyhook-upgrade + ../skyhook-cli reset skyhook-upgrade --confirm 2>/dev/null || true - script: content: | ## Set up old format annotation to test migration diff --git a/k8s-tests/chainsaw/skyhook/strict_order/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/strict_order/chainsaw-test.yaml index 71b03f3a..a9579959 100644 --- a/k8s-tests/chainsaw/skyhook/strict_order/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/strict_order/chainsaw-test.yaml @@ -34,10 +34,10 @@ spec: - script: content: | ## remove annotation/labels from last run - ../rest_test.sh strict-order-skyhook-zzz - ../rest_test.sh strict-order-skyhook-b - ../rest_test.sh strict-order-skyhook-c - ../rest_test.sh strict-order-skyhook-d + ../skyhook-cli reset strict-order-skyhook-zzz --confirm 2>/dev/null || true + ../skyhook-cli reset strict-order-skyhook-b --confirm 2>/dev/null || true + ../skyhook-cli reset strict-order-skyhook-c --confirm 2>/dev/null || true + ../skyhook-cli reset strict-order-skyhook-d --confirm 2>/dev/null || true - apply: file: skyhook.yaml - script: diff --git a/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml index ab0eab43..d52807d2 100644 --- a/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/taint-scheduling/chainsaw-test.yaml @@ -38,7 +38,7 @@ spec: - script: content: | ## remove annotation from last run - ../rest_test.sh taint-scheduling + ../skyhook-cli reset taint-scheduling --confirm 2>/dev/null || true ## add taints to nodes ../nodes_add_taint.sh all nvidia.com/gpu:NoSchedule skyhook.nvidia.com/test-node=skyhooke2e - create: diff --git a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml index 591b5b7e..026e6bfe 100644 --- a/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/uninstall-upgrade-skyhook/chainsaw-test.yaml @@ -45,7 +45,7 @@ spec: - script: content: | ## remove annotation/labels from last run - ../rest_test.sh uninstall-upgrade-skyhook + ../skyhook-cli reset uninstall-upgrade-skyhook --confirm 2>/dev/null || true - create: file: skyhook.yaml - assert: diff --git a/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml b/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml index acb20c07..ca426601 100644 --- a/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml +++ b/k8s-tests/chainsaw/skyhook/validate-packages/chainsaw-test.yaml @@ -41,7 +41,7 @@ spec: - script: content: | ## remove annotation/labels from last run - ../rest_test.sh validate-packages + ../skyhook-cli reset validate-packages --confirm 2>/dev/null || true - apply: file: skyhook.yaml - sleep: diff --git a/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml b/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml index aa430564..b8b86ce7 100644 --- a/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/interrupt/chainsaw-test.yaml @@ -29,7 +29,7 @@ spec: - script: content: | ## remove annotation from last run - ../../chainsaw/skyhook/rest_test.sh interrupt-agent-operator + ../../../../operator/bin/skyhook reset interrupt-agent-operator --confirm 2>/dev/null || true - script: content: | ## reinstall the debug pod in case it was deleted diff --git a/k8s-tests/operator-agent/simple/chainsaw-test.yaml b/k8s-tests/operator-agent/simple/chainsaw-test.yaml index f78bfd30..2f5ed23a 100644 --- a/k8s-tests/operator-agent/simple/chainsaw-test.yaml +++ b/k8s-tests/operator-agent/simple/chainsaw-test.yaml @@ -29,7 +29,7 @@ spec: - script: content: | ## remove annotation from last run - ../../chainsaw/skyhook/rest_test.sh simple-agent-operator + ../../../../operator/bin/skyhook reset simple-agent-operator --confirm 2>/dev/null || true - script: content: | ## reinstall the debug pod in case it was deleted diff --git a/operator/Makefile b/operator/Makefile index acac5812..b8e3ef7c 100644 --- a/operator/Makefile +++ b/operator/Makefile @@ -142,7 +142,7 @@ $(REPORTING): mkdir -p $@ .PHONY: test -test:: reporting manifests generate fmt vet lint unit-tests e2e-tests cli-e2e-tests helm-tests operator-agent-tests ## Run all tests. +test:: reporting manifests generate fmt vet lint build-cli unit-tests e2e-tests cli-e2e-tests helm-tests operator-agent-tests ## Run all tests. ifndef CI ## we double define test so we can do thing different if in ci vs local @@ -168,7 +168,7 @@ unit-tests: reporting manifests generate envtest ginkgo kill ## Run unit tests. ## https://kyverno.github.io/chainsaw/latest/configuration/file/ CHAINSAW_ARGS:=--exec-timeout 30s --parallel 1 -e2e-tests: chainsaw install run ## Run end to end tests. +e2e-tests: chainsaw install run ensure-test-symlinks ## Run end to end tests. ## requires a cluster to be running with access ## locally use kind to create clusters ## in ci, the plan current is to have a real cluster, and create a node pool for testing @@ -176,17 +176,33 @@ e2e-tests: chainsaw install run ## Run end to end tests. $(MAKE) kill go tool covdata textfmt -i=$(REPORTING)/int -o reporting/int.coverprofile -cli-e2e-tests: chainsaw install run build-cli-with-coverage ## Run CLI end to end tests with coverage. +cli-e2e-tests: chainsaw install run build-cli-with-coverage ensure-test-symlinks ## Run CLI end to end tests with coverage. ## Tests the kubectl-skyhook CLI commands against a real cluster ## Requires nodes labeled with skyhook.nvidia.com/test-node=skyhooke2e @mkdir -p $(REPORTING)/cli-cover + @# Ensure bin/skyhook exists (build-cli-with-coverage only creates bin/skyhook-cover) + @if [ ! -f bin/skyhook ]; then \ + if [ -f bin/skyhook-cover ]; then \ + cp bin/skyhook-cover bin/skyhook; \ + else \ + echo "Error: bin/skyhook-cover not found. Run build-cli-with-coverage first."; \ + exit 1; \ + fi; \ + fi @# Replace CLI binary with a wrapper that sets GOCOVERDIR for coverage collection @mv bin/skyhook bin/skyhook.orig 2>/dev/null || true @echo '#!/bin/bash' > bin/skyhook - @echo 'SCRIPT_DIR="$$(cd "$$(dirname "$${BASH_SOURCE[0]}")" && pwd)"' >> bin/skyhook + @echo '# Resolve actual script location (handles symlinks)' >> bin/skyhook + @echo 'if [ -L "$${BASH_SOURCE[0]}" ]; then' >> bin/skyhook + @echo ' SCRIPT_DIR="$$(cd "$$(dirname "$$(readlink -f "$${BASH_SOURCE[0]}")")" && pwd)"' >> bin/skyhook + @echo 'else' >> bin/skyhook + @echo ' SCRIPT_DIR="$$(cd "$$(dirname "$${BASH_SOURCE[0]}")" && pwd)"' >> bin/skyhook + @echo 'fi' >> bin/skyhook @echo 'export GOCOVERDIR="$$SCRIPT_DIR/../reporting/cli-cover"' >> bin/skyhook @echo 'exec "$$SCRIPT_DIR/skyhook-cover" "$$@"' >> bin/skyhook @chmod +x bin/skyhook + @# Update symlinks to point to the new wrapper script + @$(MAKE) ensure-test-symlinks $(CHAINSAW) test --test-dir ../k8s-tests/chainsaw/cli $(CHAINSAW_ARGS) $(MAKE) kill @# Restore original CLI binary @@ -201,7 +217,7 @@ cli-e2e-tests: chainsaw install run build-cli-with-coverage ## Run CLI end to en echo "ℹ️ No CLI coverage data (this is OK - CLI is covered by unit tests)"; \ fi -helm-tests: helm chainsaw +helm-tests: helm chainsaw ensure-test-symlinks ## Here we need to run the operator so that the old CRD can deleted along with ## any leftover SCRs. Without this the SCRs may have finalizers which rely on ## the operator and will cause the deletion and tests to hang. @@ -218,13 +234,14 @@ operator-agent-tests: chainsaw install ## Run operator agent tests. ## ../k8s-tests/operator-agent/setup.sh kind-worker teardown go tool covdata textfmt -i=$(REPORTING)/int -o reporting/int.coverprofile -deployment-policy-tests: chainsaw install run ## Run all deployment policy E2E tests (requires 15-node cluster). +deployment-policy-tests: chainsaw install run ensure-test-symlinks ## Run all deployment policy E2E tests (requires 15-node cluster). ## requires a 15-node cluster to be running with access ## use 'make create-deployment-policy-cluster' to create the cluster $(CHAINSAW) test --test-dir ../k8s-tests/chainsaw/deployment-policy $(CHAINSAW_ARGS) $(MAKE) kill go tool covdata textfmt -i=$(REPORTING)/int -o reporting/int.coverprofile +.PHONY: create-deployment-policy-cluster create-deployment-policy-cluster: ## Create a 15-node Kind cluster for deployment policy tests. @echo "🔧 Creating 15-node Kind cluster for deployment policy tests..." @# Fix for Podman: increase kernel keyring quota @@ -239,8 +256,12 @@ create-deployment-policy-cluster: ## Create a 15-node Kind cluster for deploymen fi; \ fi kind delete cluster --name skyhook-dp-test 2>/dev/null || true - kind create cluster --name skyhook-dp-test --config ../k8s-tests/chainsaw/deployment-policy/kind-config.yaml - @echo "✅ Cluster created! Use 'kubectl config use-context kind-skyhook-dp-test' to switch to it." + kind create cluster --name skyhook-dp-test --config ../k8s-tests/chainsaw/deployment-policy/kind-config.yaml --image=kindest/node:v$(KIND_VERSION) + +.PHONY: delete-deployment-policy-cluster +delete-deployment-policy-cluster: ## Delete the 15-node Kind cluster for deployment policy tests. + kind delete cluster --name skyhook-dp-test 2>/dev/null || true + kubectl config use-context kind-kind || true ifeq ($(DOCKER_CMD),docker) DOCKER_AUTH_FILE=${HOME}/.docker/config.json @@ -302,13 +323,26 @@ build: build-manager build-cli ## Build manager and CLI binaries. build-manager: manifests generate fmt vet lint ## Build manager binary. go build $(GOFLAGS) $(GO_LDFLAGS) -o bin/manager cmd/manager/main.go +.PHONY: ensure-test-symlinks +ensure-test-symlinks: ## Ensure symlinks exist in test directories (idempotent). + @# Create symlinks in test directories for easy access + @# This target is idempotent and can be called even if binaries are cached + @# Note: symlinks will be created even if bin/skyhook doesn't exist yet (it will be created by build targets) + @for dir in cli deployment-policy skyhook helm; do \ + ln -sf ../../../operator/bin/skyhook ../k8s-tests/chainsaw/$$dir/skyhook-cli 2>/dev/null || true; \ + done + .PHONY: build-cli CLI_LDFLAGS := -ldflags "-X github.com/NVIDIA/skyhook/operator/internal/version.VERSION=$(VERSION) -X github.com/NVIDIA/skyhook/operator/internal/version.GIT_SHA=$(GIT_SHA)" -build-cli: ## Build CLI binary. +build-cli: ensure-test-symlinks ## Build CLI binary. go build $(GOFLAGS) $(CLI_LDFLAGS) -o bin/skyhook cmd/cli/main.go -build-cli-with-coverage: ## Build CLI binary with coverage instrumentation. +build-cli-with-coverage: ensure-test-symlinks ## Build CLI binary with coverage instrumentation. go build $(GOFLAGS) -cover $(CLI_LDFLAGS) -o bin/skyhook-cover cmd/cli/main.go + @# Ensure bin/skyhook exists (copy from cover binary if not cached) so symlinks work + @if [ ! -f bin/skyhook ]; then \ + cp bin/skyhook-cover bin/skyhook; \ + fi .PHONY: run ENABLE_WEBHOOKS?=false diff --git a/operator/api/v1alpha1/zz_generated.deepcopy.go b/operator/api/v1alpha1/zz_generated.deepcopy.go index b4bb334c..63e6652d 100644 --- a/operator/api/v1alpha1/zz_generated.deepcopy.go +++ b/operator/api/v1alpha1/zz_generated.deepcopy.go @@ -1,5 +1,3 @@ -//go:build !ignore_autogenerated - /* * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. * SPDX-License-Identifier: Apache-2.0 @@ -18,6 +16,8 @@ * limitations under the License. */ +//go:build !ignore_autogenerated + // Code generated by controller-gen. DO NOT EDIT. package v1alpha1 diff --git a/operator/cmd/cli/app/cli.go b/operator/cmd/cli/app/cli.go index 9c471a61..0e48d1a3 100644 --- a/operator/cmd/cli/app/cli.go +++ b/operator/cmd/cli/app/cli.go @@ -73,6 +73,7 @@ func NewSkyhookCommand(ctx *context.CLIContext) *cobra.Command { NewResumeCmd(ctx), NewDisableCmd(ctx), NewEnableCmd(ctx), + NewResetCmd(ctx), ) return skyhookCmd diff --git a/operator/cmd/cli/app/reset.go b/operator/cmd/cli/app/reset.go new file mode 100644 index 00000000..d301ec90 --- /dev/null +++ b/operator/cmd/cli/app/reset.go @@ -0,0 +1,218 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app + +import ( + "bufio" + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/spf13/cobra" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/NVIDIA/skyhook/operator/api/v1alpha1" + "github.com/NVIDIA/skyhook/operator/internal/cli/client" + cliContext "github.com/NVIDIA/skyhook/operator/internal/cli/context" + "github.com/NVIDIA/skyhook/operator/internal/cli/utils" +) + +const ( + nodeStateAnnotationPrefix = v1alpha1.METADATA_PREFIX + "/nodeState_" + statusAnnotationPrefix = v1alpha1.METADATA_PREFIX + "/status_" + cordonAnnotationPrefix = v1alpha1.METADATA_PREFIX + "/cordon_" + versionAnnotationPrefix = v1alpha1.METADATA_PREFIX + "/version_" + statusLabelPrefix = v1alpha1.METADATA_PREFIX + "/status_" +) + +// resetOptions holds the options for the reset command +type resetOptions struct { + confirm bool +} + +// NewResetCmd creates the reset command +func NewResetCmd(ctx *cliContext.CLIContext) *cobra.Command { + opts := &resetOptions{} + + cmd := &cobra.Command{ + Use: "reset ", + Short: "Reset all nodes for a Skyhook", + Long: `Reset all package state on all nodes for a specific Skyhook, forcing a complete re-run. + +This command removes all Skyhook state from all nodes that have state for the +specified Skyhook, causing the operator to re-execute all packages from the beginning. + +Unlike 'node reset' which resets specific nodes, 'skyhook reset' resets ALL nodes +that have state for the specified Skyhook.`, + Example: ` # Reset all nodes for gpu-init Skyhook + kubectl skyhook reset gpu-init --confirm + + # Preview changes without applying (dry-run) + kubectl skyhook reset gpu-init --dry-run`, + Args: cobra.ExactArgs(1), + RunE: func(cmd *cobra.Command, args []string) error { + skyhookName := args[0] + + clientFactory := client.NewFactory(ctx.GlobalFlags.ConfigFlags) + kubeClient, err := clientFactory.Client() + if err != nil { + return fmt.Errorf("initializing kubernetes client: %w", err) + } + + return runReset(cmd.Context(), cmd, kubeClient, skyhookName, opts, ctx) + }, + } + + cmd.Flags().BoolVarP(&opts.confirm, "confirm", "y", false, "Skip confirmation prompt") + + return cmd +} + +func runReset(ctx context.Context, cmd *cobra.Command, kubeClient *client.Client, skyhookName string, opts *resetOptions, cliCtx *cliContext.CLIContext) error { + // Get all nodes + nodeList, err := kubeClient.Kubernetes().CoreV1().Nodes().List(ctx, metav1.ListOptions{}) + if err != nil { + return fmt.Errorf("listing nodes: %w", err) + } + + // Find nodes that have the specified Skyhook annotation + annotationKey := nodeStateAnnotationPrefix + skyhookName + nodesToReset := make([]string, 0) + nodeStates := make(map[string]v1alpha1.NodeState) + + for _, node := range nodeList.Items { + annotation, ok := node.Annotations[annotationKey] + if !ok { + continue + } + + var nodeState v1alpha1.NodeState + if err := json.Unmarshal([]byte(annotation), &nodeState); err != nil { + if cliCtx.GlobalFlags.Verbose { + _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: skipping node %q - invalid annotation: %v\n", node.Name, err) + } + continue + } + + nodesToReset = append(nodesToReset, node.Name) + nodeStates[node.Name] = nodeState + } + + if len(nodesToReset) == 0 { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "No nodes have state for Skyhook %q\n", skyhookName) + return nil + } + + // Print summary + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Skyhook: %s\n", skyhookName) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Nodes to reset (%d):\n", len(nodesToReset)) + for _, nodeName := range nodesToReset { + nodeState := nodeStates[nodeName] + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " - %s (%d packages)\n", nodeName, len(nodeState)) + } + + // Dry run check + if cliCtx.GlobalFlags.DryRun { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\n[dry-run] No changes applied\n") + return nil + } + + // Confirmation + if !opts.confirm { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nThis will remove ALL package state for Skyhook %q on %d node(s).\n", skyhookName, len(nodesToReset)) + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "All packages will re-run from the beginning.\n") + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Continue? [y/N]: ") + + reader := bufio.NewReader(cmd.InOrStdin()) + response, err := reader.ReadString('\n') + if err != nil { + return fmt.Errorf("reading confirmation: %w", err) + } + + response = strings.ToLower(strings.TrimSpace(response)) + if response != "y" && response != "yes" { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "Aborted\n") + return nil + } + } + + // Apply changes - clear all skyhook-related annotations and labels + var updateErrors []string + successCount := 0 + + for _, nodeName := range nodesToReset { + // Clear all annotations and labels for this skyhook + annotationsToRemove := []string{ + nodeStateAnnotationPrefix + skyhookName, + statusAnnotationPrefix + skyhookName, + cordonAnnotationPrefix + skyhookName, + versionAnnotationPrefix + skyhookName, + } + labelsToRemove := []string{ + statusLabelPrefix + skyhookName, + } + + // Try to remove the main nodeState annotation first - this is the critical one + mainAnnotationKey := nodeStateAnnotationPrefix + skyhookName + if err := utils.RemoveNodeAnnotation(ctx, kubeClient.Kubernetes(), nodeName, mainAnnotationKey); err != nil { + updateErrors = append(updateErrors, fmt.Sprintf("%s: failed to remove nodeState annotation: %v", nodeName, err)) + continue + } + + // Remove other annotations (non-critical, so we don't fail if they don't exist) + for _, annKey := range annotationsToRemove { + if annKey == mainAnnotationKey { + continue // Already removed + } + if err := utils.RemoveNodeAnnotation(ctx, kubeClient.Kubernetes(), nodeName, annKey); err != nil { + // Don't fail if annotation doesn't exist - just log it + if cliCtx.GlobalFlags.Verbose { + _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: failed to remove annotation %q from node %q: %v\n", annKey, nodeName, err) + } + } + } + + // Remove labels (non-critical, so we don't fail if they don't exist) + for _, labelKey := range labelsToRemove { + if err := utils.RemoveNodeLabel(ctx, kubeClient.Kubernetes(), nodeName, labelKey); err != nil { + // Don't fail if label doesn't exist - just log it + if cliCtx.GlobalFlags.Verbose { + _, _ = fmt.Fprintf(cmd.ErrOrStderr(), "Warning: failed to remove label %q from node %q: %v\n", labelKey, nodeName, err) + } + } + } + + successCount++ + } + + // Print results + if len(updateErrors) > 0 { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nErrors resetting some nodes:\n") + for _, e := range updateErrors { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), " - %s\n", e) + } + } + + if successCount > 0 { + _, _ = fmt.Fprintf(cmd.OutOrStdout(), "\nSuccessfully reset %d node(s) for Skyhook %q\n", successCount, skyhookName) + } + + return nil +} diff --git a/operator/cmd/cli/app/reset_test.go b/operator/cmd/cli/app/reset_test.go new file mode 100644 index 00000000..206c49e4 --- /dev/null +++ b/operator/cmd/cli/app/reset_test.go @@ -0,0 +1,323 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package app + +import ( + "bytes" + gocontext "context" + "encoding/json" + "fmt" + "strings" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/spf13/cobra" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + + "github.com/NVIDIA/skyhook/operator/api/v1alpha1" + "github.com/NVIDIA/skyhook/operator/internal/cli/client" + "github.com/NVIDIA/skyhook/operator/internal/cli/context" +) + +// Helper function to create a test node with nodeState annotation +func createTestNodeWithState(kube *fake.Clientset, nodeName, skyhookName string) (string, error) { + nodeState := v1alpha1.NodeState{ + "pkg1|1.0": {Name: "pkg1", Version: "1.0", Stage: v1alpha1.StageApply, State: v1alpha1.StateComplete}, + } + nodeStateJSON, _ := json.Marshal(nodeState) + annotationKey := nodeStateAnnotationPrefix + skyhookName + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: nodeName, + Annotations: map[string]string{ + annotationKey: string(nodeStateJSON), + }, + }, + } + _, err := kube.CoreV1().Nodes().Create(gocontext.Background(), node, metav1.CreateOptions{}) + if err != nil { + return "", err + } + return annotationKey, nil +} + +// Helper function to verify annotation was removed +func verifyAnnotationRemoved(kube *fake.Clientset, nodeName, annotationKey string) error { + updatedNode, err := kube.CoreV1().Nodes().Get(gocontext.Background(), nodeName, metav1.GetOptions{}) + if err != nil { + return err + } + _, exists := updatedNode.Annotations[annotationKey] + if exists { + return fmt.Errorf("annotation %s still exists on node %s", annotationKey, nodeName) + } + return nil +} + +var _ = Describe("Reset Command", func() { + Describe("NewResetCmd", func() { + It("should create command with correct properties", func() { + ctx := context.NewCLIContext(nil) + cmd := NewResetCmd(ctx) + + Expect(cmd.Use).To(Equal("reset ")) + Expect(cmd.Short).To(ContainSubstring("Reset all nodes")) + }) + + It("should require exactly one argument", func() { + ctx := context.NewCLIContext(nil) + cmd := NewResetCmd(ctx) + + err := cmd.Args(cmd, []string{}) + Expect(err).To(HaveOccurred()) + + err = cmd.Args(cmd, []string{"skyhook1", "skyhook2"}) + Expect(err).To(HaveOccurred()) + }) + + It("should have confirm flag", func() { + ctx := context.NewCLIContext(nil) + cmd := NewResetCmd(ctx) + + confirmFlag := cmd.Flags().Lookup("confirm") + Expect(confirmFlag).NotTo(BeNil()) + Expect(confirmFlag.Shorthand).To(Equal("y")) + }) + }) + + Describe("runReset", func() { + var ( + output *bytes.Buffer + mockKube *fake.Clientset + kubeClient *client.Client + cmd *cobra.Command + cliCtx *context.CLIContext + skyhookName = "my-skyhook" + ) + + BeforeEach(func() { + output = &bytes.Buffer{} + mockKube = fake.NewClientset() + kubeClient = client.NewWithClientsAndConfig(mockKube, nil, nil) + cliCtx = context.NewCLIContext(context.NewCLIConfig(context.WithOutputWriter(output))) + + cmd = &cobra.Command{} + cmd.SetOut(output) + cmd.SetErr(output) + cmd.SetIn(strings.NewReader("y\n")) // Default to 'yes' for prompts + }) + + It("should show no nodes when skyhook has no state", func() { + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + }, + } + _, err := mockKube.CoreV1().Nodes().Create(gocontext.Background(), node, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + opts := &resetOptions{confirm: true} + err = runReset(gocontext.Background(), cmd, kubeClient, "my-skyhook", opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("No nodes have state")) + }) + + It("should reset all nodes with state for skyhook", func() { + nodeState := v1alpha1.NodeState{ + "pkg1|1.0": {Name: "pkg1", Version: "1.0", Stage: v1alpha1.StageApply, State: v1alpha1.StateComplete}, + } + nodeStateJSON, _ := json.Marshal(nodeState) + annotationKey := nodeStateAnnotationPrefix + skyhookName + statusAnnotationKey := statusAnnotationPrefix + skyhookName + cordonAnnotationKey := cordonAnnotationPrefix + skyhookName + versionAnnotationKey := versionAnnotationPrefix + skyhookName + statusLabelKey := statusLabelPrefix + skyhookName + + node1 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Annotations: map[string]string{ + annotationKey: string(nodeStateJSON), + statusAnnotationKey: "complete", + cordonAnnotationKey: "true", + versionAnnotationKey: "1.0.0", + }, + Labels: map[string]string{ + statusLabelKey: "complete", + }, + }, + } + node2 := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-2", + Annotations: map[string]string{ + annotationKey: string(nodeStateJSON), + }, + }, + } + _, err := mockKube.CoreV1().Nodes().Create(gocontext.Background(), node1, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + _, err = mockKube.CoreV1().Nodes().Create(gocontext.Background(), node2, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + opts := &resetOptions{confirm: true} + err = runReset(gocontext.Background(), cmd, kubeClient, skyhookName, opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("Successfully reset 2 node(s)")) + + // Verify all annotations and labels were removed from node1 + updatedNode1, err := mockKube.CoreV1().Nodes().Get(gocontext.Background(), "worker-1", metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + _, exists := updatedNode1.Annotations[annotationKey] + Expect(exists).To(BeFalse()) + _, exists = updatedNode1.Annotations[statusAnnotationKey] + Expect(exists).To(BeFalse()) + _, exists = updatedNode1.Annotations[cordonAnnotationKey] + Expect(exists).To(BeFalse()) + _, exists = updatedNode1.Annotations[versionAnnotationKey] + Expect(exists).To(BeFalse()) + _, exists = updatedNode1.Labels[statusLabelKey] + Expect(exists).To(BeFalse()) + + // Verify annotation was removed from node2 + updatedNode2, err := mockKube.CoreV1().Nodes().Get(gocontext.Background(), "worker-2", metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + _, exists = updatedNode2.Annotations[annotationKey] + Expect(exists).To(BeFalse()) + }) + + It("should respect dry-run flag", func() { + nodeState := v1alpha1.NodeState{ + "pkg1|1.0": {Name: "pkg1", Version: "1.0", Stage: v1alpha1.StageApply, State: v1alpha1.StateComplete}, + } + nodeStateJSON, _ := json.Marshal(nodeState) + annotationKey := nodeStateAnnotationPrefix + skyhookName + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Annotations: map[string]string{ + annotationKey: string(nodeStateJSON), + }, + }, + } + _, err := mockKube.CoreV1().Nodes().Create(gocontext.Background(), node, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + // Enable dry-run + cliCtx.GlobalFlags.DryRun = true + + opts := &resetOptions{confirm: true} + err = runReset(gocontext.Background(), cmd, kubeClient, skyhookName, opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("[dry-run]")) + + // Verify annotation was NOT removed + updatedNode, err := mockKube.CoreV1().Nodes().Get(gocontext.Background(), "worker-1", metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + _, exists := updatedNode.Annotations[annotationKey] + Expect(exists).To(BeTrue()) + }) + + It("should prompt for confirmation when confirm flag is not set", func() { + annotationKey, err := createTestNodeWithState(mockKube, "worker-1", skyhookName) + Expect(err).NotTo(HaveOccurred()) + + opts := &resetOptions{confirm: false} + err = runReset(gocontext.Background(), cmd, kubeClient, skyhookName, opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("Continue?")) + + // Verify annotation was removed after confirmation + err = verifyAnnotationRemoved(mockKube, "worker-1", annotationKey) + Expect(err).NotTo(HaveOccurred()) + }) + + It("should abort when user declines confirmation", func() { + nodeState := v1alpha1.NodeState{ + "pkg1|1.0": {Name: "pkg1", Version: "1.0", Stage: v1alpha1.StageApply, State: v1alpha1.StateComplete}, + } + nodeStateJSON, _ := json.Marshal(nodeState) + annotationKey := nodeStateAnnotationPrefix + skyhookName + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Annotations: map[string]string{ + annotationKey: string(nodeStateJSON), + }, + }, + } + _, err := mockKube.CoreV1().Nodes().Create(gocontext.Background(), node, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + // Set input to 'n' for no + cmd.SetIn(strings.NewReader("n\n")) + + opts := &resetOptions{confirm: false} + err = runReset(gocontext.Background(), cmd, kubeClient, skyhookName, opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("Aborted")) + + // Verify annotation was NOT removed + updatedNode, err := mockKube.CoreV1().Nodes().Get(gocontext.Background(), "worker-1", metav1.GetOptions{}) + Expect(err).NotTo(HaveOccurred()) + _, exists := updatedNode.Annotations[annotationKey] + Expect(exists).To(BeTrue()) + }) + + It("should handle nodes with invalid annotation gracefully", func() { + annotationKey := nodeStateAnnotationPrefix + skyhookName + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-1", + Annotations: map[string]string{ + annotationKey: "invalid-json", + }, + }, + } + _, err := mockKube.CoreV1().Nodes().Create(gocontext.Background(), node, metav1.CreateOptions{}) + Expect(err).NotTo(HaveOccurred()) + + opts := &resetOptions{confirm: true} + err = runReset(gocontext.Background(), cmd, kubeClient, skyhookName, opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("No nodes have state")) + }) + + It("should continue even if some annotations/labels don't exist", func() { + // Node only has nodeState annotation, not the others + annotationKey, err := createTestNodeWithState(mockKube, "worker-1", skyhookName) + Expect(err).NotTo(HaveOccurred()) + + opts := &resetOptions{confirm: true} + err = runReset(gocontext.Background(), cmd, kubeClient, skyhookName, opts, cliCtx) + Expect(err).NotTo(HaveOccurred()) + Expect(output.String()).To(ContainSubstring("Successfully reset 1 node")) + + // Verify nodeState annotation was removed + err = verifyAnnotationRemoved(mockKube, "worker-1", annotationKey) + Expect(err).NotTo(HaveOccurred()) + }) + }) +}) diff --git a/operator/internal/controller/cluster_state_v2.go b/operator/internal/controller/cluster_state_v2.go index e0c18de9..2d8f99a2 100644 --- a/operator/internal/controller/cluster_state_v2.go +++ b/operator/internal/controller/cluster_state_v2.go @@ -798,8 +798,15 @@ func IntrospectNode(node wrapper.SkyhookNode, skyhook SkyhookNodes) bool { if node.IsComplete() { node.SetStatus(v1alpha1.StatusComplete) } else { - // node will update to it's correct status on next reconcile - node.SetStatus(v1alpha1.StatusUnknown) + // In normal operation, all nodes are in at least the default compartment + // If compartments exist, node is waiting for its batch; otherwise Unknown (error state) + compartments := skyhook.GetCompartments() + if len(compartments) > 0 { + node.SetStatus(v1alpha1.StatusWaiting) + } else { + // No compartments exist (error state, e.g., deployment policy missing) + node.SetStatus(v1alpha1.StatusUnknown) + } } return node.Changed() } @@ -813,6 +820,16 @@ func IntrospectNode(node wrapper.SkyhookNode, skyhook SkyhookNodes) bool { node.SetStatus(v1alpha1.StatusUnknown) } + // If node is Unknown and not complete, check if it's waiting for its batch + // In normal operation, all nodes are in at least the default compartment + if nodeStatus == v1alpha1.StatusUnknown && !node.IsComplete() { + compartments := skyhook.GetCompartments() + if len(compartments) > 0 { + node.SetStatus(v1alpha1.StatusWaiting) + return node.Changed() + } + } + return node.Changed() } diff --git a/operator/internal/controller/cluster_state_v2_test.go b/operator/internal/controller/cluster_state_v2_test.go index 6c6082ef..9621c190 100644 --- a/operator/internal/controller/cluster_state_v2_test.go +++ b/operator/internal/controller/cluster_state_v2_test.go @@ -1079,6 +1079,55 @@ var _ = Describe("CleanupRemovedNodes", func() { Expect(changed).To(BeTrue()) }) + It("should set Unknown nodes to Waiting when in compartments", func() { + skyhookNode, err := wrapper.NewSkyhookNode(testNode, testSkyhook) + Expect(err).NotTo(HaveOccurred()) + skyhookNode.SetStatus(v1alpha1.StatusUnknown) + + // Create a compartment and add the node + compartment := wrapper.NewCompartmentWrapper(&v1alpha1.Compartment{ + Name: v1alpha1.DefaultCompartmentName, + Budget: v1alpha1.DeploymentBudget{ + Percent: ptr(100), + }, + }, nil) + compartment.AddNode(skyhookNode) + + skyhookNodes := &skyhookNodes{ + skyhook: wrapper.NewSkyhookWrapper(testSkyhook), + nodes: []wrapper.SkyhookNode{skyhookNode}, + compartments: make(map[string]*wrapper.Compartment), + } + skyhookNodes.AddCompartment(v1alpha1.DefaultCompartmentName, compartment) + + // Call IntrospectSkyhook which calls IntrospectNode + changed := IntrospectSkyhook(skyhookNodes, []SkyhookNodes{skyhookNodes}) + + // Verify the node status changed from Unknown to Waiting + Expect(changed).To(BeTrue()) + Expect(skyhookNode.Status()).To(Equal(v1alpha1.StatusWaiting)) + }) + + It("should keep Unknown nodes Unknown when no compartments exist", func() { + skyhookNode, err := wrapper.NewSkyhookNode(testNode, testSkyhook) + Expect(err).NotTo(HaveOccurred()) + skyhookNode.SetStatus(v1alpha1.StatusUnknown) + + skyhookNodes := &skyhookNodes{ + skyhook: wrapper.NewSkyhookWrapper(testSkyhook), + nodes: []wrapper.SkyhookNode{skyhookNode}, + compartments: make(map[string]*wrapper.Compartment), // No compartments + } + + // Call IntrospectSkyhook which calls IntrospectNode + _ = IntrospectSkyhook(skyhookNodes, []SkyhookNodes{skyhookNodes}) + + // Verify the node status stays Unknown (error state - no compartments) + // Note: IntrospectSkyhook might return true due to UpdateCondition, but the important + // thing is that the node status remains Unknown when no compartments exist + Expect(skyhookNode.Status()).To(Equal(v1alpha1.StatusUnknown)) + }) + It("should handle multiple nodes correctly when disabled", func() { // Set up the skyhook as disabled testSkyhook.Annotations["skyhook.nvidia.com/disable"] = annotationTrueValue diff --git a/operator/internal/wrapper/compartment.go b/operator/internal/wrapper/compartment.go index 87540ca6..0f8a80df 100644 --- a/operator/internal/wrapper/compartment.go +++ b/operator/internal/wrapper/compartment.go @@ -139,7 +139,9 @@ func (c *Compartment) createNewBatch() []SkyhookNode { } selectedNodes := make([]SkyhookNode, 0) - priority := []v1alpha1.Status{v1alpha1.StatusUnknown, v1alpha1.StatusBlocked, v1alpha1.StatusErroring} + // Prioritize Waiting nodes (nodes waiting for their batch) over Unknown nodes + // Unknown is kept for backwards compatibility and for nodes transitioning states + priority := []v1alpha1.Status{v1alpha1.StatusWaiting, v1alpha1.StatusUnknown, v1alpha1.StatusBlocked, v1alpha1.StatusErroring} for _, status := range priority { for _, node := range c.Nodes {