Skip to content

Commit

Permalink
Enhance node ready logic at KubernetesClient , increase timeout to 30…
Browse files Browse the repository at this point in the history
…min and add unit tests (#411)

Add the same checks done by the clusterloader2 to check if node is ready
to be scheduled:
https://github.com/Azure/perf-tests/blob/395a79947d2c98aa8376c204071ef865e552a292/clusterloader2/pkg/util/cluster.go#L92-L138

---------

Co-authored-by: Rafael Mendes Pereira <[email protected]>
  • Loading branch information
rafael-mendes-pereira and Rafael Mendes Pereira authored Nov 26, 2024
1 parent 0a68ec1 commit e7867fd
Show file tree
Hide file tree
Showing 4 changed files with 114 additions and 1 deletion.
4 changes: 4 additions & 0 deletions .github/workflows/python-validation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@ jobs:
- name: Python Version Check
run: python --version

- name: Install Python Dependencies
working-directory: ${{ env.PYTHON_MODULES_DIR }}
run: pip install -r requirements.txt

- name: Run Python Unit Tests
working-directory: ${{ env.PYTHON_MODULES_DIR }}
run: python -m unittest discover
51 changes: 50 additions & 1 deletion modules/python/clusterloader2/kubernetes_client.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
from kubernetes import client, config


# https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#taint-based-evictions
# https://kubernetes.io/docs/reference/labels-annotations-taints/
builtin_taints_keys = [
"node.kubernetes.io/not-ready",
"node.kubernetes.io/unreachable",
"node.kubernetes.io/pid-pressure",
"node.kubernetes.io/out-of-disk",
"node.kubernetes.io/memory-pressure",
"node.kubernetes.io/disk-pressure",
"node.kubernetes.io/network-unavailable",
"node.kubernetes.io/unschedulable",
"node.cloudprovider.kubernetes.io/uninitialized",
"node.cloudprovider.kubernetes.io/shutdown",
]

class KubernetesClient:
def __init__(self, kubeconfig=None):
config.load_kube_config(kubeconfig)
Expand All @@ -12,5 +28,38 @@ def get_nodes(self, label_selector=None, field_selector=None):
return self.api.list_node(label_selector=label_selector, field_selector=field_selector).items

def get_ready_nodes(self):
"""
Get a list of nodes that are ready to be scheduled. Should apply all those conditions:
- 'Ready' condition status is True
- 'NetworkUnavailable' condition status is not present or is False
- Spec unschedulable is False
- Spec taints do not have any of the builtin taints keys with effect 'NoSchedule' or 'NoExecute'
"""
nodes = self.get_nodes()
return [node for node in nodes for condition in node.status.conditions if condition.type == "Ready" and condition.status == "True"]
return [
node for node in nodes
if self._is_node_schedulable(node) and self._is_node_untainted(node)
]

def _is_node_schedulable(self, node):
status_conditions = {cond.type: cond.status for cond in node.status.conditions}
is_schedulable = (
status_conditions.get("Ready") == "True"
and status_conditions.get("NetworkUnavailable") != "True"
and node.spec.unschedulable is not True
)
if not is_schedulable:
print(f"Node NOT Ready: '{node.metadata.name}' is not schedulable. status_conditions: {status_conditions}. unschedulable: {node.spec.unschedulable}")

return is_schedulable

def _is_node_untainted(self, node):
if not node.spec.taints:
return True

for taint in node.spec.taints:
if taint.key in builtin_taints_keys and taint.effect in ("NoSchedule", "NoExecute"):
print(f"Node NOT Ready: '{node.metadata.name}' has taint '{taint.key}' with effect '{taint.effect}'")
return False

return True
59 changes: 59 additions & 0 deletions modules/python/tests/test_kubernetes_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import unittest
from unittest.mock import patch
from kubernetes.client.models import (
V1Node, V1NodeStatus, V1NodeCondition, V1NodeSpec, V1ObjectMeta, V1Taint
)
from clusterloader2.kubernetes_client import KubernetesClient

class TestKubernetesClient(unittest.TestCase):

@patch('kubernetes.config.load_kube_config')
def setUp(self, mock_load_kube_config):
self.client = KubernetesClient()
return super().setUp()

def _create_node(self, name, ready_status, network_unavailable_status=None, unschedulable=False, taints=None):
conditions = [V1NodeCondition(type="Ready", status=ready_status)]
if network_unavailable_status is not None:
conditions.append(V1NodeCondition(type="NetworkUnavailable", status=network_unavailable_status))
return V1Node(
metadata=V1ObjectMeta(name=name),
status=V1NodeStatus(conditions=conditions),
spec=V1NodeSpec(unschedulable=unschedulable, taints=taints)
)

@patch('clusterloader2.kubernetes_client.KubernetesClient.get_nodes')
def test_get_ready_nodes_with_network_unavailable(self, mock_get_nodes):
# Mock nodes
# Nodes ready to be scheduled
node_ready_network_available = self._create_node(name="node_ready_network_available", ready_status="True", network_unavailable_status="False")
node_ready_no_network_condition = self._create_node(name="node_ready_no_network_condition", ready_status="True")
node_ready_taint_no_effect = self._create_node(
name="node_ready_taint_no_effect", ready_status="True", taints=[V1Taint(key="node.cloudprovider.kubernetes.io/shutdown", effect="")])
# Nodes NOT ready to be scheduled
node_not_ready = self._create_node(name="node_not_ready", ready_status="False")
node_ready_network_unavailable = self._create_node(name="node_ready_network_unavailable", ready_status="True", network_unavailable_status="True")
node_ready_unschedulable_true = self._create_node(name="node_ready_unschedulable", ready_status="True", unschedulable=True)
node_ready_shutdown_taint = self._create_node(
name="node_ready_shutdown_taint", ready_status="True", taints=[V1Taint(key="node.cloudprovider.kubernetes.io/shutdown", effect="NoSchedule")])


mock_get_nodes.return_value = [
node_not_ready,
node_ready_network_available,
node_ready_network_unavailable,
node_ready_no_network_condition,
node_ready_unschedulable_true,
node_ready_shutdown_taint,
node_ready_taint_no_effect
]

ready_nodes = self.client.get_ready_nodes()

self.maxDiff = None
self.assertCountEqual(ready_nodes,
[node_ready_network_available, node_ready_no_network_condition, node_ready_taint_no_effect]
)

if __name__ == '__main__':
unittest.main()
1 change: 1 addition & 0 deletions steps/topology/service-churn/validate-resources.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ steps:
- template: /steps/engine/clusterloader2/slo/validate.yml
parameters:
desired_nodes: 1006
operation_timeout_in_minutes: 30

0 comments on commit e7867fd

Please sign in to comment.