Commit

fix service churn feature pipeline name (#417)
jshr-w authored and agrawaliti committed Dec 30, 2024
1 parent f0799ea commit d69ce5d
Showing 51 changed files with 1,993 additions and 70 deletions.
8 changes: 8 additions & 0 deletions jobs/competitive-test.yml
@@ -33,6 +33,9 @@ parameters:
- name: run_id
type: string
default: ''
- name: run_id_2
type: string
default: ''
- name: timeout_in_minutes
type: number
default: 60 # default when not specified is 60 minutes
@@ -48,6 +51,9 @@
- name: ssh_key_enabled
type: boolean
default: true
- name: use_secondary_cluster
type: boolean
default: false

jobs:
- job: ${{ parameters.cloud }}
@@ -62,10 +68,12 @@ jobs:
cloud: ${{ parameters.cloud }}
region: ${{ parameters.regions[0] }}
run_id: ${{ parameters.run_id }}
run_id_2: ${{ parameters.run_id_2 }}
test_modules_dir: ${{ parameters.test_modules_dir }}
retry_attempt_count: ${{ parameters.retry_attempt_count }}
credential_type: ${{ parameters.credential_type }}
ssh_key_enabled: ${{ parameters.ssh_key_enabled }}
use_secondary_cluster: ${{ parameters.use_secondary_cluster }}
- template: /steps/provision-resources.yml
parameters:
cloud: ${{ parameters.cloud }}
9 changes: 5 additions & 4 deletions modules/python/clusterloader2/autoscale/autoscale.py
@@ -57,11 +57,11 @@ def collect_clusterloader2(
index = match.group()
if index not in summary:
summary[index] = {
"up": { "failures": 0 },
"up": { "failures": 0 },
"down": { "failures": 0 }
}
else:
continue
continue

failure = testcase["failure"]
if "WaitForRunningPodsUp" in name:
@@ -76,7 +76,7 @@
elif "WaitForNodesDown" in name:
summary[index]["down"]["wait_for_nodes_seconds"] = -1 if failure else testcase["time"]
summary[index]["down"]["failures"] += 1 if failure else 0

content = ""
for index in summary:
for key in summary[index]:
@@ -85,14 +85,15 @@
"wait_for_pods_seconds": summary[index][key]["wait_for_pods_seconds"],
"autoscale_result": "success" if summary[index][key]["failures"] == 0 else "failure"
}
# TODO: Expose optional parameter to include test details
result = {
"timestamp": datetime.now(timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ'),
"autoscale_type": key,
"cpu_per_node": cpu_per_node,
"node_count": node_count,
"pod_count": pod_count,
"data": data,
"raw_data": raw_data,
# "raw_data": raw_data,
"cloud_info": cloud_info,
"run_id": run_id,
"run_url": run_url
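
For orientation, a hedged sketch of the record this function now emits per scale direction, with raw_data dropped from the payload; the values below are invented and only the key names come from the snippet above.

# Illustrative record only; values are made up, keys mirror the diff above.
result = {
    "timestamp": "2024-12-30T00:00:00Z",
    "autoscale_type": "up",            # key of the summary entry: "up" or "down"
    "cpu_per_node": 4,
    "node_count": 10,
    "pod_count": 110,
    "data": {
        "wait_for_pods_seconds": 120,  # -1 when the wait step failed
        "autoscale_result": "success", # "failure" if any failures were counted
    },
    # "raw_data" is intentionally no longer included in the payload
    "cloud_info": {"cloud": "azure"},  # assumed structure, not from the diff
    "run_id": "<pipeline run id>",
    "run_url": "<pipeline run url>",
}
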
68 changes: 60 additions & 8 deletions modules/python/clusterloader2/kubernetes_client.py
@@ -1,6 +1,6 @@
# TODO: Move this file to a separate folder called 'clients'
from kubernetes import client, config


# https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/#taint-based-evictions
# https://kubernetes.io/docs/reference/labels-annotations-taints/
builtin_taints_keys = [
@@ -20,22 +20,27 @@ class KubernetesClient:
def __init__(self, kubeconfig=None):
config.load_kube_config(kubeconfig)
self.api = client.CoreV1Api()
self.app = client.AppsV1Api()
self.storage = client.StorageV1Api()

def get_app_client(self):
return self.app

def describe_node(self, node_name):
return self.api.read_node(node_name)

def get_nodes(self, label_selector=None, field_selector=None):
return self.api.list_node(label_selector=label_selector, field_selector=field_selector).items
def get_ready_nodes(self):

def get_ready_nodes(self, label_selector=None, field_selector=None):
"""
Get a list of nodes that are ready to be scheduled. Should apply all those conditions:
- 'Ready' condition status is True
- 'NetworkUnavailable' condition status is not present or is False
- Spec unschedulable is False
- Spec taints do not have any of the builtin taints keys with effect 'NoSchedule' or 'NoExecute'
"""
nodes = self.get_nodes()
nodes = self.get_nodes(label_selector=label_selector, field_selector=field_selector)
return [
node for node in nodes
if self._is_node_schedulable(node) and self._is_node_untainted(node)
@@ -50,16 +55,63 @@ def _is_node_schedulable(self, node):
)
if not is_schedulable:
print(f"Node NOT Ready: '{node.metadata.name}' is not schedulable. status_conditions: {status_conditions}. unschedulable: {node.spec.unschedulable}")

return is_schedulable

def _is_node_untainted(self, node):
if not node.spec.taints:
return True

for taint in node.spec.taints:
if taint.key in builtin_taints_keys and taint.effect in ("NoSchedule", "NoExecute"):
print(f"Node NOT Ready: '{node.metadata.name}' has taint '{taint.key}' with effect '{taint.effect}'")
return False

return True
return True

def get_pods_by_namespace(self, namespace, label_selector=None, field_selector=None):
return self.api.list_namespaced_pod(namespace=namespace, label_selector=label_selector, field_selector=field_selector).items

def get_running_pods_by_namespace(self, namespace=None, label_selector=None, field_selector=None):
pods = self.get_pods_by_namespace(namespace=namespace, label_selector=label_selector, field_selector=field_selector)
return [pod for pod in pods if pod.status.phase == "Running"]

def get_persistent_volume_claims_by_namespace(self, namespace):
return self.api.list_namespaced_persistent_volume_claim(namespace=namespace).items

def get_bound_persistent_volume_claims_by_namespace(self, namespace):
claims = self.get_persistent_volume_claims_by_namespace(namespace=namespace)
return [claim for claim in claims if claim.status.phase == "Bound"]

def delete_persistent_volume_claim_by_namespace(self, namespace):
pvcs = self.get_persistent_volume_claims_by_namespace(namespace=namespace)
for pvc in pvcs:
try:
self.api.delete_namespaced_persistent_volume_claim(pvc.metadata.name, namespace, body=client.V1DeleteOptions())
except client.rest.ApiException as e:
print(f"Error deleting PVC '{pvc.metadata.name}': {e}")

def get_volume_attachments(self):
return self.storage.list_volume_attachment().items

def get_attached_volume_attachments(self):
volume_attachments = self.get_volume_attachments()
return [attachment for attachment in volume_attachments if attachment.status.attached]

def create_namespace(self, namespace):
"""
Returns the namespace object if it exists, otherwise creates it.
"""
try:
namespace = self.api.read_namespace(namespace)
print(f"Namespace '{namespace.metadata.name}' already exists.")
return namespace
except client.rest.ApiException as e:
if e.status == 404:
body = client.V1Namespace(metadata=client.V1ObjectMeta(name=namespace))
return self.api.create_namespace(body)
else:
raise e

def delete_namespace(self, namespace):
return self.api.delete_namespace(namespace)
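
A minimal usage sketch of the extended client, assuming a reachable cluster; the import path and selector values are illustrative, but the method names are the ones added in this diff.

# Sketch only: exercises the selector-aware and storage helpers added above.
from clusterloader2.kubernetes_client import KubernetesClient  # assumed import path

client = KubernetesClient(kubeconfig="~/.kube/config")

# Ready nodes can now be narrowed with label/field selectors.
ready = client.get_ready_nodes(label_selector="slo=true")
print(f"{len(ready)} schedulable, untainted nodes")

# Namespace helper returns the existing namespace or creates it on a 404.
client.create_namespace("perf-test")

# Running pods, bound PVCs, and attached VolumeAttachments.
running = client.get_running_pods_by_namespace(namespace="perf-test")
bound = client.get_bound_persistent_volume_claims_by_namespace("perf-test")
attached = client.get_attached_volume_attachments()
print(len(running), len(bound), len(attached))

# Cleanup: delete all PVCs in the namespace, then the namespace itself.
client.delete_persistent_volume_claim_by_namespace("perf-test")
client.delete_namespace("perf-test")
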
33 changes: 27 additions & 6 deletions modules/python/clusterloader2/slo/config/deployment_template.yaml
@@ -3,6 +3,13 @@

{{$Image := DefaultParam .Image "mcr.microsoft.com/oss/kubernetes/pause:3.6"}}

{{$EnableNetworkPolicyEnforcementLatencyTest := DefaultParam .EnableNetworkPolicyEnforcementLatencyTest false}}
{{$TargetLabelValue := DefaultParam .TargetLabelValue "enforcement-latency"}}
# Run a server pod for network policy enforcement latency test only on every Nth pod.
# Default every third pod.
{{$NetPolServerOnEveryNthPod := 3}}
{{$RunNetPolicyTest := and $EnableNetworkPolicyEnforcementLatencyTest (eq (Mod .Index $NetPolServerOnEveryNthPod) 0)}}

apiVersion: apps/v1
kind: Deployment
metadata:
@@ -16,7 +23,7 @@ spec:
replicas: {{.Replicas}}
selector:
matchLabels:
name: {{.Name}}
name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}}
strategy:
type: RollingUpdate
rollingUpdate:
@@ -25,29 +32,43 @@ spec:
template:
metadata:
labels:
name: {{.Name}}
name: {{if $RunNetPolicyTest}}policy-load-{{end}}{{.Name}}
group: {{.Group}}
{{if .SvcName}}
svc: {{.SvcName}}-{{.Index}}
{{end}}
restart: {{.deploymentLabel}}
{{if $RunNetPolicyTest}}
net-pol-test: {{$TargetLabelValue}}
{{end}}
spec:
nodeSelector:
slo: "true"
{{if $RunNetPolicyTest}}
hostNetwork: false
containers:
- image: nginx
name: nginx-server
ports:
- containerPort: 80
resources:
requests:
cpu: {{$CpuRequest}}
memory: {{$MemoryRequest}}
{{else}}
containers:
- env:
- name: ENV_VAR
value: a
image: {{$Image}}
imagePullPolicy: IfNotPresent
name: {{.Name}}
ports:
ports: []
resources:
requests:
cpu: {{$CpuRequest}}
memory: {{$MemoryRequest}}
# Add not-ready/unreachable tolerations for 15 minutes so that node
# failure doesn't trigger pod deletion.
{{end}}
tolerations:
- key: "node.kubernetes.io/not-ready"
operator: "Exists"
@@ -60,4 +81,4 @@ spec:
- key: "slo"
operator: "Equal"
value: "true"
effect: "NoSchedule"
effect: "NoSchedule"
63 changes: 58 additions & 5 deletions modules/python/clusterloader2/slo/config/load-config.yaml
@@ -2,6 +2,7 @@ name: load-config

# Config options for test type
{{$SERVICE_TEST := DefaultParam .CL2_SERVICE_TEST true}}
{{$NETWORK_TEST := DefaultParam .CL2_NETWORK_TEST false}}

# Config options for test parameters
{{$nodesPerNamespace := DefaultParam .CL2_NODES_PER_NAMESPACE 100}}
@@ -12,12 +13,12 @@ name: load-config
{{$groupName := DefaultParam .CL2_GROUP_NAME "service-discovery"}}

# TODO(jshr-w): This should eventually use >1 namespace.
{{$namespaces := 1}}
{{$namespaces := DefaultParam .CL2_NO_OF_NAMESPACES 1}}
{{$nodes := DefaultParam .CL2_NODES 1000}}

{{$deploymentQPS := DivideFloat $loadTestThroughput $deploymentSize}}
{{$operationTimeout := DefaultParam .CL2_OPERATION_TIMEOUT "15m"}}
{{$totalPods := MultiplyInt $namespaces $nodes $podsPerNode}}
{{$totalPods := MultiplyInt $namespaces $nodesPerNamespace $podsPerNode}}
{{$podsPerNamespace := DivideInt $totalPods $namespaces}}
{{$deploymentsPerNamespace := DivideInt $podsPerNamespace $deploymentSize}}

@@ -29,9 +30,9 @@

# Service test
{{$BIG_GROUP_SIZE := DefaultParam .BIG_GROUP_SIZE 4000}}
{{$SMALL_GROUP_SIZE := DefaultParam .SMALL_GROUP_SIZE 20}}
{{$SMALL_GROUP_SIZE := DefaultParam .CL2_DEPLOYMENT_SIZE 20}}
{{$bigDeploymentsPerNamespace := DefaultParam .bigDeploymentsPerNamespace 1}}
{{$smallDeploymentPods := SubtractInt $podsPerNamespace (MultiplyInt $bigDeploymentsPerNamespace $BIG_GROUP_SIZE)}}
{{$smallDeploymentPods := DivideInt $totalPods $namespaces}}
{{$smallDeploymentsPerNamespace := DivideInt $smallDeploymentPods $SMALL_GROUP_SIZE}}

namespace:
@@ -53,7 +54,7 @@ tuningSets:
qps: {{$deploymentQPS}}

steps:
- name: Log - namespaces={{$namespaces}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}}
- name: Log - namespaces={{$namespaces}}, nodes={{$nodes}}, nodesPerNamespace={{$nodesPerNamespace}}, podsPerNode={{$podsPerNode}}, totalPods={{$totalPods}}, podsPerNamespace={{$podsPerNamespace}}, deploymentsPerNamespace={{$deploymentsPerNamespace}}, deploymentSize={{$deploymentSize}}, deploymentQPS={{$deploymentQPS}}
measurements:
- Identifier: Dummy
Method: Sleep
@@ -74,6 +75,13 @@ steps:
action: start
{{end}}

{{if $NETWORK_TEST}}
- module:
path: /modules/network-policy/net-policy-metrics.yaml
params:
action: start
{{end}}

{{range $i := Loop $repeats}}
{{if $SERVICE_TEST}}
- module:
@@ -85,6 +93,15 @@
bigServicesPerNamespace: {{$bigDeploymentsPerNamespace}}
{{end}}

{{if $NETWORK_TEST}}
- module:
path: modules/network-policy/net-policy-enforcement-latency.yaml
params:
setup: true
run: true
testType: "pod-creation"
{{end}}

- module:
path: /modules/reconcile-objects.yaml
params:
@@ -101,6 +118,27 @@ steps:
Group: {{$groupName}}
deploymentLabel: start

{{if $NETWORK_TEST}}
- module:
path: modules/network-policy/net-policy-metrics.yaml
params:
action: gather
usePolicyCreationMetrics: true
usePodCreationMetrics: true

- module:
path: modules/network-policy/net-policy-enforcement-latency.yaml
params:
complete: true
testType: "pod-creation"

- module:
path: modules/network-policy/net-policy-enforcement-latency.yaml
params:
run: true
testType: "policy-creation"
{{end}}

- module:
path: /modules/reconcile-objects.yaml
params:
@@ -152,3 +190,18 @@
params:
action: gather
group: {{$groupName}}

{{if $NETWORK_TEST}}
- module:
path: modules/network-policy/net-policy-metrics.yaml
params:
action: gather
usePolicyCreationMetrics: true
usePodCreationMetrics: true

- module:
path: modules/network-policy/net-policy-enforcement-latency.yaml
params:
complete: true
testType: "policy-creation"
{{end}}
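
The sizing change above swaps $nodes for $nodesPerNamespace in the pod-count formula and makes the small-deployment count follow the total pods per namespace; a worked example with the defaults visible in this diff (pods_per_node is assumed for illustration, not taken from the config).

# Sketch of the template arithmetic after this change.
namespaces = 1             # {{$namespaces}} default (CL2_NO_OF_NAMESPACES)
nodes_per_namespace = 100  # {{$nodesPerNamespace}} default
pods_per_node = 30         # assumed value for illustration
small_group_size = 20      # {{$SMALL_GROUP_SIZE}} default (CL2_DEPLOYMENT_SIZE)

# was: namespaces * nodes * pods_per_node
total_pods = namespaces * nodes_per_namespace * pods_per_node
pods_per_namespace = total_pods // namespaces
# was: pods_per_namespace - big_deployments_per_namespace * BIG_GROUP_SIZE
small_deployment_pods = total_pods // namespaces
small_deployments_per_namespace = small_deployment_pods // small_group_size

print(total_pods, pods_per_namespace, small_deployments_per_namespace)  # 3000 3000 150
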