Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add test of scheduler performance across change in default configuration #422

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# Scheduled-only pipeline: CI triggers are disabled.
trigger: none
schedules:
# Minute 19 of hours 1 and 13. The hours are written as an explicit list
# because "1/12" (a bare start value with a step) is not one of the cron
# forms documented for Azure Pipelines. Cron schedules are evaluated in UTC.
- cron: "19 1,13 * * *"
  displayName: "1:19 AM and PM every day"
  branches:
    include:
    - main
  # Run on every scheduled tick, even when there are no new commits.
  always: true

# Scenario identity shared by every stage of this pipeline.
variables:
  SCENARIO_TYPE: perf-eval
  SCENARIO_NAME: cluster-churn-n1000p50k-sched
  SCENARIO_VERSION: main

# Every stage declares `dependsOn: []`, so no stage waits on another.
stages:
# AWS run of the scenario in us-east-1.
- stage: aws_eastus1_sched_baseline
  dependsOn: []
  jobs:
  - template: /jobs/competitive-test.yml
    parameters:
      cloud: aws
      regions:
      - us-east-1
      engine: clusterloader2
      engine_input:
        image: "ghcr.io/azure/clusterloader2:v20241022"
      topology: slo
      matrix:
        # Single matrix entry: 1000 nodes, with node_per_step set to 100.
        aws_vpc_cni:
          cpu_per_node: 4
          node_count: 1000
          node_per_step: 100
          max_pods: 110
          repeats: 1
          scale_timeout: "30m"
          cl2_config_file: cluster-scale-config.yaml
          # NOTE(review): capitalised `False` kept as written; confirm how
          # the job template consumes this value before normalising it.
          service_test: False
      max_parallel: 1
      timeout_in_minutes: 720
      credential_type: service_connection
      ssh_key_enabled: false
# Azure run in eastus2 using the Scheduler-Upstream-Default-Config
# variable group.
- stage: azure_eastus2_sched_upstream_default
  dependsOn: []
  variables:
  - group: Scheduler-Upstream-Default-Config
  jobs:
  - template: /jobs/competitive-test.yml
    parameters:
      cloud: azure
      regions:
      - eastus2
      engine: clusterloader2
      engine_input:
        image: "ghcr.io/azure/clusterloader2:v20241022"
      topology: slo
      matrix:
        # Same sizing as the other stages in this pipeline.
        azure_sched:
          cpu_per_node: 4
          node_count: 1000
          node_per_step: 100
          max_pods: 110
          repeats: 1
          scale_timeout: "30m"
          cl2_config_file: cluster-scale-config.yaml
          # NOTE(review): capitalised `False` kept as written; confirm how
          # the job template consumes this value before normalising it.
          service_test: False
      max_parallel: 1
      timeout_in_minutes: 720
      credential_type: service_connection
      ssh_key_enabled: false
# Azure run in eastus2 using the Scheduler-Override-Default-Config
# variable group; apart from the stage name and that group, it is
# identical to the upstream-default stage.
- stage: azure_eastus2_sched_override_default
  dependsOn: []
  variables:
  - group: Scheduler-Override-Default-Config
  jobs:
  - template: /jobs/competitive-test.yml
    parameters:
      cloud: azure
      regions:
      - eastus2
      engine: clusterloader2
      engine_input:
        image: "ghcr.io/azure/clusterloader2:v20241022"
      topology: slo
      matrix:
        # Same sizing as the other stages in this pipeline.
        azure_sched:
          cpu_per_node: 4
          node_count: 1000
          node_per_step: 100
          max_pods: 110
          repeats: 1
          scale_timeout: "30m"
          cl2_config_file: cluster-scale-config.yaml
          # NOTE(review): capitalised `False` kept as written; confirm how
          # the job template consumes this value before normalising it.
          service_test: False
      max_parallel: 1
      timeout_in_minutes: 720
      credential_type: service_connection
      ssh_key_enabled: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# Scenario metadata; scenario_type and scenario_name match the pipeline's
# SCENARIO_TYPE and SCENARIO_NAME variables.
scenario_type  = "perf-eval"
scenario_name  = "cluster-churn-n1000p50k-sched"
deletion_delay = "12h"
owner          = "aks"

# One VPC ("slo-vpc") with a primary /16 and one secondary /16.
network_config_list = [
  {
    role                       = "slo"
    vpc_name                   = "slo-vpc"
    vpc_cidr_block             = "10.0.0.0/16"
    secondary_ipv4_cidr_blocks = ["10.1.0.0/16"]

    # slo-subnet-1 spans the whole primary CIDR; slo-subnet-2 and
    # slo-subnet-3 split the secondary CIDR into two /17 halves.
    subnet = [
      {
        name                    = "slo-subnet-1"
        cidr_block              = "10.0.0.0/16"
        zone_suffix             = "a"
        map_public_ip_on_launch = true
      },
      {
        name                    = "slo-subnet-2"
        cidr_block              = "10.1.0.0/17"
        zone_suffix             = "b"
        map_public_ip_on_launch = true
      },
      {
        name                    = "slo-subnet-3"
        cidr_block              = "10.1.128.0/17"
        zone_suffix             = "c"
        map_public_ip_on_launch = true
      }
    ]

    security_group_name = "slo-sg"

    # A single route table for 0.0.0.0/0, associated with all three subnets.
    route_tables = [
      {
        name       = "internet-rt"
        cidr_block = "0.0.0.0/0"
      }
    ]
    route_table_associations = [
      {
        name             = "slo-subnet-rt-assoc-1"
        subnet_name      = "slo-subnet-1"
        route_table_name = "internet-rt"
      },
      {
        name             = "slo-subnet-rt-assoc-2"
        subnet_name      = "slo-subnet-2"
        route_table_name = "internet-rt"
      },
      {
        name             = "slo-subnet-rt-assoc-3"
        subnet_name      = "slo-subnet-3"
        route_table_name = "internet-rt"
      }
    ]

    # No ingress rules; one egress rule to 0.0.0.0/0 with protocol "-1".
    sg_rules = {
      ingress = []
      egress = [
        {
          from_port  = 0
          to_port    = 0
          protocol   = "-1"
          cidr_block = "0.0.0.0/0"
        }
      ]
    }
  }
]

# Single EKS cluster ("slo") placed in slo-vpc, with Karpenter enabled.
eks_config_list = [
  {
    role             = "slo"
    eks_name         = "slo"
    enable_karpenter = true
    vpc_name         = "slo-vpc"
    policy_arns = [
      "AmazonEKSClusterPolicy",
      "AmazonEKSVPCResourceController",
      "AmazonEKSWorkerNodePolicy",
      "AmazonEKS_CNI_Policy",
      "AmazonEC2ContainerRegistryReadOnly"
    ]

    # Two fixed-size managed node groups: min, max and desired sizes are
    # equal in each.
    eks_managed_node_groups = [
      {
        name           = "default"
        ami_type       = "AL2_x86_64"
        instance_types = ["m4.4xlarge"]
        min_size       = 5
        max_size       = 5
        desired_size   = 5
        capacity_type  = "ON_DEMAND"
      },
      {
        # Single node labelled prometheus=true.
        name           = "prompool"
        ami_type       = "AL2_x86_64"
        instance_types = ["m4.16xlarge"]
        min_size       = 1
        max_size       = 1
        desired_size   = 1
        capacity_type  = "ON_DEMAND"
        labels         = { "prometheus" = "true" }
      }
    ]

    eks_addons         = []
    kubernetes_version = "1.30"
  }
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
# Scenario metadata; scenario_type and scenario_name match the pipeline's
# SCENARIO_TYPE and SCENARIO_NAME variables.
scenario_type  = "perf-eval"
scenario_name  = "cluster-churn-n1000p50k-sched"
deletion_delay = "12h"
owner          = "aks"

# One VNet ("slo-vnet", 10.0.0.0/9) containing a single /16 subnet.
# No network security group name, public-IP associations or NSG rules are set.
network_config_list = [
  {
    role               = "slo"
    vnet_name          = "slo-vnet"
    vnet_address_space = "10.0.0.0/9"
    subnet = [
      {
        name           = "slo-subnet-1"
        address_prefix = "10.0.0.0/16"
      }
    ]
    network_security_group_name = ""
    nic_public_ip_associations  = []
    nsr_rules                   = []
  }
]

# Single AKS cluster ("slo") attached to slo-subnet-1.
aks_config_list = [
  {
    role        = "slo"
    aks_name    = "slo"
    dns_prefix  = "slo"
    subnet_name = "slo-subnet-1"
    sku_tier    = "Standard"

    # Azure CNI in overlay mode; the pod CIDR (10.128.0.0/9) does not
    # overlap the VNet address space (10.0.0.0/9).
    network_profile = {
      network_plugin      = "azure"
      network_plugin_mode = "overlay"
      pod_cidr            = "10.128.0.0/9"
      service_cidr        = "192.168.0.0/16"
      dns_service_ip      = "192.168.0.10"
    }

    # Fixed-size default pool (autoscaling off).
    default_node_pool = {
      name                         = "default"
      node_count                   = 5
      auto_scaling_enabled         = false
      vm_size                      = "Standard_D8_v3"
      os_disk_type                 = "Managed"
      only_critical_addons_enabled = false
      temporary_name_for_rotation  = "defaulttmp"
    }

    extra_node_pool = [
      {
        # Single fixed node labelled prometheus=true.
        name                 = "prompool"
        node_count           = 1
        auto_scaling_enabled = false
        vm_size              = "Standard_D64_v3"
        max_pods             = 110
        node_labels          = { "prometheus" = "true" }
      },
      # Two autoscaled user pools, each 0-500 nodes (1000 combined), both
      # tainted slo=true:NoSchedule and labelled slo=true.
      {
        name                 = "userpool0"
        node_count           = 0
        min_count            = 0
        max_count            = 500
        auto_scaling_enabled = true
        vm_size              = "Standard_D4_v3"
        max_pods             = 110
        node_taints          = ["slo=true:NoSchedule"]
        node_labels          = { "slo" = "true" }
      },
      {
        name                 = "userpool1"
        node_count           = 0
        min_count            = 0
        max_count            = 500
        auto_scaling_enabled = true
        vm_size              = "Standard_D4_v3"
        max_pods             = 110
        node_taints          = ["slo=true:NoSchedule"]
        node_labels          = { "slo" = "true" }
      }
    ]

    kubernetes_version = "1.30"
  }
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
  "run_id": "123456789",
  "region": "us-east-1"
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
  "run_id": "123456789",
  "region": "eastus"
}
Loading