diff --git a/.github/workflows/bench-benchmark.yml b/.github/workflows/bench-benchmark.yml new file mode 100644 index 000000000..db3c7b7fe --- /dev/null +++ b/.github/workflows/bench-benchmark.yml @@ -0,0 +1,144 @@ +name: "Benchmark with Bench client" + +on: + push: + branches: + - main + release: + types: [published, prereleased] + workflow_dispatch: + inputs: + destroy-on-session-end: + description: "Whether to destroy infrastructure right after the bench job has ended" + type: boolean + required: false + default: true + prefix: + description: "Prefix for the infrastructure. The bucket associated with this prefix must be created before." + required: false + default: "ci-bench" + +jobs: + define-matrix: + name: Define matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.types.outputs.matrix }} + steps: + - id: types + name: Define matrix + env: + TRIGGER: ${{ github.event_name }} + REF_NAME: ${{ github.ref_name }} + run: | + set -ex + if [ "$TRIGGER" == 'push' ]; then + echo '{"include":[{"type": "localhost", "ntasks":3000, "polling-limit": 300}]}' > matrix.json + echo "FILE_PREFIX=$REF_NAME" >> $GITHUB_ENV + elif [ "$TRIGGER" == 'release' ]; then + echo '{"include":[{"type": "localhost", "ntasks":3000, "polling-limit": 300}, {"type": "aws", "ntasks":1200000, "polling-limit": 1000, "parameters-file-path": "benchmarking/aws/parameters.tfvars"}]}' > matrix.json + echo "FILE_PREFIX=release/$REF_NAME" >> $GITHUB_ENV + elif [ "$TRIGGER" == 'workflow_dispatch' ]; then + echo '{"include":[{"type": "aws", "ntasks":1200000, "polling-limit": 1000, "parameters-file-path": "benchmarking/aws/parameters.tfvars"}]}' > matrix.json + echo "FILE_PREFIX=manual/$REF_NAME" >> $GITHUB_ENV + fi + echo "matrix=$(cat matrix.json)" >> "$GITHUB_OUTPUT" + + benchmark: + name: ${{ matrix.type }} + runs-on: ubuntu-latest + needs: define-matrix + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.define-matrix.outputs.matrix) }} + env: + prefix: ${{ inputs.prefix || 'ci-bench' 
}} + parameters-file-path: ${{ matrix.parameters-file-path }} + outputs: + terraform-output: ${{ steps.deploy.outputs.terraform-output }} + armonik-endpoint: ${{ steps.get-armonik-endpoint.outputs.endpoint }} + steps: + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + + - name: Install Dependencies + uses: aneoconsulting/ArmoniK.Action.Deploy/dependencies@main + with: + terraform: true + k3s: true + docker: true + aws: true + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: eu-west-3 + + - name: Get Core version + run: | + set -ex + echo "core-version=$(cat versions.tfvars.json | jq -r '.armonik_versions.core')" >> $GITHUB_ENV + + - id: deploy + name: "Deploy ArmoniK" + uses: aneoconsulting/ArmoniK.Action.Deploy/deploy@main + with: + type: ${{ matrix.type }} + prefix: ${{ env.prefix }} + core-version: ${{ env.core-version }} + parameters-file-path: ${{ env.parameters-file-path }} + + - id: get-armonik-endpoint + name: "Get ArmoniK's control plane endpoint" + env: + TYPE: ${{ matrix.type }} + run: | + set -ex + grpc_endpoint=$(cat "infrastructure/quick-deploy/$TYPE/generated/armonik-output.json" | jq -r '.armonik.control_plane_url' | sed -r 's/(http:\/\/)([^:]*)(:.*)/\2/') + echo "grpc-endpoint=$grpc_endpoint" >> "$GITHUB_OUTPUT" + sleep 60 + + - id: bench + name: Run Bench + uses: aneoconsulting/ArmoniK.Action.Deploy/bench@main + with: + type: ${{ matrix.type }} + armonik-core-version: ${{ env.core-version }} + ntasks: ${{ matrix.ntasks }} + session-name: bench + grpc-client-endpoint: ${{ steps.get-armonik-endpoint.outputs.grpc-endpoint }} + timeout: 1200 + + - id: get-bench-stats + name: Get Bench Stats + uses: aneoconsulting/ArmoniK.Action.Deploy/get-throughput@main + with: + grpc-client-endpoint: ${{ steps.get-armonik-endpoint.outputs.grpc-endpoint }} + session-name: ${{ steps.bench.outputs.session-name }} + poll-duration-limit: ${{ 
matrix.polling-limit }} + + - name: Upload benchmark results to artifact registry + uses: actions/upload-artifact@v4 + with: + name: benchclient_benchmark_${{ github.event_name }}_${{ matrix.type }}_${{ github.run_id }} + path: ${{ steps.get-bench-stats.outputs.bench-file-path }} + + - name: Upload benchmark results to s3 + env: + EVENT_NAME: ${{ github.event_name }} + BENCH_RESULTS_PATH: ${{ steps.get-bench-stats.outputs.bench-file-path }} + TYPE: ${{ matrix.type }} + GHRUNID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_EC2_METADATA_DISABLED: true + run: | + DATE=$(date +"%Y-%m-%d") + aws s3 cp "$BENCH_RESULTS_PATH" "s3://armonik-bench-storage/${FILE_PREFIX}/${GHRUNID}_${DATE}/benchclient_benchmark_${EVENT_NAME}_${TYPE}.json" + + - if: ${{ (github.event_name == 'workflow_dispatch' && inputs.destroy-on-session-end) || (github.event_name != 'workflow_dispatch' && always()) }} + id: destroy + name: Destroy deployment + uses: aneoconsulting/ArmoniK.Action.Deploy/destroy@main + with: + type: ${{ matrix.type }} + prefix: ${{ env.prefix }} + parameters-file-path: ${{ env.parameters-file-path }} \ No newline at end of file diff --git a/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars new file mode 100644 index 000000000..143a06817 --- /dev/null +++ b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars @@ -0,0 +1,408 @@ +# Tags +tags = { + "name" = "bench" + "origin" = "terraform" + "csp" = "aws" + "Terraform" = "true" +} + +vpc = { + enable_private_subnet = false +} + +# AWS EKS +eks = { + cluster_version = "1.25" + node_selector = { service = "monitoring" } + cluster_endpoint_public_access = true + map_roles = [] + map_users = [] +} + +eks_managed_node_groups = { + workers = { + name = "workers" + launch_template_description = 
"Node group for ArmoniK Compute-plane pods" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" # "SPOT" + min_size = 8 + desired_size = 8 + max_size = 8 + labels = { + service = "workers" + "node.kubernetes.io/lifecycle" = "ondemand" # "spot" + } + taints = { + dedicated = { + key = "service" + value = "workers" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + metrics = { + name = "metrics" + launch_template_description = "Node group for metrics: Metrics exporter and Prometheus" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "metrics" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "metrics" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for ArmoniK control-plane: control-plane and Ingress + control_plane = { + name = "control-plane" + launch_template_description = "Node group for ArmoniK Control-plane and Ingress" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 2 + labels = { + service = "control-plane" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "control-plane" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for monitoring: metrics server, keda, seq, grafana, cluster-autoscaler, coreDNS, termination handler + monitoring = { + 
name = "monitoring" + launch_template_description = "Node group for monitoring" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "monitoring" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "monitoring" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for data-plane + # state_database, inner_storage, task_queue + state_database = { + name = "mongodb" + launch_template_description = "Node group for MongoDB" + ami_type = "AL2_x86_64" + instance_types = ["c5a.8xlarge"] + use_custom_launch_template = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 75 + volume_type = "gp3" + iops = 5000 + throughput = 1000 + encrypted = null + kms_key_id = null + delete_on_termination = true + } + } + } + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 1 + labels = { + service = "state-database" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "state-database" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +self_managed_node_groups = { + others = { + name = "others" + launch_template_description = "Node group for others" + instance_type = "c5.large" + min_size = 0 + desired_size = 0 + max_size = 5 + force_delete = true + force_delete_warm_pool = true + instance_market_options = { + market_type = "spot" + } + bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'" + iam_role_use_name_prefix = false + iam_role_additional_policies = 
{ + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + others_mixed = { + name = "others-mixed" + launch_template_description = "Mixed On demand and SPOT instances for other pods" + min_size = 1 + desired_size = 1 + max_size = 5 + use_mixed_instances_policy = true + mixed_instances_policy = { + on_demand_allocation_strategy = "lowest-price" + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = 20 # 20% On-Demand Instances, 80% Spot Instances + spot_allocation_strategy = "price-capacity-optimized" + spot_instance_pools = null + spot_max_price = null + } + override = [ + { + instance_type = "c5.4xlarge" + weighted_capacity = "1" + }, + { + instance_type = "c5.2xlarge" + weighted_capacity = "2" + }, + ] + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +# List of fargate profiles +fargate_profiles = {} + +metrics_server = { + node_selector = { service = "monitoring" } +} + +keda = { + node_selector = { service = "monitoring" } +} + +# Object storage +# Uncomment either the `elasticache` or the `s3_os` parameter +elasticache = { + engine = "redis" + engine_version = "6.x" + node_type = "cache.r4.large" + num_cache_clusters = 1 +} + +#s3_os = {} + +mq = { + engine_type = "ActiveMQ" + engine_version = "5.17.6" + host_instance_type = "mq.m5.xlarge" +} + +mongodb = { + node_selector = { service = "state-database" } + replicas = 2 + mongodb_resources = { + limits = { + "cpu" = "30" + "memory" = "60Gi" + "ephemeral-storage" = "20Gi" + } + requests = { + "cpu" = "14" + "memory" = "29Gi" + "ephemeral-storage" = "4Gi" + } + } +} + +seq = { + node_selector = { service = "monitoring" } +} + +grafana = { + node_selector = { service = "monitoring" } +} + +node_exporter = { + node_selector = {} +} + +windows_exporter = { + node_selector = { + "plateform" = "windows" + } +} + +prometheus = { + 
node_selector = { service = "metrics" } +} + +metrics_exporter = { + node_selector = { service = "metrics" } +} + + +fluent_bit = { + is_daemonset = true + node_selector = {} +} + +logging_level = "Information" + +control_plane = { + limits = { + cpu = "2000m" + memory = "4096Mi" + } + requests = { + cpu = "1000m" + memory = "2048Mi" + } + default_partition = "default" + replicas = 12 + node_selector = { service = "control-plane" } +} + +admin_gui = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "100m" + memory = "128Mi" + } + node_selector = { service = "monitoring" } +} + +compute_plane = { + bench = { + node_selector = { service = "workers" } + replicas = 120 + polling_agent = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "256Mi" + } + } + worker = [ + { + image = "dockerhubaneo/armonik_core_bench_test_worker" + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "512Mi" + } + } + ] + }, +} + +ingress = { + tls = false + mtls = false + generate_client_cert = false + node_selector = { service = "control-plane" } +} + +# Job to insert partitions in the database +job_partitions_in_database = { + node_selector = { service = "control-plane" } +} + +# Authentication behavior +authentication = { + node_selector = { service = "control-plane" } +} + +configurations = { + core = { + env = { + Amqp__AllowHostMismatch = false + Amqp__MaxPriority = "10" + Amqp__MaxRetries = "5" + Amqp__QueueStorage__LockRefreshPeriodicity = "00:00:45" + Amqp__QueueStorage__PollPeriodicity = "00:00:10" + Amqp__QueueStorage__LockRefreshExtension = "00:02:00" + MongoDB__TableStorage__PollingDelayMin = "00:00:01" + MongoDB__TableStorage__PollingDelayMax = "00:00:10" + MongoDB__TableStorage__PollingDelay = "00:00:01" + MongoDB__DataRetention = "1.00:00:00" # 1 day retention + MongoDB__AllowInsecureTls = true + Redis__Timeout = 3000 + Redis__SslHost = "" + Redis__TtlTimeSpan = 
"1.00:00:00" # 1 day retention + Submitter__DeletePayload = true + } + } + control = { + env = { + Submitter__MaxErrorAllowed = 50 + } + } + jobs = { env = { MongoDB__DataRetention = "1.00:00:00" } } +} + +environment_description = { + name = "aws-dev" + version = "0.0.0" + description = "AWS environment" + color = "#80ff80" +} + +upload_images = false diff --git a/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json new file mode 100644 index 000000000..90cb085d7 --- /dev/null +++ b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json @@ -0,0 +1,92 @@ +{ + "armonik_versions": { + "armonik": "2.21.0", + "infra": "0.7.0", + "infra_plugins": "0.1.1", + "core": "0.29.1", + "api": "3.21.0", + "gui": "0.13.3", + "extcsharp": "0.12.11", + "samples": "2.21.0" + }, + "armonik_images": { + "armonik": [ + ], + "infra": [ + "https://github.com/aneoconsulting/ArmoniK.Infra.git" + ], + "infra_plugins": [ + "dockerhubaneo/armonik_pdc_update" + ], + "core": [ + "dockerhubaneo/armonik_pollingagent", + "dockerhubaneo/armonik_control_metrics", + "dockerhubaneo/armonik_control_partition_metrics", + "dockerhubaneo/armonik_control", + "dockerhubaneo/armonik_core_stream_test_worker", + "dockerhubaneo/armonik_core_stream_test_client", + "dockerhubaneo/armonik_core_htcmock_test_worker", + "dockerhubaneo/armonik_core_htcmock_test_client", + "dockerhubaneo/armonik_core_bench_test_worker", + "dockerhubaneo/armonik_core_bench_test_client" + ], + "api": [ + ], + "gui": [ + "dockerhubaneo/armonik_admin_app", + "dockerhubaneo/armonik_admin_api" + ], + "extcsharp": [ + "dockerhubaneo/armonik_worker_dll" + ], + "samples": [ + "dockerhubaneo/armonik_demo_helloworld_worker", + "dockerhubaneo/armonik_demo_subtasking_worker", + "dockerhubaneo/armonik_demo_linearsubtasking_worker", + "dockerhubaneo/armonik_demo_multipleresults_worker" + ] + }, + 
"image_tags": { + "registry.k8s.io/autoscaling/cluster-autoscaler": "v1.31.0", + "registry.k8s.io/metrics-server/metrics-server": "v0.7.2", + "ghcr.io/kedacore/keda": "2.16.0", + "ghcr.io/kedacore/keda-metrics-apiserver": "2.16.0", + "public.ecr.aws/aws-ec2/aws-node-termination-handler": "v1.22.1", + "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver": "v2.1.0", + "public.ecr.aws/eks-distro/kubernetes-csi/livenessprobe": "v2.14.0-eks-1-31-7", + "public.ecr.aws/eks-distro/kubernetes-csi/node-driver-registrar": "v2.12.0-eks-1-31-7", + "public.ecr.aws/eks-distro/kubernetes-csi/external-provisioner": "v5.1.0-eks-1-31-7", + "symptoma/activemq": "5.18.4", + "mongo": "8.0.3", + "bitnami/mongodb": "8.0.3-debian-12-r0", + "bitnami/mongodb-sharded": "8.0.3-debian-12-r0", + "rtsp/mongosh": "2.3.3", + "redis": "7.4.1-alpine", + "minio/minio": "RELEASE.2024-11-07T00-52-20Z", + "datalust/seq": "2024.3", + "datalust/seqcli": "2024.3", + "grafana/grafana": "11.3.0", + "prom/node-exporter": "v1.8.2", + "prom/prometheus": "v3.0.0", + "fluent/fluent-bit": "3.1.10", + "nginxinc/nginx-unprivileged": "1.27.2-alpine-slim", + "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner": "v4.0.18", + "bitnami/rabbitmq": "4.0.3", + "ghcr.io/chaos-mesh/chaos-mesh": "v2.7.0", + "ghcr.io/chaos-mesh/chaos-daemon": "v2.7.0", + "ghcr.io/chaos-mesh/chaos-dashboard": "v2.7.0", + "ghcr.io/prometheus-community/windows-exporter": "0.29.2-ltsc2022", + "mcr.microsoft.com/windows/nanoserver": "ltsc2022" + }, + "helm_charts" : { + "keda" : { "repository" : "https://kedacore.github.io/charts" , "version" : "2.16.0"}, + "metrics_server" : { "repository" : "https://kubernetes-sigs.github.io/metrics-server/" , "version" :"3.12.2"}, + "cluster_autoscaler" : {"repository" : "https://kubernetes.github.io/autoscaler" , "version" : "9.43.2"}, + "termination_handler" : {"repository" : "https://aws.github.io/eks-charts" , "version" : "0.21.0" }, + "efs_csi_driver" : { "repository" 
:"https://kubernetes-sigs.github.io/aws-efs-csi-driver/" , "version": "3.0.8" }, + "rabbitmq" : { "repository" : "https://charts.bitnami.com/bitnami" , "version" : "13.0.2"}, + "chaos_mesh" : { "repository" : "https://charts.chaos-mesh.org" , "version" : "2.6.3"}, + "mongodb" : { "repository": "oci://registry-1.docker.io/bitnamicharts", "version" : "16.2.2"}, + "mongodb-sharded" : { "repository": "oci://registry-1.docker.io/bitnamicharts", "version" : "9.0.3" } + } +} diff --git a/benchmarking/aws/README.md b/benchmarking/aws/README.md new file mode 100644 index 000000000..e371bfe3b --- /dev/null +++ b/benchmarking/aws/README.md @@ -0,0 +1,16 @@ +# ArmoniK benchmarking on AWS +This folder contains a Terraform parameters file describing the infrastructure that has been chosen as *ArmoniK reference infrastructure on AWS* that is used for regular benchmarks at each ArmoniK release. This file might evolve alongside ArmoniK. + +Thus for reproducibility concerns, this folder also contains subfolders that save dumps of the exact infrastructure configuration used for each version of ArmoniK benchmarked as well as the versions of ArmoniK's components. + +The subfolders are actually named as following, given a version *X.X.X* and a commit SHA *123abc* : **X-X-X_123abc** + +### How to reproduce an ArmoniK infrastructure for benchmarking ? + +1. Choose an ArmoniK version and save the associated subfolder name. + +2. Make sure you are at the root of the ArmoniK folder. + +3. Go to AWS quick-deploy : `cd infrastructure/quick-deploy/aws` + +4. 
Deploy ArmoniK with the appropriate Terraform parameters files located in the subfolder you selected : `make deploy PARAMETERS_FILE=../../../../benchmarking/aws/{ARMONIK_VERSION}_{COMMIT_SHA}/parameters.tfvars VERSIONS_FILE=../../../../benchmarking/aws/{ARMONIK_VERSION}_{COMMIT_SHA}/versions.tfvars.json` \ No newline at end of file diff --git a/benchmarking/aws/parameters.tfvars b/benchmarking/aws/parameters.tfvars new file mode 100644 index 000000000..143a06817 --- /dev/null +++ b/benchmarking/aws/parameters.tfvars @@ -0,0 +1,408 @@ +# Tags +tags = { + "name" = "bench" + "origin" = "terraform" + "csp" = "aws" + "Terraform" = "true" +} + +vpc = { + enable_private_subnet = false +} + +# AWS EKS +eks = { + cluster_version = "1.25" + node_selector = { service = "monitoring" } + cluster_endpoint_public_access = true + map_roles = [] + map_users = [] +} + +eks_managed_node_groups = { + workers = { + name = "workers" + launch_template_description = "Node group for ArmoniK Compute-plane pods" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" # "SPOT" + min_size = 8 + desired_size = 8 + max_size = 8 + labels = { + service = "workers" + "node.kubernetes.io/lifecycle" = "ondemand" # "spot" + } + taints = { + dedicated = { + key = "service" + value = "workers" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + metrics = { + name = "metrics" + launch_template_description = "Node group for metrics: Metrics exporter and Prometheus" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "metrics" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "metrics" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + 
iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for ArmoniK control-plane: control-plane and Ingress + control_plane = { + name = "control-plane" + launch_template_description = "Node group for ArmoniK Control-plane and Ingress" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 2 + labels = { + service = "control-plane" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "control-plane" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for monitoring: metrics server, keda, seq, grafana, cluster-autoscaler, coreDNS, termination handler + monitoring = { + name = "monitoring" + launch_template_description = "Node group for monitoring" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "monitoring" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "monitoring" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for data-plane + # state_database, inner_storage, task_queue + state_database = { + name = "mongodb" + launch_template_description = "Node group for MongoDB" + ami_type = "AL2_x86_64" + instance_types = ["c5a.8xlarge"] + use_custom_launch_template = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 75 + volume_type = "gp3" + iops = 5000 + throughput = 1000 + encrypted = null + kms_key_id = null + 
delete_on_termination = true + } + } + } + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 1 + labels = { + service = "state-database" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "state-database" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +self_managed_node_groups = { + others = { + name = "others" + launch_template_description = "Node group for others" + instance_type = "c5.large" + min_size = 0 + desired_size = 0 + max_size = 5 + force_delete = true + force_delete_warm_pool = true + instance_market_options = { + market_type = "spot" + } + bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'" + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + others_mixed = { + name = "others-mixed" + launch_template_description = "Mixed On demand and SPOT instances for other pods" + min_size = 1 + desired_size = 1 + max_size = 5 + use_mixed_instances_policy = true + mixed_instances_policy = { + on_demand_allocation_strategy = "lowest-price" + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = 20 # 20% On-Demand Instances, 80% Spot Instances + spot_allocation_strategy = "price-capacity-optimized" + spot_instance_pools = null + spot_max_price = null + } + override = [ + { + instance_type = "c5.4xlarge" + weighted_capacity = "1" + }, + { + instance_type = "c5.2xlarge" + weighted_capacity = "2" + }, + ] + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +# List of fargate profiles +fargate_profiles = {} + +metrics_server = { + node_selector = 
{ service = "monitoring" } +} + +keda = { + node_selector = { service = "monitoring" } +} + +# Object storage +# Uncomment either the `elasticache` or the `s3_os` parameter +elasticache = { + engine = "redis" + engine_version = "6.x" + node_type = "cache.r4.large" + num_cache_clusters = 1 +} + +#s3_os = {} + +mq = { + engine_type = "ActiveMQ" + engine_version = "5.17.6" + host_instance_type = "mq.m5.xlarge" +} + +mongodb = { + node_selector = { service = "state-database" } + replicas = 2 + mongodb_resources = { + limits = { + "cpu" = "30" + "memory" = "60Gi" + "ephemeral-storage" = "20Gi" + } + requests = { + "cpu" = "14" + "memory" = "29Gi" + "ephemeral-storage" = "4Gi" + } + } +} + +seq = { + node_selector = { service = "monitoring" } +} + +grafana = { + node_selector = { service = "monitoring" } +} + +node_exporter = { + node_selector = {} +} + +windows_exporter = { + node_selector = { + "plateform" = "windows" + } +} + +prometheus = { + node_selector = { service = "metrics" } +} + +metrics_exporter = { + node_selector = { service = "metrics" } +} + + +fluent_bit = { + is_daemonset = true + node_selector = {} +} + +logging_level = "Information" + +control_plane = { + limits = { + cpu = "2000m" + memory = "4096Mi" + } + requests = { + cpu = "1000m" + memory = "2048Mi" + } + default_partition = "default" + replicas = 12 + node_selector = { service = "control-plane" } +} + +admin_gui = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "100m" + memory = "128Mi" + } + node_selector = { service = "monitoring" } +} + +compute_plane = { + bench = { + node_selector = { service = "workers" } + replicas = 120 + polling_agent = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "256Mi" + } + } + worker = [ + { + image = "dockerhubaneo/armonik_core_bench_test_worker" + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "512Mi" + } + } + ] + }, +} + +ingress = { + 
tls = false + mtls = false + generate_client_cert = false + node_selector = { service = "control-plane" } +} + +# Job to insert partitions in the database +job_partitions_in_database = { + node_selector = { service = "control-plane" } +} + +# Authentication behavior +authentication = { + node_selector = { service = "control-plane" } +} + +configurations = { + core = { + env = { + Amqp__AllowHostMismatch = false + Amqp__MaxPriority = "10" + Amqp__MaxRetries = "5" + Amqp__QueueStorage__LockRefreshPeriodicity = "00:00:45" + Amqp__QueueStorage__PollPeriodicity = "00:00:10" + Amqp__QueueStorage__LockRefreshExtension = "00:02:00" + MongoDB__TableStorage__PollingDelayMin = "00:00:01" + MongoDB__TableStorage__PollingDelayMax = "00:00:10" + MongoDB__TableStorage__PollingDelay = "00:00:01" + MongoDB__DataRetention = "1.00:00:00" # 1 day retention + MongoDB__AllowInsecureTls = true + Redis__Timeout = 3000 + Redis__SslHost = "" + Redis__TtlTimeSpan = "1.00:00:00" # 1 day retention + Submitter__DeletePayload = true + } + } + control = { + env = { + Submitter__MaxErrorAllowed = 50 + } + } + jobs = { env = { MongoDB__DataRetention = "1.00:00:00" } } +} + +environment_description = { + name = "aws-dev" + version = "0.0.0" + description = "AWS environment" + color = "#80ff80" +} + +upload_images = false diff --git a/tools/ci/bench-job-template.yml b/tools/ci/bench-job-template.yml new file mode 100644 index 000000000..919a16066 --- /dev/null +++ b/tools/ci/bench-job-template.yml @@ -0,0 +1,41 @@ +apiVersion: batch/v1 +kind: Job +metadata: + name: bench-session + namespace: armonik +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 1200 + template: + spec: + restartPolicy: Never + containers: + - name: bench-session + image: dockerhubaneo/armonik_core_bench_test_client:@@ARMONIK_CORE_VERSION@@ # version should at least be 0.27.4 + env: + - name: BenchOptions__NTasks + value: "@@NTASKS@@" + - name: BenchOptions__Partition + value: bench + - name: BenchOptions__Options__SessionName + 
value: "@@SESSION_NAME@@" + - name: BenchOptions__PayloadSize + value: "1" + - name: BenchOptions__ResultSize + value: "1" + - name: BenchOptions__TaskDurationMs + value: "0" + - name: BenchOptions__DegreeOfParallelism + value: "10" + - name: BenchOptions__PauseSessionDuringSubmission + value: "true" + - name: BenchOptions__PurgeData + value: "false" + - name: BenchOptions__DownloadResults + value: "false" + - name: GrpcClient__Endpoint + value: http://@@GRPC_CLIENT_ENDPOINT@@:5001 + resources: + requests: + cpu: "1" + memory: "500Mi" diff --git a/tools/ci/python/program.py b/tools/ci/python/program.py new file mode 100644 index 000000000..019ba2eb0 --- /dev/null +++ b/tools/ci/python/program.py @@ -0,0 +1,326 @@ +import datetime +import time +from typing import Any +import grpc +import argparse +import json +import sys +from logging import Filter, LogRecord +import logging.config +from pathlib import Path +from armonik.client import ArmoniKTasks, ArmoniKSessions +from armonik.common import Task, TaskStatus, Session, SessionStatus, Direction + + +class LogMsgStripFilter(Filter): + """Return a copy of the string with leading and trailing whitespace removed.""" + + def filter(self, record: LogRecord) -> bool: + try: + record.msg = record.msg.strip() + except AttributeError: + pass + return True + + +class ContextFilter(Filter): + """Process context and return and empty dict when not provided""" + + def filter(self, record: Any) -> bool: + try: + _ = record.context + if isinstance(_, dict): + record.context = json.dumps(_) + except AttributeError: + record.context = {} + return True + + +class SessionNotFoundError(Exception): + """Exception raised when a session cannot be found""" + + pass + + +LEVEL = "INFO" +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "console": { + "datefmt": "%Y-%m-%dT%H:%M:%S", + "format": "%(asctime)s.%(msecs)03dZ%(levelname)s [%(funcName)s] | {" + '"message": "%(message)s", "filename": "%(filename)s", 
"line": %(lineno)d, ' + '"context": %(context)s}', + } + }, + "filters": { + "log_msg_strip_filter": {"()": LogMsgStripFilter}, + "context_filter": {"()": ContextFilter}, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": LEVEL, + "formatter": "console", + "filters": ["log_msg_strip_filter", "context_filter"], + } + }, + "loggers": {"my_logger": {"handlers": ["console"], "level": "INFO"}}, +} + +logging.config.dictConfig(LOGGING) +logger = logging.getLogger("my_logger") + + +def get_session_id_by_name(session_name: str, grpc_channel) -> str: + """ + Retrieves a session id by its name defined as HtcMock.Options.SessionName . + If multiple sessions have the same name, the one retrieved is the last from the list returned by the API + + Args: + session_name: name of the session + grpc_channel: gRPC channel with ArmoniK's control plane + + Returns: + Session id + + Exception: + SessionNotFoundError: When session_name cannot match any session's SessionName + """ + + sessions_client = ArmoniKSessions(grpc_channel) + + try: + session_id = sessions_client.list_sessions( + Session.options["SessionName"] == session_name + )[1][-1].session_id + return session_id + except IndexError: + raise SessionNotFoundError + + +def get_session_stats(session_id: str, grpc_channel: grpc.Channel) -> dict: + """ + Retrieves stats for a session. + For now retrieves throughput and number of tasks completed. + + Args: + session_id: id of the session + grpc_channel: gRPC channel with ArmoniK's control plane + + Returns: + Dictionnary with metric name as key and metric value as value. 
+ """ + + tasks_client = ArmoniKTasks(grpc_channel) + + tasks_count, tasks_list = tasks_client.list_tasks( + (Task.status == TaskStatus.COMPLETED) & (Task.session_id == session_id), + page=0, + page_size=1, + sort_field=Task.processed_at, + sort_direction=Direction.ASC, + ) + first_processed_task = tasks_list[0] + + last_ended_task = tasks_client.list_tasks( + (Task.status == TaskStatus.COMPLETED) & (Task.session_id == session_id), + page=0, + page_size=1, + sort_field=Task.ended_at, + sort_direction=Direction.DESC, + )[1][0] + + logger.info( + "Session stats summary", + extra={ + "context": { + "Task count:": tasks_count, + "First task started at": first_processed_task.started_at.strftime( + "%m/%d/%Y, %H:%M:%S" + ), + "Last task to end ended at": last_ended_task.ended_at.strftime( + "%m/%d/%Y, %H:%M:%S" + ), + } + }, + ) + + return { + "tasks_count": tasks_count, + "throughput": tasks_count + / (last_ended_task.ended_at - first_processed_task.started_at).total_seconds(), + } + + +def poll_session_ending( + session_id: str, grpc_channel: grpc.Channel, polling_limit: float +): + """ + Polls for a session to be completed (CANCELLED status). 
def main(session_name: str, grpc_endpoint: str, polling_limit: float) -> list[dict]:
    """
    Retrieves a session's stats by its name.

    Resolves the session id, waits (up to `polling_limit` seconds) for the
    session to be CLOSED, then collects its stats.

    Args:
        session_name: name of the session
        grpc_endpoint: host of ArmoniK's control plane (port 5001 is appended below)
        polling_limit: number of seconds to poll before timeout

    Returns:
        A single-element list holding the session's metadata (id and name) and
        its metrics (throughput and total number of tasks).

    Raises:
        SessionNotFoundError: when no session matches `session_name`
        TimeoutError: when the session is not CLOSED within `polling_limit` seconds
    """

    # NOTE(review): the control-plane port is hard-coded to 5001 — confirm
    # callers always pass a bare host in `grpc_endpoint`.
    with grpc.insecure_channel(f"{grpc_endpoint}:5001") as channel:
        session_id = get_session_id_by_name(session_name, channel)
        poll_session_ending(session_id, channel, polling_limit)
        session_stats = get_session_stats(session_id, channel)

    session_stats_json = [
        {
            "metadata": {"session_id": session_id, "session_name": session_name},
            "metrics": {
                "throughput": {
                    "name": "Throughput",
                    "unit": "Task per second",
                    "value": session_stats["throughput"],
                },
                "tasks_count": {
                    "name": "Total number of tasks",
                    "unit": "Task",
                    "value": session_stats["tasks_count"],
                },
            },
        }
    ]

    logger.debug(
        "Session stats",
        extra={
            "context": {
                "Session name": session_name,
                "Session id": session_id,
                "Bench Results": session_stats_json,
            }
        },
    )

    return session_stats_json
+ """ + + file_directory = Path(path) + file_directory.mkdir(parents=True, exist_ok=True) + + file_name = f"session_{session_stats[0]['metadata']['session_id']}_benchmark_{session_stats[0]['metrics']['tasks_count']['value']}tasks.json" + + absolute_file_path = file_directory.resolve() / file_name + + content = json.dumps(session_stats) + + logger.debug( + "JSON file to be written", + extra={ + "context": { + "directory": file_directory, + "filename": file_name, + "path": absolute_file_path, + "content": content, + } + }, + ) + + with open(absolute_file_path, "w") as output_file: + output_file.write(content) + + return absolute_file_path + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument("grpc_endpoint", type=str) + parser.add_argument("-n", "--session-name", type=str, default="") + parser.add_argument("-l", "--polling-limit", type=float, default=300) + parser.add_argument("-p", "--output-path", type=str, default="") + + args = parser.parse_args() + + try: + session_stats = main(args.session_name, args.grpc_endpoint, args.polling_limit) + output_path = write_json_output(session_stats) + print(output_path, file=sys.stdout) + except SessionNotFoundError: + logger.error( + "Session not found", + extra={"context": {"Session name provided": args.session_name}}, + ) + sys.exit(1) + except TimeoutError: + logger.error( + "Session exceeded polling duration", + extra={"context": {"Session name provided": args.session_name}}, + ) + sys.exit(1) \ No newline at end of file diff --git a/tools/ci/python/requirements.txt b/tools/ci/python/requirements.txt new file mode 100644 index 000000000..a4aae0b36 --- /dev/null +++ b/tools/ci/python/requirements.txt @@ -0,0 +1,2 @@ +armonik==3.21.0 +argparse==1.4.0 \ No newline at end of file