From e9d1240e0d591cd567a1cdeac373f4c9fb91bc06 Mon Sep 17 00:00:00 2001 From: sleep Date: Thu, 9 Jan 2025 09:26:59 +0100 Subject: [PATCH 01/24] Added SQS support to ArmoniK --- infrastructure/quick-deploy/aws/armonik.tf | 8 ++++++-- infrastructure/quick-deploy/aws/storage.tf | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/infrastructure/quick-deploy/aws/armonik.tf b/infrastructure/quick-deploy/aws/armonik.tf index b77ff7a0b..95ff1fda4 100644 --- a/infrastructure/quick-deploy/aws/armonik.tf +++ b/infrastructure/quick-deploy/aws/armonik.tf @@ -26,6 +26,7 @@ module "armonik" { pod_configuration = null } }, v, { + service_account_name = "armonikserviceaccount" polling_agent = merge(v.polling_agent, { image = local.ecr_images["${v.polling_agent.image}:${try(coalesce(v.polling_agent.tag), "")}"].name tag = local.ecr_images["${v.polling_agent.image}:${try(coalesce(v.polling_agent.tag), "")}"].tag @@ -36,8 +37,9 @@ module "armonik" { })] }) } control_plane = merge(var.control_plane, { - image = local.ecr_images["${var.control_plane.image}:${try(coalesce(var.control_plane.tag), "")}"].name - tag = local.ecr_images["${var.control_plane.image}:${try(coalesce(var.control_plane.tag), "")}"].tag + image = local.ecr_images["${var.control_plane.image}:${try(coalesce(var.control_plane.tag), "")}"].name + tag = local.ecr_images["${var.control_plane.image}:${try(coalesce(var.control_plane.tag), "")}"].tag + service_account_name = "armonikserviceaccount" }) admin_gui = merge(var.admin_gui, { image = local.ecr_images["${var.admin_gui.image}:${try(coalesce(var.admin_gui.tag), "")}"].name @@ -76,4 +78,6 @@ module "armonik" { image = local.ecr_images["${var.pod_deletion_cost.image}:${try(coalesce(var.pod_deletion_cost.tag), "")}"].image tag = local.ecr_images["${var.pod_deletion_cost.image}:${try(coalesce(var.pod_deletion_cost.tag), "")}"].tag }) + + depends_on = [module.aws_service_account] } diff --git a/infrastructure/quick-deploy/aws/storage.tf b/infrastructure/quick-deploy/aws/storage.tf index aacf072ac..e28be80cc 100644 --- a/infrastructure/quick-deploy/aws/storage.tf +++ b/infrastructure/quick-deploy/aws/storage.tf @@ -17,6 +17,7 @@ module "s3_fs" { sse_algorithm = can(coalesce(var.kms_key)) ? var.s3_fs.sse_algorithm : "aws:kms" ownership = var.s3_fs.ownership versioning = var.s3_fs.versioning + role_name = module.aws_service_account.service_account_iam_role_name } # Shared storage @@ -143,6 +144,15 @@ module "mq" { kms_key_id = local.kms_key } +module "aws_service_account" { + namespace = local.namespace + source = "./generated/infra-modules/service-account/aws" + prefix = local.prefix + name = "armonikserviceaccount" + oidc_provider_arn = module.eks.aws_eks_module.oidc_provider_arn + oidc_issuer_url = module.eks.aws_eks_module.cluster_oidc_issuer_url +} + # MongoDB module "mongodb" { count = can(coalesce(var.mongodb_sharding)) ? 
0 : 1 @@ -277,6 +287,14 @@ module "mongodb_efs_persistent_volume" { tags = local.tags } + +resource "aws_iam_policy_attachment" "armonik_decrypt_object" { + name = "storage-s3-encrypt-decrypt-armonik" + roles = [module.aws_service_account.service_account_iam_role_name] + policy_arn = aws_iam_policy.decrypt_object.arn +} + + # Decrypt objects in S3 data "aws_iam_policy_document" "decrypt_object" { statement { From 907d65916b73a362ac029ed9ad1f6201be9f4e86 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 25 Nov 2024 15:10:38 +0100 Subject: [PATCH 02/24] create gcp documentation --- .docs/content/1.installation/4.gcp/_dir.yml | 2 + .../4.gcp/gcp-all-in-one-deployment.md | 153 ++++++++++++++++++ 2 files changed, 155 insertions(+) create mode 100644 .docs/content/1.installation/4.gcp/_dir.yml create mode 100644 .docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md diff --git a/.docs/content/1.installation/4.gcp/_dir.yml b/.docs/content/1.installation/4.gcp/_dir.yml new file mode 100644 index 000000000..0dc1d33df --- /dev/null +++ b/.docs/content/1.installation/4.gcp/_dir.yml @@ -0,0 +1,2 @@ +title: GCP +navigation.icon: vscode-icons:file-type-gcp diff --git a/.docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md new file mode 100644 index 000000000..5167c7bc7 --- /dev/null +++ b/.docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md @@ -0,0 +1,153 @@ +# GCP all in one deployment Guide + +This guide will help you deploy your Armonik project on Google Cloud Platform (GCP). + + + +## Step 1: Preparation + +### 1. Install Google CLI + +When receiving your credentials (email and password). +Download and install the Google CLI by following the instructions on the [following link](https://cloud.google.com/sdk/docs/install?hl=fr#deb) + + +### 2. Initial Setup + + +After installation, authenticate using the provided credentials (email and password) and select the project **armonik gcp 13469** +Follow [following tutorial](https://cloud.google.com/docs/authentication/provide-credentials-adc?hl=fr#how-to) to authenticate into the CLI. + +You can authenticate using the following command: + +```bash +gcloud auth login +``` + +To configure the project, if you don't know the project ID, you can list all the projects using the following command: + +```bash +gcloud projects list +``` + +To configure the project, use the following command: + +```bash +gcloud config set project +``` + +Once authenticated, you should see a confirmation page with the message: + +*You are now authenticated with the gcloud CLI!* + + +## Step 2: Deployment + +### 1. Boostrap + +Navigate to the **gcp folder** (infrastructure/quick-deploy/gcp) and generate a prefix key by running the following command: + +```bash +make bootstrap-deploy PREFIX= +``` + +### 2. Deploy + +To deploy your resources, execute: +``` +make deploy PREFIX= +``` + +## Step 3: Cleanup + +### 1. Destroy the deployment + +#### After using the deployment, you have to make sure to destroy it to avoid any additional costs. #### + +To destroy the deployment, use the following command: + +```bash +make destroy PREFIX= +``` + +### 2. Destroy the GCP Prefix Key + +To clean up the GCP prefix key, use the following command: + +```bash +make bootstrap-destroy PREFIX= +``` + + +## Step 4: Add a Sample Partition + +Just like with an **AWS** or **localhost** deployment, you can add a sample partition to test deployment on the **GCP** environment. 
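As a minimal sketch, a test partition in `parameters.tfvars` could look like the block below. The partition name `bench` and its settings are placeholders, and the exact schema should be checked against the existing `default` partition entry in that file:

```hcl
# Hypothetical extra partition, added next to the existing "default" entry.
bench = {
  node_selector = { service = "workers" }
  replicas      = 0 # start at 0 and scale up after a successful deploy (see Troubleshooting below)
}
```

After editing the file, redeploy with `make deploy PREFIX=<prefix>` so the new partition is taken into account.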
+ + +## Troubleshooting + +### 1. Deployment Error with New Partition +In case of an error during deployment after adding a new partition with the value replicas > 0, you can put the replicas to 0 and redeploy the resources. +If the deployment succeeds, you can then update the replicas to the desired value and redeploy the resources. + +--- +### 2. Pub/Sub Issue client side +If you encounter an issue with the Pub/Sub client, with the following error message: `FAILED_PRECONDITION: Requested entity was not found.`, you can follow the steps in the [following link](https://cloud.google.com/pubsub/docs/customer-managed-encryption-keys#troubleshooting) to resolve the issue. + +--- +#### Error Message + +When running the application, you may encounter the following error: +grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with: status = StatusCode.FAILED_PRECONDITION details = "Cloud Pub/Sub did not have the necessary permissions configured to support this operation. Please verify that the service account [SERVICE_ACCOUNT] was granted the Cloud KMS CryptoKey Encrypter/Decrypter role for the project containing the CryptoKey resource [PROJECT_ID]/locations/[LOCATION]/keyRings/[KEY_RING]/cryptoKeys/[CRYPTO_KEY]." debug_error_string = "UNKNOWN:Error received from peer {created_time:"[TIMESTAMP]", grpc_status:9, grpc_message:"Cloud Pub/Sub did not have the necessary permissions configured to support this operation. Please verify that the service account [SERVICE_ACCOUNT] was granted the Cloud KMS CryptoKey Encrypter/Decrypter role for the project containing the CryptoKey resource [PROJECT_ID]/locations/[LOCATION]/keyRings/[KEY_RING]/cryptoKeys/[CRYPTO_KEY]."}" + + +--- + +#### Problem Description + +This error occurs because Cloud Pub/Sub requires access to the specified CMEK to encrypt or decrypt messages. The service account used by Pub/Sub does not have the **Cloud KMS CryptoKey Encrypter/Decrypter** role for the specified CryptoKey. + +#### Key Components in the Error: +- **Service Account**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` +- **CryptoKey Resource**: + - Project: `[PROJECT_ID]` + - Location: `[LOCATION]` (e.g., `europe-west1`) + - Key Ring: `[KEY_RING]` + - CryptoKey: `[CRYPTO_KEY]` +- **Missing Role**: `roles/cloudkms.cryptoKeyEncrypterDecrypter` + +Without this role, Cloud Pub/Sub cannot perform encryption or decryption using the CMEK. + +--- + +## Solution + +To resolve the issue, grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the Pub/Sub service account for the specified CryptoKey. + +### Step 1: Identify the Service Account +The service account mentioned in the error typically has the format: service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com + +This service account is automatically created by Google Cloud to manage Pub/Sub operations. +--- +### Step 2: Grant the Necessary Role + +#### ** Using the Google Cloud Console** +1. Open the [Google Cloud Console](https://console.cloud.google.com). +2. Navigate to **Key Management > CryptoKeys**. +3. Locate the CryptoKey resource: + - **Project**: `[PROJECT_ID]` + - **Location**: `[LOCATION]` (e.g., `europe-west1`) + - **Key Ring**: `[KEY_RING]` + - **CryptoKey**: `[CRYPTO_KEY]` +4. Click on the CryptoKey and go to the **Permissions** tab. +5. Add the service account as a principal: + - **Principal**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` + - **Role**: `Cloud KMS CryptoKey Encrypter/Decrypter`. +6. Save the changes. 
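Alternatively, the same binding can be granted from the command line. A sketch using `gcloud`, where every bracketed value is a placeholder taken from the error message:

```bash
# Grant the Pub/Sub service agent the Encrypter/Decrypter role on the CryptoKey.
gcloud kms keys add-iam-policy-binding [CRYPTO_KEY] \
  --keyring=[KEY_RING] \
  --location=[LOCATION] \
  --project=[PROJECT_ID] \
  --member="serviceAccount:service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com" \
  --role="roles/cloudkms.cryptoKeyEncrypterDecrypter"
```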
+ +--- +After granting the role, Cloud Pub/Sub should be able to access the specified CMEK for encryption and decryption operations. You can now retry the operation that triggered the error. You should deploy the resources again to ensure that the changes take effect. +When the deployment is successful, you can verify that the Pub/Sub client can now access the CMEK without any issues. + + + From ca83e02920821f7c9adf06d6ea9a5f2c0300453a Mon Sep 17 00:00:00 2001 From: nico_dreylaq Date: Mon, 25 Nov 2024 16:36:22 +0100 Subject: [PATCH 03/24] chores: add doc aws --- .../3.aws/aws-deployment-troubleshooting.md | 103 ++++++++++++++++++ .../1.installation/3.aws/aws-first-setup.md | 53 +++++++++ 2 files changed, 156 insertions(+) create mode 100644 .docs/content/1.installation/3.aws/aws-deployment-troubleshooting.md create mode 100644 .docs/content/1.installation/3.aws/aws-first-setup.md diff --git a/.docs/content/1.installation/3.aws/aws-deployment-troubleshooting.md b/.docs/content/1.installation/3.aws/aws-deployment-troubleshooting.md new file mode 100644 index 000000000..ea8aeec33 --- /dev/null +++ b/.docs/content/1.installation/3.aws/aws-deployment-troubleshooting.md @@ -0,0 +1,103 @@ +# AWS Deployment Troubleshooting Guide 🛠️ + +This guide covers common issues you might encounter while deploying your application on AWS and provides solutions to address them. + +## 1. Error: Partition Not Found + +If you encounter a Partition Not Found error during deployment or runtime, ensure the following: +- Set the Replication Factor to 1 in the **parameters.tfvars**: + +```tf + default = { + node_selector = { service = "workers" } + replicas = 1 # HERE +``` + +- Check Spelling in Configuration Files. +- Double-check for any typos in partition names or configurations. Even minor spelling mistakes can cause the error. + +For example in **parameters.tfvars**: + +```tf +default = { + node_selector = { service = "workers" } + replicas = 1 # HERE +``` + +instead of +``` +defautl = { + node_selector = { service = "workers" } + replicas = 1 # HERE + +``` +## 2. Error: Incorrect URL or Missing Endpoint + +If your application fails to connect to a service, verify the following: +### Use the Correct URL Generated by AWS: + +After deploying your application or service, AWS typically generates a URL for the *control_plane* or other services. Make sure to: + +- Copy the URL provided by AWS during deployment. +- Update the client configuration with the correct endpoint before building your project. + +Ensure the endpoint is properly defined in your client application, as shown below: + +```csharp + var endpoint = new Option("--endpoint", + description: "Endpoint pour la connexion au control plane ArmoniK.", + getDefaultValue: () => "http://localhost:5001"); +``` + +### Adapting the URL When Using Docker + +When running your application in a Docker container, remember to update the endpoint dynamically. For instance: +```bash +docker run --rm --name client --endpoint "http://:5001" --partition +``` + +- Replace with the actual IP or hostname of the deployed service. +- Replace with the appropriate partition name you’re using. +- Always double-check that the endpoint and partition names are spelled correctly to avoid runtime errors. + +## 3. Missing Images + +If you encounter an error related to the Docker image not being found or incorrectly configured, follow these steps to ensure the image is built and referenced correctly. 
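As a quick preliminary check — assuming the worker image is tagged `subtasking`, as in the build command below — you can verify whether the image exists locally; an empty result means it was never built or was tagged differently:

```bash
# List local images with the expected repository name.
docker image ls subtasking
```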
+ +### Ensure the Worker Image is Built Correctly + +You need to build the Docker image for the worker using the following command: + +```bash +docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" +``` +- **-f**: Specifies the path to the Dockerfile for your worker. +- **-t subtasking**: Tags the image as subtasking, which will be used as the image name. +- **"./csharp/native/"**: Specifies the context for the build (i.e., the root directory containing your source files). + +Voici la documentation mise à jour pour inclure la gestion des erreurs liées aux images Docker : +3. Error: Docker Image Not Found + +If you encounter an error related to the Docker image not being found or incorrectly configured, follow these steps to ensure the image is built and referenced correctly. +Step 1: Ensure the Worker Image is Built Correctly + +You need to build the Docker image for the worker using the following command: + +docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" + + -f: Specifies the path to the Dockerfile for your worker. + -t subtasking: Tags the image as subtasking, which will be used as the image name. + "./csharp/native/": Specifies the context for the build (i.e., the root directory containing your source files). + +### Verify the Image Tag Matches the Configuration + +The image name (**subtasking** in this case) must match the value specified in your Terraform parameters file (**parameters.tfvars**). Check the following: + +1. Open the **parameters.tfvars** file. +2. Look for the worker_image: +```hcl + worker = [ + { + image = "subtasking" + ... +``` \ No newline at end of file diff --git a/.docs/content/1.installation/3.aws/aws-first-setup.md b/.docs/content/1.installation/3.aws/aws-first-setup.md new file mode 100644 index 000000000..87d6f6211 --- /dev/null +++ b/.docs/content/1.installation/3.aws/aws-first-setup.md @@ -0,0 +1,53 @@ +# AWS Setup 🚀 + + +This guide will help you install and configure the AWS CLI on your system and set up your AWS environment for use. + +# 1. Installation & Configuration + +Follow the official AWS CLI install guide [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) + +For Linux-based systems, you can use the following commands: +```bash +curl "https://awscli.amazonaws.com/aws-cli-exe-linux-x86_64.zip" -o "awscliv2.zip" +unzip awscliv2.zip +sudo ./aws/install +``` + +Ensure the AWS CLI is installed correctly by checking its version: +```bash +aws --version +``` + +## 2. Configure AWS CLI + +Once installed, you need to configure the AWS CLI with your credentials: + +```bash +aws configure +``` +During configuration: +- Enter your **Access Key ID** and **Secret Access Key** (provided by AWS). +- Choose a default region, e.g., *eu-west-3* (optional but recommended). +- Specify the default output format (e.g., *json*, *table*, or *text*). + +To use AWS environnement, you can use the following: + +```bash +aws sso login +``` + +You should click on the URL provided in the output to open the SSO authorization page in a browser. + +## Step 1: Authorize in the Browser + +- Click on the first URL provided in the output. +- This will open the SSO authorization page in your default browser. +- After logging in, you’ll be prompted to grant permissions. + +## Step 2: Verify Login + +- Once authorized, the CLI will confirm successful login. 
+- Your authorization page should look similar to this: + +![aws-first-setup](aws-cli-access.png) \ No newline at end of file From 5ba39285169ef463afbc55b714d0327028107287 Mon Sep 17 00:00:00 2001 From: nico_dreylaq Date: Mon, 25 Nov 2024 16:54:41 +0100 Subject: [PATCH 04/24] chores: renaming files --- .../3.aws/{aws-first-setup.md => 0. aws-first-setup.md} | 2 +- ...all-in-one-deployment.md => 1. aws-all-in-one-deployment.md} | 0 ...-troubleshooting.md => 2. aws-deployment-troubleshooting.md} | 0 ...s-deployment-using-k3s.md => 3. aws-deployment-using-k3s.md} | 0 ...ment-using-kubeadm.md => 4. aws-deployment-using-kubeadm.md} | 0 5 files changed, 1 insertion(+), 1 deletion(-) rename .docs/content/1.installation/3.aws/{aws-first-setup.md => 0. aws-first-setup.md} (97%) rename .docs/content/1.installation/3.aws/{aws-all-in-one-deployment.md => 1. aws-all-in-one-deployment.md} (100%) rename .docs/content/1.installation/3.aws/{aws-deployment-troubleshooting.md => 2. aws-deployment-troubleshooting.md} (100%) rename .docs/content/1.installation/3.aws/{aws-deployment-using-k3s.md => 3. aws-deployment-using-k3s.md} (100%) rename .docs/content/1.installation/3.aws/{aws-deployment-using-kubeadm.md => 4. aws-deployment-using-kubeadm.md} (100%) diff --git a/.docs/content/1.installation/3.aws/aws-first-setup.md b/.docs/content/1.installation/3.aws/0. aws-first-setup.md similarity index 97% rename from .docs/content/1.installation/3.aws/aws-first-setup.md rename to .docs/content/1.installation/3.aws/0. aws-first-setup.md index 87d6f6211..e413e1b6e 100644 --- a/.docs/content/1.installation/3.aws/aws-first-setup.md +++ b/.docs/content/1.installation/3.aws/0. aws-first-setup.md @@ -50,4 +50,4 @@ You should click on the URL provided in the output to open the SSO authorization - Once authorized, the CLI will confirm successful login. - Your authorization page should look similar to this: -![aws-first-setup](aws-cli-access.png) \ No newline at end of file +![AWS CLI Access](./aws-cli-access.png) \ No newline at end of file diff --git a/.docs/content/1.installation/3.aws/aws-all-in-one-deployment.md b/.docs/content/1.installation/3.aws/1. aws-all-in-one-deployment.md similarity index 100% rename from .docs/content/1.installation/3.aws/aws-all-in-one-deployment.md rename to .docs/content/1.installation/3.aws/1. aws-all-in-one-deployment.md diff --git a/.docs/content/1.installation/3.aws/aws-deployment-troubleshooting.md b/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md similarity index 100% rename from .docs/content/1.installation/3.aws/aws-deployment-troubleshooting.md rename to .docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md diff --git a/.docs/content/1.installation/3.aws/aws-deployment-using-k3s.md b/.docs/content/1.installation/3.aws/3. aws-deployment-using-k3s.md similarity index 100% rename from .docs/content/1.installation/3.aws/aws-deployment-using-k3s.md rename to .docs/content/1.installation/3.aws/3. aws-deployment-using-k3s.md diff --git a/.docs/content/1.installation/3.aws/aws-deployment-using-kubeadm.md b/.docs/content/1.installation/3.aws/4. aws-deployment-using-kubeadm.md similarity index 100% rename from .docs/content/1.installation/3.aws/aws-deployment-using-kubeadm.md rename to .docs/content/1.installation/3.aws/4. 
aws-deployment-using-kubeadm.md From 736f6b3e20f0722ae807a8c86d0383959c491132 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 25 Nov 2024 16:58:18 +0100 Subject: [PATCH 05/24] add gcp documentation --- .docs/content/1.installation/3.aws/_dir.yml | 2 +- .../4.gcp/0. gcp-all-in-one-deployment.md | 79 +++++++++ .../4.gcp/1. gcp-troubleshooting.md | 74 +++++++++ .docs/content/1.installation/4.gcp/_dir.yml | 2 +- .../4.gcp/gcp-all-in-one-deployment.md | 153 ------------------ 5 files changed, 155 insertions(+), 155 deletions(-) create mode 100644 .docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md create mode 100644 .docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md delete mode 100644 .docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md diff --git a/.docs/content/1.installation/3.aws/_dir.yml b/.docs/content/1.installation/3.aws/_dir.yml index f344c96cf..6954dbbb2 100644 --- a/.docs/content/1.installation/3.aws/_dir.yml +++ b/.docs/content/1.installation/3.aws/_dir.yml @@ -1,2 +1,2 @@ title: AWS -navigation.icon: vscode-icons:file-type-aws +navigation.icon: vscode-icons:folder-type-aws diff --git a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md new file mode 100644 index 000000000..e2a95f166 --- /dev/null +++ b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md @@ -0,0 +1,79 @@ +# GCP all in one deployment Guide + +This guide will help you deploy your Armonik project on Google Cloud Platform (GCP). + +## Step 1: Preparation + +#### 1. Install Google CLI + +When receiving your credentials (email and password). +Download and install the Google CLI by following the instructions on the [following link](https://cloud.google.com/sdk/docs/install?hl=fr#deb) + +#### 2. Initial Setup + +After installation, authenticate using the provided credentials (email and password) and select the project you want to deploy the resources to. +Follow [following tutorial](https://cloud.google.com/docs/authentication/provide-credentials-adc?hl=fr#how-to) to authenticate into the CLI. + +You can authenticate using the following command: + +```bash +gcloud auth login +``` + +To configure the project, if you don't know the project ID, you can list all the projects using the following command: + +```bash +gcloud projects list +``` + +To configure the project, use the following command: + +```bash +gcloud config set project +``` + +Once authenticated, you should see a confirmation page with the message: + +*You are now authenticated with the gcloud CLI!* + +## Step 2: Deployment +Navigate to the **gcp folder** (infrastructure/quick-deploy/gcp) and follow the steps below to deploy your resources. +### 1. Boostrap + +Generate a prefix key ready in the GCP environment to deploy your resources. + +```bash +make bootstrap-deploy PREFIX= +``` + +### 2. Deploy + +To deploy your resources, execute: + +```bash +make deploy PREFIX= +``` + +## Step 3: Cleanup + +### 1. Destroy the deployment + +#### Attention: After using the deployment, you have to make sure to destroy it to avoid any additional costs. #### + +To destroy the deployment, use the following command: + +```bash +make destroy PREFIX= +``` + +### 2. 
Destroy the GCP Prefix Key + +To clean up the GCP prefix key, use the following command: + +```bash +make bootstrap-destroy PREFIX= +``` + +## Step 4: Add a Sample Partition + +Just like with an **AWS** or **localhost** deployment, you can add a sample partition to test deployment on the **GCP** environment. You need to build the images and redeploy the services after adding the sample partition in the parameters.tfvars file. diff --git a/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md b/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md new file mode 100644 index 000000000..91a4f6fde --- /dev/null +++ b/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md @@ -0,0 +1,74 @@ +# GCP Troubleshooting Guide + +This guide will help you troubleshoot common issues when deploying your Armonik project on Google Cloud Platform (GCP). + +## 1. Deployment Error with New Partition +If you encounter an error during deployment after adding a new partition with the value `replicas > 0`, follow these steps: +1. Set the replicas to `0`. +2. Redeploy the resources. +3. If the deployment succeeds, update the replicas to the desired value. +4. Redeploy the resources again. + +--- + +## 2. Pub/Sub Client-Side Issue +If you encounter an issue with the Pub/Sub client and see the error message: `FAILED_PRECONDITION: Requested entity was not found.`, follow the steps below to resolve the issue. + +### Error Message +When running the application, you may encounter the following error: + +```bash +grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with: + status = StatusCode.FAILED_PRECONDITION + details = "Cloud Pub/Sub did not have the necessary permissions configured to support this operation. + Please verify that the service account [SERVICE_ACCOUNT] was granted the Cloud KMS CryptoKey Encrypter/Decrypter role + for the project containing the CryptoKey resource [PROJECT_ID]/locations/[LOCATION]/keyRings/[KEY_RING]/cryptoKeys/[CRYPTO_KEY]." + debug_error_string = "UNKNOWN:Error received from peer {created_time:"[TIMESTAMP]", + grpc_status:9, + grpc_message:"Cloud Pub/Sub did not have the necessary permissions configured to support this operation. + Please verify that the service account [SERVICE_ACCOUNT] was granted the Cloud KMS CryptoKey Encrypter/Decrypter role + for the project containing the CryptoKey resource [PROJECT_ID]/locations/[LOCATION]/keyRings/[KEY_RING]/cryptoKeys/[CRYPTO_KEY]."} +``` + +### Problem Description +This error occurs because Cloud Pub/Sub requires access to the specified CMEK to encrypt or decrypt messages. The service account used by Pub/Sub does not have the **Cloud KMS CryptoKey Encrypter/Decrypter** role for the specified CryptoKey. + +### Key Components in the Error: +- **Service Account**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` +- **CryptoKey Resource**: + - Project: `[PROJECT_ID]` + - Location: `[LOCATION]` (e.g., `europe-west1`) + - Key Ring: `[KEY_RING]` + - CryptoKey: `[CRYPTO_KEY]` +- **Missing Role**: `roles/cloudkms.cryptoKeyEncrypterDecrypter` + +Without this role, Cloud Pub/Sub cannot perform encryption or decryption using the CMEK. + +### Solution + +To resolve the issue, grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the Pub/Sub service account for the specified CryptoKey. + +#### Step 1: Identify the Service Account +The service account mentioned in the error typically has the format: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com`. 
This service account is automatically created by Google Cloud to manage Pub/Sub operations. + +#### Step 2: Grant the Necessary Role +You can grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the service account using the Google Cloud Console or the gcloud command-line tool. + +##### Using the Google Cloud Console +1. Open the [Google Cloud Console](https://console.cloud.google.com). +2. Navigate to **Key Management > CryptoKeys**. +3. Locate the CryptoKey resource: + - **Project**: `[PROJECT_ID]` + - **Location**: `[LOCATION]` (e.g., `europe-west1`) + - **Key Ring**: `[KEY_RING]` + - **CryptoKey**: `[CRYPTO_KEY]` +4. Click on the CryptoKey and go to the **Permissions** tab. +5. Add the service account as a principal: + - **Principal**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` + - **Role**: `Cloud KMS CryptoKey Encrypter/Decrypter` +6. Save the changes. + +--- + +After granting the role, Cloud Pub/Sub should be able to access the specified CMEK for encryption and decryption operations. Retry the operation that triggered the error. Ensure that the deployment is successful and verify that the Pub/Sub client can now access the CMEK without any issues. + diff --git a/.docs/content/1.installation/4.gcp/_dir.yml b/.docs/content/1.installation/4.gcp/_dir.yml index 0dc1d33df..fa8498148 100644 --- a/.docs/content/1.installation/4.gcp/_dir.yml +++ b/.docs/content/1.installation/4.gcp/_dir.yml @@ -1,2 +1,2 @@ title: GCP -navigation.icon: vscode-icons:file-type-gcp +navigation.icon: vscode-icons:folder-type-gcp diff --git a/.docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md deleted file mode 100644 index 5167c7bc7..000000000 --- a/.docs/content/1.installation/4.gcp/gcp-all-in-one-deployment.md +++ /dev/null @@ -1,153 +0,0 @@ -# GCP all in one deployment Guide - -This guide will help you deploy your Armonik project on Google Cloud Platform (GCP). - - - -## Step 1: Preparation - -### 1. Install Google CLI - -When receiving your credentials (email and password). -Download and install the Google CLI by following the instructions on the [following link](https://cloud.google.com/sdk/docs/install?hl=fr#deb) - - -### 2. Initial Setup - - -After installation, authenticate using the provided credentials (email and password) and select the project **armonik gcp 13469** -Follow [following tutorial](https://cloud.google.com/docs/authentication/provide-credentials-adc?hl=fr#how-to) to authenticate into the CLI. - -You can authenticate using the following command: - -```bash -gcloud auth login -``` - -To configure the project, if you don't know the project ID, you can list all the projects using the following command: - -```bash -gcloud projects list -``` - -To configure the project, use the following command: - -```bash -gcloud config set project -``` - -Once authenticated, you should see a confirmation page with the message: - -*You are now authenticated with the gcloud CLI!* - - -## Step 2: Deployment - -### 1. Boostrap - -Navigate to the **gcp folder** (infrastructure/quick-deploy/gcp) and generate a prefix key by running the following command: - -```bash -make bootstrap-deploy PREFIX= -``` - -### 2. Deploy - -To deploy your resources, execute: -``` -make deploy PREFIX= -``` - -## Step 3: Cleanup - -### 1. Destroy the deployment - -#### After using the deployment, you have to make sure to destroy it to avoid any additional costs. 
#### - -To destroy the deployment, use the following command: - -```bash -make destroy PREFIX= -``` - -### 2. Destroy the GCP Prefix Key - -To clean up the GCP prefix key, use the following command: - -```bash -make bootstrap-destroy PREFIX= -``` - - -## Step 4: Add a Sample Partition - -Just like with an **AWS** or **localhost** deployment, you can add a sample partition to test deployment on the **GCP** environment. - - -## Troubleshooting - -### 1. Deployment Error with New Partition -In case of an error during deployment after adding a new partition with the value replicas > 0, you can put the replicas to 0 and redeploy the resources. -If the deployment succeeds, you can then update the replicas to the desired value and redeploy the resources. - ---- -### 2. Pub/Sub Issue client side -If you encounter an issue with the Pub/Sub client, with the following error message: `FAILED_PRECONDITION: Requested entity was not found.`, you can follow the steps in the [following link](https://cloud.google.com/pubsub/docs/customer-managed-encryption-keys#troubleshooting) to resolve the issue. - ---- -#### Error Message - -When running the application, you may encounter the following error: -grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with: status = StatusCode.FAILED_PRECONDITION details = "Cloud Pub/Sub did not have the necessary permissions configured to support this operation. Please verify that the service account [SERVICE_ACCOUNT] was granted the Cloud KMS CryptoKey Encrypter/Decrypter role for the project containing the CryptoKey resource [PROJECT_ID]/locations/[LOCATION]/keyRings/[KEY_RING]/cryptoKeys/[CRYPTO_KEY]." debug_error_string = "UNKNOWN:Error received from peer {created_time:"[TIMESTAMP]", grpc_status:9, grpc_message:"Cloud Pub/Sub did not have the necessary permissions configured to support this operation. Please verify that the service account [SERVICE_ACCOUNT] was granted the Cloud KMS CryptoKey Encrypter/Decrypter role for the project containing the CryptoKey resource [PROJECT_ID]/locations/[LOCATION]/keyRings/[KEY_RING]/cryptoKeys/[CRYPTO_KEY]."}" - - ---- - -#### Problem Description - -This error occurs because Cloud Pub/Sub requires access to the specified CMEK to encrypt or decrypt messages. The service account used by Pub/Sub does not have the **Cloud KMS CryptoKey Encrypter/Decrypter** role for the specified CryptoKey. - -#### Key Components in the Error: -- **Service Account**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` -- **CryptoKey Resource**: - - Project: `[PROJECT_ID]` - - Location: `[LOCATION]` (e.g., `europe-west1`) - - Key Ring: `[KEY_RING]` - - CryptoKey: `[CRYPTO_KEY]` -- **Missing Role**: `roles/cloudkms.cryptoKeyEncrypterDecrypter` - -Without this role, Cloud Pub/Sub cannot perform encryption or decryption using the CMEK. - ---- - -## Solution - -To resolve the issue, grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the Pub/Sub service account for the specified CryptoKey. - -### Step 1: Identify the Service Account -The service account mentioned in the error typically has the format: service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com - -This service account is automatically created by Google Cloud to manage Pub/Sub operations. ---- -### Step 2: Grant the Necessary Role - -#### ** Using the Google Cloud Console** -1. Open the [Google Cloud Console](https://console.cloud.google.com). -2. Navigate to **Key Management > CryptoKeys**. -3. 
Locate the CryptoKey resource: - - **Project**: `[PROJECT_ID]` - - **Location**: `[LOCATION]` (e.g., `europe-west1`) - - **Key Ring**: `[KEY_RING]` - - **CryptoKey**: `[CRYPTO_KEY]` -4. Click on the CryptoKey and go to the **Permissions** tab. -5. Add the service account as a principal: - - **Principal**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` - - **Role**: `Cloud KMS CryptoKey Encrypter/Decrypter`. -6. Save the changes. - ---- -After granting the role, Cloud Pub/Sub should be able to access the specified CMEK for encryption and decryption operations. You can now retry the operation that triggered the error. You should deploy the resources again to ensure that the changes take effect. -When the deployment is successful, you can verify that the Pub/Sub client can now access the CMEK without any issues. - - - From 1b0f57b0eaafb09d5a00194bc77aa481d90f0ef2 Mon Sep 17 00:00:00 2001 From: nico_dreylaq Date: Fri, 6 Dec 2024 11:03:20 +0100 Subject: [PATCH 06/24] style: fix linter --- .../3.aws/0. aws-first-setup.md | 9 ++--- .../2. aws-deployment-troubleshooting.md | 35 +++++++++++-------- 2 files changed, 26 insertions(+), 18 deletions(-) diff --git a/.docs/content/1.installation/3.aws/0. aws-first-setup.md b/.docs/content/1.installation/3.aws/0. aws-first-setup.md index e413e1b6e..5c7b70905 100644 --- a/.docs/content/1.installation/3.aws/0. aws-first-setup.md +++ b/.docs/content/1.installation/3.aws/0. aws-first-setup.md @@ -1,14 +1,14 @@ # AWS Setup 🚀 - This guide will help you install and configure the AWS CLI on your system and set up your AWS environment for use. -# 1. Installation & Configuration +## 1. Installation & Configuration -Follow the official AWS CLI install guide [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) +Follow the official AWS CLI install guide [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). For Linux-based systems, you can use the following commands: -```bash + +```bash curl "https://awscli.amazonaws.com/aws-cli-exe-linux-x86_64.zip" -o "awscliv2.zip" unzip awscliv2.zip sudo ./aws/install @@ -26,6 +26,7 @@ Once installed, you need to configure the AWS CLI with your credentials: ```bash aws configure ``` + During configuration: - Enter your **Access Key ID** and **Secret Access Key** (provided by AWS). - Choose a default region, e.g., *eu-west-3* (optional but recommended). diff --git a/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md b/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md index ea8aeec33..902abbeb7 100644 --- a/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md +++ b/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md @@ -5,12 +5,14 @@ This guide covers common issues you might encounter while deploying your applica ## 1. Error: Partition Not Found If you encounter a Partition Not Found error during deployment or runtime, ensure the following: + - Set the Replication Factor to 1 in the **parameters.tfvars**: -```tf - default = { - node_selector = { service = "workers" } +```hcl +default = { + node_selector = { service = "workers" } replicas = 1 # HERE +} ``` - Check Spelling in Configuration Files. 
@@ -18,14 +20,15 @@ If you encounter a Partition Not Found error during deployment or runtime, ensur For example in **parameters.tfvars**: -```tf +```hcl default = { node_selector = { service = "workers" } replicas = 1 # HERE ``` instead of -``` + +```hcl defautl = { node_selector = { service = "workers" } replicas = 1 # HERE @@ -44,14 +47,16 @@ After deploying your application or service, AWS typically generates a URL for t Ensure the endpoint is properly defined in your client application, as shown below: ```csharp - var endpoint = new Option("--endpoint", - description: "Endpoint pour la connexion au control plane ArmoniK.", - getDefaultValue: () => "http://localhost:5001"); +var endpoint = new Option("--endpoint", + description: "Endpoint pour la connexion au control plane ArmoniK.", + getDefaultValue: () => "http://localhost:5001"); + ``` ### Adapting the URL When Using Docker When running your application in a Docker container, remember to update the endpoint dynamically. For instance: + ```bash docker run --rm --name client --endpoint "http://:5001" --partition ``` @@ -71,6 +76,7 @@ You need to build the Docker image for the worker using the following command: ```bash docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" ``` + - **-f**: Specifies the path to the Dockerfile for your worker. - **-t subtasking**: Tags the image as subtasking, which will be used as the image name. - **"./csharp/native/"**: Specifies the context for the build (i.e., the root directory containing your source files). @@ -79,15 +85,16 @@ Voici la documentation mise à jour pour inclure la gestion des erreurs liées a 3. Error: Docker Image Not Found If you encounter an error related to the Docker image not being found or incorrectly configured, follow these steps to ensure the image is built and referenced correctly. -Step 1: Ensure the Worker Image is Built Correctly -You need to build the Docker image for the worker using the following command: +**Ensure the Worker Image is Built Correctly** +You need to build the Docker image for the worker using the following command: +``` docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" - - -f: Specifies the path to the Dockerfile for your worker. - -t subtasking: Tags the image as subtasking, which will be used as the image name. - "./csharp/native/": Specifies the context for the build (i.e., the root directory containing your source files). +``` +- **-f**: Specifies the path to the Dockerfile for your worker. +- **-t subtasking**: Tags the image as subtasking, which will be used as the image name. +- **"./csharp/native/"**: Specifies the context for the build (i.e., the root directory containing your source files). ### Verify the Image Tag Matches the Configuration From 556d65e7e8f1ee84dce90556a2ff19ab28f1f487 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Fri, 6 Dec 2024 11:24:49 +0100 Subject: [PATCH 07/24] correct lint md errors --- .../3.aws/0. aws-first-setup.md | 5 ++-- .../2. aws-deployment-troubleshooting.md | 23 +++++++++++-------- .../4.gcp/0. gcp-all-in-one-deployment.md | 14 ++++++----- .../4.gcp/1. gcp-troubleshooting.md | 21 +++++++++++------ 4 files changed, 39 insertions(+), 24 deletions(-) diff --git a/.docs/content/1.installation/3.aws/0. aws-first-setup.md b/.docs/content/1.installation/3.aws/0. aws-first-setup.md index 5c7b70905..ecf2116a1 100644 --- a/.docs/content/1.installation/3.aws/0. 
aws-first-setup.md +++ b/.docs/content/1.installation/3.aws/0. aws-first-setup.md @@ -2,7 +2,7 @@ This guide will help you install and configure the AWS CLI on your system and set up your AWS environment for use. -## 1. Installation & Configuration +## 1. Installation & Configuration Follow the official AWS CLI install guide [here](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html). @@ -15,6 +15,7 @@ sudo ./aws/install ``` Ensure the AWS CLI is installed correctly by checking its version: + ```bash aws --version ``` @@ -51,4 +52,4 @@ You should click on the URL provided in the output to open the SSO authorization - Once authorized, the CLI will confirm successful login. - Your authorization page should look similar to this: -![AWS CLI Access](./aws-cli-access.png) \ No newline at end of file +![AWS CLI Access](./aws-cli-access.png) diff --git a/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md b/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md index 902abbeb7..918e898b6 100644 --- a/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md +++ b/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md @@ -18,7 +18,7 @@ default = { - Check Spelling in Configuration Files. - Double-check for any typos in partition names or configurations. Even minor spelling mistakes can cause the error. -For example in **parameters.tfvars**: +For example in **parameters.tfvars**: ```hcl default = { @@ -26,7 +26,7 @@ default = { replicas = 1 # HERE ``` -instead of +instead of ```hcl defautl = { @@ -34,10 +34,12 @@ defautl = { replicas = 1 # HERE ``` + ## 2. Error: Incorrect URL or Missing Endpoint If your application fails to connect to a service, verify the following: -### Use the Correct URL Generated by AWS: + +### Use the Correct URL Generated by AWS After deploying your application or service, AWS typically generates a URL for the *control_plane* or other services. Make sure to: @@ -86,14 +88,16 @@ Voici la documentation mise à jour pour inclure la gestion des erreurs liées a If you encounter an error related to the Docker image not being found or incorrectly configured, follow these steps to ensure the image is built and referenced correctly. -**Ensure the Worker Image is Built Correctly** +Ensure the Worker Image is Built Correctly You need to build the Docker image for the worker using the following command: -``` + +```bash docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" ``` -- **-f**: Specifies the path to the Dockerfile for your worker. -- **-t subtasking**: Tags the image as subtasking, which will be used as the image name. + +- **-f**: Specifies the path to the Dockerfile for your worker. +- **-t subtasking**: Tags the image as subtasking, which will be used as the image name. - **"./csharp/native/"**: Specifies the context for the build (i.e., the root directory containing your source files). ### Verify the Image Tag Matches the Configuration @@ -101,10 +105,11 @@ docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtaski The image name (**subtasking** in this case) must match the value specified in your Terraform parameters file (**parameters.tfvars**). Check the following: 1. Open the **parameters.tfvars** file. -2. Look for the worker_image: +2. Look for the worker_image: + ```hcl worker = [ { image = "subtasking" ... -``` \ No newline at end of file +``` diff --git a/.docs/content/1.installation/4.gcp/0. 
gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md index e2a95f166..74a597e8a 100644 --- a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md +++ b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md @@ -1,15 +1,15 @@ -# GCP all in one deployment Guide +# GCP All-in-One Deployment Guide -This guide will help you deploy your Armonik project on Google Cloud Platform (GCP). +This guide will help you deploy your ArmoniK project on Google Cloud Platform (GCP). ## Step 1: Preparation -#### 1. Install Google CLI +### 1. Install Google CLI When receiving your credentials (email and password). Download and install the Google CLI by following the instructions on the [following link](https://cloud.google.com/sdk/docs/install?hl=fr#deb) -#### 2. Initial Setup +### 2. Initial Setup After installation, authenticate using the provided credentials (email and password) and select the project you want to deploy the resources to. Follow [following tutorial](https://cloud.google.com/docs/authentication/provide-credentials-adc?hl=fr#how-to) to authenticate into the CLI. @@ -37,8 +37,10 @@ Once authenticated, you should see a confirmation page with the message: *You are now authenticated with the gcloud CLI!* ## Step 2: Deployment + Navigate to the **gcp folder** (infrastructure/quick-deploy/gcp) and follow the steps below to deploy your resources. -### 1. Boostrap + +### 1. Bootstrap Generate a prefix key ready in the GCP environment to deploy your resources. @@ -58,7 +60,7 @@ make deploy PREFIX= ### 1. Destroy the deployment -#### Attention: After using the deployment, you have to make sure to destroy it to avoid any additional costs. #### +- Attention: After using the deployment, you have to make sure to destroy it to avoid any additional costs. To destroy the deployment, use the following command: diff --git a/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md b/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md index 91a4f6fde..5b9e75e0f 100644 --- a/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md +++ b/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md @@ -3,6 +3,7 @@ This guide will help you troubleshoot common issues when deploying your Armonik project on Google Cloud Platform (GCP). ## 1. Deployment Error with New Partition + If you encounter an error during deployment after adding a new partition with the value `replicas > 0`, follow these steps: 1. Set the replicas to `0`. 2. Redeploy the resources. @@ -12,9 +13,11 @@ If you encounter an error during deployment after adding a new partition with th --- ## 2. Pub/Sub Client-Side Issue + If you encounter an issue with the Pub/Sub client and see the error message: `FAILED_PRECONDITION: Requested entity was not found.`, follow the steps below to resolve the issue. ### Error Message + When running the application, you may encounter the following error: ```bash @@ -31,15 +34,17 @@ grpc._channel._InactiveRpcError: <_InactiveRpcError of RPC that terminated with: ``` ### Problem Description + This error occurs because Cloud Pub/Sub requires access to the specified CMEK to encrypt or decrypt messages. The service account used by Pub/Sub does not have the **Cloud KMS CryptoKey Encrypter/Decrypter** role for the specified CryptoKey. 
-### Key Components in the Error: +### Key Components in the Error + - **Service Account**: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com` -- **CryptoKey Resource**: - - Project: `[PROJECT_ID]` - - Location: `[LOCATION]` (e.g., `europe-west1`) - - Key Ring: `[KEY_RING]` - - CryptoKey: `[CRYPTO_KEY]` +- **CryptoKey Resource**: + - Project: `[PROJECT_ID]` + - Location: `[LOCATION]` (e.g., `europe-west1`) + - Key Ring: `[KEY_RING]` + - CryptoKey: `[CRYPTO_KEY]` - **Missing Role**: `roles/cloudkms.cryptoKeyEncrypterDecrypter` Without this role, Cloud Pub/Sub cannot perform encryption or decryption using the CMEK. @@ -49,12 +54,15 @@ Without this role, Cloud Pub/Sub cannot perform encryption or decryption using t To resolve the issue, grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the Pub/Sub service account for the specified CryptoKey. #### Step 1: Identify the Service Account + The service account mentioned in the error typically has the format: `service-[NUMERIC_ID]@gcp-sa-pubsub.iam.gserviceaccount.com`. This service account is automatically created by Google Cloud to manage Pub/Sub operations. #### Step 2: Grant the Necessary Role + You can grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the service account using the Google Cloud Console or the gcloud command-line tool. ##### Using the Google Cloud Console + 1. Open the [Google Cloud Console](https://console.cloud.google.com). 2. Navigate to **Key Management > CryptoKeys**. 3. Locate the CryptoKey resource: @@ -71,4 +79,3 @@ You can grant the **Cloud KMS CryptoKey Encrypter/Decrypter** role to the servic --- After granting the role, Cloud Pub/Sub should be able to access the specified CMEK for encryption and decryption operations. Retry the operation that triggered the error. Ensure that the deployment is successful and verify that the Pub/Sub client can now access the CMEK without any issues. - From 44d3cf081c73f0171187055e8fe4000a149a7239 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 9 Dec 2024 15:51:52 +0100 Subject: [PATCH 08/24] explain the difference between destroy and bootstrap-deploy --- .../1.installation/4.gcp/0. gcp-all-in-one-deployment.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md index 74a597e8a..c9b5db2f2 100644 --- a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md +++ b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md @@ -60,7 +60,7 @@ make deploy PREFIX= ### 1. Destroy the deployment -- Attention: After using the deployment, you have to make sure to destroy it to avoid any additional costs. +- Attention: After using the deployment, you have to make sure to destroy it to avoid any additional costs. The command will destroy all the resources created during the deployment and the project will be deleted. It will conserve the prefix key for future use and by that, the terraform state will be saved in the GCP environment. To destroy the deployment, use the following command: @@ -68,7 +68,9 @@ To destroy the deployment, use the following command: make destroy PREFIX= ``` -### 2. Destroy the GCP Prefix Key +### 2. Destroy the GCP Prefix Key - Optional + +It's an optional step if you are not willing to use the prefix key in the future. The command will delete the prefix key and the terraform state from the GCP environment. 
To clean up the GCP prefix key, use the following command: From da6286ca3f4a059a6aa64491a2c5eef8e30b687e Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Thu, 16 Jan 2025 12:06:37 +0100 Subject: [PATCH 09/24] correct file names and link to files for aws --- .../3.aws/{0. aws-first-setup.md => 0.aws-first-setup.md} | 0 ...n-one-deployment.md => 1.aws-all-in-one-deployment.md} | 0 ...bleshooting.md => 2.aws-deployment-troubleshooting.md} | 8 ++++---- ...loyment-using-k3s.md => 3.aws-deployment-using-k3s.md} | 0 ...using-kubeadm.md => 4.aws-deployment-using-kubeadm.md} | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename .docs/content/1.installation/3.aws/{0. aws-first-setup.md => 0.aws-first-setup.md} (100%) rename .docs/content/1.installation/3.aws/{1. aws-all-in-one-deployment.md => 1.aws-all-in-one-deployment.md} (100%) rename .docs/content/1.installation/3.aws/{2. aws-deployment-troubleshooting.md => 2.aws-deployment-troubleshooting.md} (85%) rename .docs/content/1.installation/3.aws/{3. aws-deployment-using-k3s.md => 3.aws-deployment-using-k3s.md} (100%) rename .docs/content/1.installation/3.aws/{4. aws-deployment-using-kubeadm.md => 4.aws-deployment-using-kubeadm.md} (100%) diff --git a/.docs/content/1.installation/3.aws/0. aws-first-setup.md b/.docs/content/1.installation/3.aws/0.aws-first-setup.md similarity index 100% rename from .docs/content/1.installation/3.aws/0. aws-first-setup.md rename to .docs/content/1.installation/3.aws/0.aws-first-setup.md diff --git a/.docs/content/1.installation/3.aws/1. aws-all-in-one-deployment.md b/.docs/content/1.installation/3.aws/1.aws-all-in-one-deployment.md similarity index 100% rename from .docs/content/1.installation/3.aws/1. aws-all-in-one-deployment.md rename to .docs/content/1.installation/3.aws/1.aws-all-in-one-deployment.md diff --git a/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md b/.docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md similarity index 85% rename from .docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md rename to .docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md index 918e898b6..52949199f 100644 --- a/.docs/content/1.installation/3.aws/2. aws-deployment-troubleshooting.md +++ b/.docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md @@ -6,7 +6,7 @@ This guide covers common issues you might encounter while deploying your applica If you encounter a Partition Not Found error during deployment or runtime, ensure the following: -- Set the Replication Factor to 1 in the **parameters.tfvars**: +- Set the Replication Factor to 1 in the [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars): ```hcl default = { @@ -18,7 +18,7 @@ default = { - Check Spelling in Configuration Files. - Double-check for any typos in partition names or configurations. Even minor spelling mistakes can cause the error. -For example in **parameters.tfvars**: +For example in [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars): ```hcl default = { @@ -102,9 +102,9 @@ docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtaski ### Verify the Image Tag Matches the Configuration -The image name (**subtasking** in this case) must match the value specified in your Terraform parameters file (**parameters.tfvars**). 
Check the following: +The image name (**subtasking** in this case) must match the value specified in your Terraform parameters file ([`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars)). Check the following: -1. Open the **parameters.tfvars** file. +1. Open the [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars) file. 2. Look for the worker_image: ```hcl diff --git a/.docs/content/1.installation/3.aws/3. aws-deployment-using-k3s.md b/.docs/content/1.installation/3.aws/3.aws-deployment-using-k3s.md similarity index 100% rename from .docs/content/1.installation/3.aws/3. aws-deployment-using-k3s.md rename to .docs/content/1.installation/3.aws/3.aws-deployment-using-k3s.md diff --git a/.docs/content/1.installation/3.aws/4. aws-deployment-using-kubeadm.md b/.docs/content/1.installation/3.aws/4.aws-deployment-using-kubeadm.md similarity index 100% rename from .docs/content/1.installation/3.aws/4. aws-deployment-using-kubeadm.md rename to .docs/content/1.installation/3.aws/4.aws-deployment-using-kubeadm.md From 8dcfea6451be32e8d85d835fdfc82c09ef67b896 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Thu, 16 Jan 2025 12:07:12 +0100 Subject: [PATCH 10/24] correct the prefix destruction in gcp --- .../1.installation/4.gcp/0. gcp-all-in-one-deployment.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md index c9b5db2f2..1d68f676c 100644 --- a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md +++ b/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md @@ -38,7 +38,7 @@ Once authenticated, you should see a confirmation page with the message: ## Step 2: Deployment -Navigate to the **gcp folder** (infrastructure/quick-deploy/gcp) and follow the steps below to deploy your resources. +Navigate to the [`**gcp folder**`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/gcp/) and follow the steps below to deploy your resources. ### 1. Bootstrap @@ -68,9 +68,9 @@ To destroy the deployment, use the following command: make destroy PREFIX= ``` -### 2. Destroy the GCP Prefix Key - Optional +### 2. Destroy the GCP Prefix Key - In case you don't need to deploy on GCP anymore -It's an optional step if you are not willing to use the prefix key in the future. The command will delete the prefix key and the terraform state from the GCP environment. +It's an optional step if you are not willing to use the prefix key in the future. However, please note that if you delete the prefix key and the terraform state from the GCP environment, you will not be able to deploy on GCP again without it. It is not recommended to perform this step unless you are certain that you will not need the prefix key again, such as when you are leaving the project. 
To clean up the GCP prefix key, use the following command: From 8227c3e92c44cd12d3266474076b618ee540c7d3 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Thu, 16 Jan 2025 12:35:52 +0100 Subject: [PATCH 11/24] add image aws cli using s3 object url --- .docs/content/1.installation/3.aws/0.aws-first-setup.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.docs/content/1.installation/3.aws/0.aws-first-setup.md b/.docs/content/1.installation/3.aws/0.aws-first-setup.md index ecf2116a1..7c4cc9ebe 100644 --- a/.docs/content/1.installation/3.aws/0.aws-first-setup.md +++ b/.docs/content/1.installation/3.aws/0.aws-first-setup.md @@ -52,4 +52,4 @@ You should click on the URL provided in the output to open the SSO authorization - Once authorized, the CLI will confirm successful login. - Your authorization page should look similar to this: -![AWS CLI Access](./aws-cli-access.png) +![AWS CLI Access](https://armonik-public-images.s3.eu-west-3.amazonaws.com/deployment-doc/aws-cli-access.png) From feb40712caa344ca1911d91f0062a7877fef9444 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Thu, 16 Jan 2025 16:05:15 +0100 Subject: [PATCH 12/24] remove aws troubleshooting, correct explanation part for deployment --- .../1.installation/3.aws/0.aws-first-setup.md | 19 ++- .../3.aws/1.aws-all-in-one-deployment.md | 4 +- .../3.aws/2.aws-deployment-troubleshooting.md | 115 ------------------ ...g-k3s.md => 2.aws-deployment-using-k3s.md} | 2 +- ...m.md => 3.aws-deployment-using-kubeadm.md} | 0 ...ment.md => 0.gcp-all-in-one-deployment.md} | 14 +-- ...leshooting.md => 1.gcp-troubleshooting.md} | 0 7 files changed, 24 insertions(+), 130 deletions(-) delete mode 100644 .docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md rename .docs/content/1.installation/3.aws/{3.aws-deployment-using-k3s.md => 2.aws-deployment-using-k3s.md} (95%) rename .docs/content/1.installation/3.aws/{4.aws-deployment-using-kubeadm.md => 3.aws-deployment-using-kubeadm.md} (100%) rename .docs/content/1.installation/4.gcp/{0. gcp-all-in-one-deployment.md => 0.gcp-all-in-one-deployment.md} (65%) rename .docs/content/1.installation/4.gcp/{1. gcp-troubleshooting.md => 1.gcp-troubleshooting.md} (100%) diff --git a/.docs/content/1.installation/3.aws/0.aws-first-setup.md b/.docs/content/1.installation/3.aws/0.aws-first-setup.md index 7c4cc9ebe..993062ed7 100644 --- a/.docs/content/1.installation/3.aws/0.aws-first-setup.md +++ b/.docs/content/1.installation/3.aws/0.aws-first-setup.md @@ -32,20 +32,29 @@ During configuration: - Enter your **Access Key ID** and **Secret Access Key** (provided by AWS). - Choose a default region, e.g., *eu-west-3* (optional but recommended). - Specify the default output format (e.g., *json*, *table*, or *text*). +- The advised output format is *json*. -To use AWS environnement, you can use the following: +To be able to interact with the AWS CLI, you need to set up your AWS Single Sign-On (SSO) credentials. This is realized by running the following command: ```bash aws sso login ``` +Each time you want to deploy ArmoniK on AWS, you need to run this command to authenticate. + You should click on the URL provided in the output to open the SSO authorization page in a browser. -## Step 1: Authorize in the Browser +## Step 1: AWS Authentication Setup + +To be able to interact with the AWS CLI, you need to set up your AWS Single Sign-On (SSO) credentials. 
This is realized by running the following command: + +```bash +aws sso login +``` + +Each time you want to deploy ArmoniK on AWS, you need to run this command to authenticate. -- Click on the first URL provided in the output. -- This will open the SSO authorization page in your default browser. -- After logging in, you’ll be prompted to grant permissions. +You should click on the URL provided in the output to open the SSO authorization page in a browser. It will open the SSO authorization page in your default browser. After logging in, you’ll be prompted to grant permissions. ## Step 2: Verify Login diff --git a/.docs/content/1.installation/3.aws/1.aws-all-in-one-deployment.md b/.docs/content/1.installation/3.aws/1.aws-all-in-one-deployment.md index cc388aaa3..9f2b821c5 100644 --- a/.docs/content/1.installation/3.aws/1.aws-all-in-one-deployment.md +++ b/.docs/content/1.installation/3.aws/1.aws-all-in-one-deployment.md @@ -46,10 +46,10 @@ export KUBECONFIG=/generated/kubeconfig ## Configuration -All parameters are contained in [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars) +All parameters are contained in [`parameters.tfvars`](../../../../infrastructure/quick-deploy/aws/parameters.tfvars) ::alert{type="info"} -By default, all the cloud services are set to launch. To see what kind of parameters are available, read [`variables.tf`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/variables.tf) +By default, all the cloud services are set to launch. To see what kind of parameters are available, read [`variables.tf`](../../../../infrastructure/quick-deploy/aws/variables.tf) :: You can specify a custom parameter file. When executing the `make` command, you may use the `PARAMETERS_FILE` option to set the path to your file. diff --git a/.docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md b/.docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md deleted file mode 100644 index 52949199f..000000000 --- a/.docs/content/1.installation/3.aws/2.aws-deployment-troubleshooting.md +++ /dev/null @@ -1,115 +0,0 @@ -# AWS Deployment Troubleshooting Guide 🛠️ - -This guide covers common issues you might encounter while deploying your application on AWS and provides solutions to address them. - -## 1. Error: Partition Not Found - -If you encounter a Partition Not Found error during deployment or runtime, ensure the following: - -- Set the Replication Factor to 1 in the [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars): - -```hcl -default = { - node_selector = { service = "workers" } - replicas = 1 # HERE -} -``` - -- Check Spelling in Configuration Files. -- Double-check for any typos in partition names or configurations. Even minor spelling mistakes can cause the error. - -For example in [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars): - -```hcl -default = { - node_selector = { service = "workers" } - replicas = 1 # HERE -``` - -instead of - -```hcl -defautl = { - node_selector = { service = "workers" } - replicas = 1 # HERE - -``` - -## 2. Error: Incorrect URL or Missing Endpoint - -If your application fails to connect to a service, verify the following: - -### Use the Correct URL Generated by AWS - -After deploying your application or service, AWS typically generates a URL for the *control_plane* or other services. 
Make sure to: - -- Copy the URL provided by AWS during deployment. -- Update the client configuration with the correct endpoint before building your project. - -Ensure the endpoint is properly defined in your client application, as shown below: - -```csharp -var endpoint = new Option("--endpoint", - description: "Endpoint pour la connexion au control plane ArmoniK.", - getDefaultValue: () => "http://localhost:5001"); - -``` - -### Adapting the URL When Using Docker - -When running your application in a Docker container, remember to update the endpoint dynamically. For instance: - -```bash -docker run --rm --name client --endpoint "http://:5001" --partition -``` - -- Replace with the actual IP or hostname of the deployed service. -- Replace with the appropriate partition name you’re using. -- Always double-check that the endpoint and partition names are spelled correctly to avoid runtime errors. - -## 3. Missing Images - -If you encounter an error related to the Docker image not being found or incorrectly configured, follow these steps to ensure the image is built and referenced correctly. - -### Ensure the Worker Image is Built Correctly - -You need to build the Docker image for the worker using the following command: - -```bash -docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" -``` - -- **-f**: Specifies the path to the Dockerfile for your worker. -- **-t subtasking**: Tags the image as subtasking, which will be used as the image name. -- **"./csharp/native/"**: Specifies the context for the build (i.e., the root directory containing your source files). - -Voici la documentation mise à jour pour inclure la gestion des erreurs liées aux images Docker : -3. Error: Docker Image Not Found - -If you encounter an error related to the Docker image not being found or incorrectly configured, follow these steps to ensure the image is built and referenced correctly. - -Ensure the Worker Image is Built Correctly - -You need to build the Docker image for the worker using the following command: - -```bash -docker build -f "./csharp/native/LinearSubTasking/Worker/Dockerfile" -t subtasking "./csharp/native/" -``` - -- **-f**: Specifies the path to the Dockerfile for your worker. -- **-t subtasking**: Tags the image as subtasking, which will be used as the image name. -- **"./csharp/native/"**: Specifies the context for the build (i.e., the root directory containing your source files). - -### Verify the Image Tag Matches the Configuration - -The image name (**subtasking** in this case) must match the value specified in your Terraform parameters file ([`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars)). Check the following: - -1. Open the [`parameters.tfvars`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/aws/parameters.tfvars) file. -2. Look for the worker_image: - -```hcl - worker = [ - { - image = "subtasking" - ... 
-``` diff --git a/.docs/content/1.installation/3.aws/3.aws-deployment-using-k3s.md b/.docs/content/1.installation/3.aws/2.aws-deployment-using-k3s.md similarity index 95% rename from .docs/content/1.installation/3.aws/3.aws-deployment-using-k3s.md rename to .docs/content/1.installation/3.aws/2.aws-deployment-using-k3s.md index 6bc3ad597..37ed8aa30 100644 --- a/.docs/content/1.installation/3.aws/3.aws-deployment-using-k3s.md +++ b/.docs/content/1.installation/3.aws/2.aws-deployment-using-k3s.md @@ -45,7 +45,7 @@ We will create a cluster on AWS composed of four ec2 instances: * a master node * three worker nodes -In [parameters.tfvars](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/docs/kubernetes/cluster/k3s-cluster/parameters.tfvars): +In [parameters.tfvars](../../../../infrastructure/docs/kubernetes/cluster/k3s-cluster/parameters.tfvars): * set the value of the parameter `ssh_key` with the content of the public SSH key `~/.ssh/cluster-key.pub` and the path to the private SSH key, for example: diff --git a/.docs/content/1.installation/3.aws/4.aws-deployment-using-kubeadm.md b/.docs/content/1.installation/3.aws/3.aws-deployment-using-kubeadm.md similarity index 100% rename from .docs/content/1.installation/3.aws/4.aws-deployment-using-kubeadm.md rename to .docs/content/1.installation/3.aws/3.aws-deployment-using-kubeadm.md diff --git a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md similarity index 65% rename from .docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md rename to .docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md index 1d68f676c..78998753f 100644 --- a/.docs/content/1.installation/4.gcp/0. gcp-all-in-one-deployment.md +++ b/.docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md @@ -7,12 +7,12 @@ This guide will help you deploy your ArmoniK project on Google Cloud Platform (G ### 1. Install Google CLI When receiving your credentials (email and password). -Download and install the Google CLI by following the instructions on the [following link](https://cloud.google.com/sdk/docs/install?hl=fr#deb) +Download and install the Google CLI by following the instructions on the [following link](https://cloud.google.com/sdk/docs/install) ### 2. Initial Setup -After installation, authenticate using the provided credentials (email and password) and select the project you want to deploy the resources to. -Follow [following tutorial](https://cloud.google.com/docs/authentication/provide-credentials-adc?hl=fr#how-to) to authenticate into the CLI. +After installation, you can authenticate using your credentials (email and password) and select the project you want to deploy the resources on. +Follow [following tutorial](https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to) to authenticate using the CLI. You can authenticate using the following command: @@ -38,7 +38,7 @@ Once authenticated, you should see a confirmation page with the message: ## Step 2: Deployment -Navigate to the [`**gcp folder**`](https://github.com/aneoconsulting/ArmoniK/blob/main/infrastructure/quick-deploy/gcp/) and follow the steps below to deploy your resources. +Navigate to the [`gcp folder`](../../../../infrastructure/quick-deploy/gcp/) and follow the steps below to deploy your resources. ### 1. Bootstrap @@ -60,7 +60,7 @@ make deploy PREFIX= ### 1. 
Destroy the deployment
 
-- Attention: After using the deployment, you have to make sure to destroy it to avoid any additional costs. The command will destroy all the resources created during the deployment and the project will be deleted. It will conserve the prefix key for future use and by that, the terraform state will be saved in the GCP environment.
+- Warning: once you are done with a deployment, make sure to destroy it to avoid any additional costs. The next command destroys all the resources created during the deployment and deletes the project. Only the prefix key is kept for future use, and the Terraform state remains saved in the GCP environment.
 
 To destroy the deployment, use the following command:
 
@@ -70,7 +70,7 @@ make destroy PREFIX=<prefix>
 
 ### 2. Destroy the GCP Prefix Key - In case you don't need to deploy on GCP anymore
 
-It's an optional step if you are not willing to use the prefix key in the future. However, please note that if you delete the prefix key and the terraform state from the GCP environment, you will not be able to deploy on GCP again without it. It is not recommended to perform this step unless you are certain that you will not need the prefix key again, such as when you are leaving the project.
+This step is optional: only perform it if you do not intend to use the prefix key again. Note that once the prefix key and the Terraform state are deleted from the GCP environment, you will not be able to reproduce this exact deployment on GCP. It is therefore not recommended unless you are certain you will no longer need the prefix key, for example when you are leaving the project.
 
 To clean up the GCP prefix key, use the following command:
 
@@ -80,4 +80,4 @@ make bootstrap-destroy PREFIX=<prefix>
 
 ## Step 4: Add a Sample Partition
 
-Just like with an **AWS** or **localhost** deployment, you can add a sample partition to test deployment on the **GCP** environment. You need to build the images and redeploy the services after adding the sample partition in the parameters.tfvars file.
+Just as with an **AWS** or **localhost** deployment, you can add a sample partition to test a deployment on the **GCP** environment. After adding the sample partition to the parameters.tfvars file, you need to rebuild the images and redeploy the services.
diff --git a/.docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md b/.docs/content/1.installation/4.gcp/1.gcp-troubleshooting.md
similarity index 100%
rename from .docs/content/1.installation/4.gcp/1. gcp-troubleshooting.md
rename to .docs/content/1.installation/4.gcp/1.gcp-troubleshooting.md

From de9e1cea8286d5805a64f1d42acfaf0ed4d67a8b Mon Sep 17 00:00:00 2001
From: Mohamed Khairallah Gharbi
Date: Thu, 16 Jan 2025 16:05:15 +0100
Subject: [PATCH 13/24] remove aws troubleshooting, correct explanation part
 for deployment

---
 .../content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md b/.docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md
index 78998753f..66f720d71 100644
--- a/.docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md
+++ b/.docs/content/1.installation/4.gcp/0.gcp-all-in-one-deployment.md
@@ -38,7 +38,7 @@ Once authenticated, you should see a confirmation page with the message:
 
 ## Step 2: Deployment
 
-Navigate to the [`gcp folder`](../../../../infrastructure/quick-deploy/gcp/) and follow the steps below to deploy your resources.
+Navigate to `infrastructure/quick-deploy/gcp` folder and follow the steps below to deploy your resources. ### 1. Bootstrap From 8a3eb240e4bf79caf52cfcedebcf68269b1f0d81 Mon Sep 17 00:00:00 2001 From: Trystan Schneider Date: Thu, 31 Oct 2024 18:58:53 +0100 Subject: [PATCH 14/24] feat: Add bench workflow --- .github/workflows/bench-benchmark.yml | 144 +++++++ .../parameters.tfvars | 408 ++++++++++++++++++ .../versions.tfvars.json | 92 ++++ benchmarking/aws/README.md | 16 + benchmarking/aws/parameters.tfvars | 408 ++++++++++++++++++ tools/ci/bench-job-template.yml | 41 ++ tools/ci/python/program.py | 326 ++++++++++++++ tools/ci/python/requirements.txt | 2 + 8 files changed, 1437 insertions(+) create mode 100644 .github/workflows/bench-benchmark.yml create mode 100644 benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars create mode 100644 benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json create mode 100644 benchmarking/aws/README.md create mode 100644 benchmarking/aws/parameters.tfvars create mode 100644 tools/ci/bench-job-template.yml create mode 100644 tools/ci/python/program.py create mode 100644 tools/ci/python/requirements.txt diff --git a/.github/workflows/bench-benchmark.yml b/.github/workflows/bench-benchmark.yml new file mode 100644 index 000000000..db3c7b7fe --- /dev/null +++ b/.github/workflows/bench-benchmark.yml @@ -0,0 +1,144 @@ +name: "Benchmark with Bench client" + +on: + push: + branches: + - main + release: + types: [published, prereleased] + workflow_dispatch: + inputs: + destroy-on-session-end: + description: "Whether to destroy infrastructure right after the bench job has ended" + type: boolean + required: false + default: true + prefix: + description: "Prefix for the infrastructure. The bucket associated with this prefix must be created before." 
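+        # Hypothetical example of a manual run supplying these inputs:
+        #   gh workflow run bench-benchmark.yml -f prefix=ci-bench -f destroy-on-session-end=true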
+ required: false + default: "ci-bench" + +jobs: + define-matrix: + name: Define matrix + runs-on: ubuntu-latest + outputs: + matrix: ${{ steps.types.outputs.matrix }} + steps: + - id: types + name: Define matrix + env: + TRIGGER: ${{ github.event_name }} + REF_NAME: ${{ github.ref_name }} + run: | + set -ex + if [ "$TRIGGER" == 'push' ]; then + echo '{"include":[{"type": "localhost", "ntasks":3000, "polling-limit": 300}]}' > matrix.json + echo "FILE_PREFIX=$REF_NAME" >> $GITHUB_ENV + elif [ "$TRIGGER" == 'release' ]; then + echo '{"include":[{"type": "localhost", "ntasks":3000, "polling-limit": 300}, {"type": "aws", "ntasks":1200000, "polling-limit": 1000, "parameters-file-path": "benchmarking/aws/parameters.tfvars"}]}' > matrix.json + echo "FILE_PREFIX=release/$REF_NAME" >> $GITHUB_ENV + elif [ "$TRIGGER" == 'workflow_dispatch' ]; then + echo '{"include":[{"type": "aws", "ntasks":1200000, "polling-limit": 1000, "parameters-file-path": "benchmarking/aws/parameters.tfvars"}]}' > matrix.json + echo "FILE_PREFIX=manual/$REF_NAME" >> $GITHUB_ENV + fi + echo "matrix=$(cat matrix.json)" >> "$GITHUB_OUTPUT" + + benchmark: + name: ${{ matrix.type }} + runs-on: ubuntu-latest + needs: define-matrix + strategy: + fail-fast: false + matrix: ${{ fromJson(needs.define-matrix.outputs.matrix) }} + env: + prefix: ${{ inputs.prefix || 'ci-bench' }} + parameters-file-path: ${{ matrix.parameters-file-path }} + outputs: + terraform-output: ${{ steps.deploy.outputs.terraform-output }} + armonik-endpoint: ${{ steps.get-armonik-endpoint.outputs.endpoint }} + steps: + - name: Checkout + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + + - name: Install Dependencies + uses: aneoconsulting/ArmoniK.Action.Deploy/dependencies@main + with: + terraform: true + k3s: true + docker: true + aws: true + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: eu-west-3 + + - name: Get Core version + run: | + set -ex + echo "core-version=$(cat versions.tfvars.json | jq -r '.armonik_versions.core')" >> $GITHUB_ENV + + - id: deploy + name: "Deploy ArmoniK" + uses: aneoconsulting/ArmoniK.Action.Deploy/deploy@main + with: + type: ${{ matrix.type }} + prefix: ${{ env.prefix }} + core-version: ${{ env.core-version }} + parameters-file-path: ${{ env.parameters-file-path }} + + - id: get-armonik-endpoint + name: "Get ArmoniK's control plane endpoint" + env: + TYPE: ${{ matrix.type }} + run: | + set -ex + grpc_endpoint=$(cat "infrastructure/quick-deploy/$TYPE/generated/armonik-output.json" | jq -r '.armonik.control_plane_url' | sed -r 's/(http:\/\/)([^:]*)(:.*)/\2/') + echo "grpc-endpoint=$grpc_endpoint" >> "$GITHUB_OUTPUT" + sleep 60 + + - id: bench + name: Run Bench + uses: aneoconsulting/ArmoniK.Action.Deploy/bench@main + with: + type: ${{ matrix.type }} + armonik-core-version: ${{ env.core-version }} + ntasks: ${{ matrix.ntasks }} + session-name: bench + grpc-client-endpoint: ${{ steps.get-armonik-endpoint.outputs.grpc-endpoint }} + timeout: 1200 + + - id: get-bench-stats + name: Get Bench Stats + uses: aneoconsulting/ArmoniK.Action.Deploy/get-throughput@main + with: + grpc-client-endpoint: ${{ steps.get-armonik-endpoint.outputs.grpc-endpoint }} + session-name: ${{ steps.bench.outputs.session-name }} + poll-duration-limit: ${{ matrix.polling-limit }} + + - name: Upload benchmark results to artifact registry + uses: actions/upload-artifact@v4 + with: + name: benchclient_benchmark_${{ github.event_name }}_${{ matrix.type }}_${{ 
github.run_id }} + path: ${{ steps.get-bench-stats.outputs.bench-file-path }} + + - name: Upload benchmark results to s3 + env: + EVENT_NAME: ${{ github.event_name }} + BENCH_RESULTS_PATH: ${{ steps.get-bench-stats.outputs.bench-file-path }} + TYPE: ${{ matrix.type }} + GHRUNID: ${{ github.run_id }} + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_EC2_METADATA_DISABLED: true + run: | + DATE=$(date +"%Y-%m-%d") + aws s3 cp "$BENCH_RESULTS_PATH" "s3://armonik-bench-storage/${FILE_PREFIX}/${GHRUNID}_${DATE}/benchclient_benchmark_${EVENT_NAME}_${TYPE}.json" + + - if: ${{ (github.event_name == 'workflow_dispatch' && inputs.destroy-on-session-end) || (github.event_name != 'workflow_dispatch' && always()) }} + id: destroy + name: Destroy deployment + uses: aneoconsulting/ArmoniK.Action.Deploy/destroy@main + with: + type: ${{ matrix.type }} + prefix: ${{ env.prefix }} + parameters-file-path: ${{ env.parameters-file-path }} \ No newline at end of file diff --git a/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars new file mode 100644 index 000000000..143a06817 --- /dev/null +++ b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/parameters.tfvars @@ -0,0 +1,408 @@ +# Tags +tags = { + "name" = "bench" + "origin" = "terraform" + "csp" = "aws" + "Terraform" = "true" +} + +vpc = { + enable_private_subnet = false +} + +# AWS EKS +eks = { + cluster_version = "1.25" + node_selector = { service = "monitoring" } + cluster_endpoint_public_access = true + map_roles = [] + map_users = [] +} + +eks_managed_node_groups = { + workers = { + name = "workers" + launch_template_description = "Node group for ArmoniK Compute-plane pods" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" # "SPOT" + min_size = 8 + desired_size = 8 + max_size = 8 + labels = { + service = "workers" + "node.kubernetes.io/lifecycle" = "ondemand" # "spot" + } + taints = { + dedicated = { + key = "service" + value = "workers" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + metrics = { + name = "metrics" + launch_template_description = "Node group for metrics: Metrics exporter and Prometheus" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "metrics" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "metrics" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for ArmoniK control-plane: control-plane and Ingress + control_plane = { + name = "control-plane" + launch_template_description = "Node group for ArmoniK Control-plane and Ingress" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 2 + labels = { + service = "control-plane" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "control-plane" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + 
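+    # The AmazonSSMManagedInstanceCore policy below gives the nodes Systems Manager access,
+    # which is handy for opening debug sessions on the instances without SSH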
iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for monitoring: metrics server, keda, seq, grafana, cluster-autoscaler, coreDNS, termination handler + monitoring = { + name = "monitoring" + launch_template_description = "Node group for monitoring" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "monitoring" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "monitoring" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for data-plane + # state_database, inner_storage, task_queue + state_database = { + name = "mongodb" + launch_template_description = "Node group for MongoDB" + ami_type = "AL2_x86_64" + instance_types = ["c5a.8xlarge"] + use_custom_launch_template = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 75 + volume_type = "gp3" + iops = 5000 + throughput = 1000 + encrypted = null + kms_key_id = null + delete_on_termination = true + } + } + } + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 1 + labels = { + service = "state-database" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "state-database" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +self_managed_node_groups = { + others = { + name = "others" + launch_template_description = "Node group for others" + instance_type = "c5.large" + min_size = 0 + desired_size = 0 + max_size = 5 + force_delete = true + force_delete_warm_pool = true + instance_market_options = { + market_type = "spot" + } + bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'" + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + others_mixed = { + name = "others-mixed" + launch_template_description = "Mixed On demand and SPOT instances for other pods" + min_size = 1 + desired_size = 1 + max_size = 5 + use_mixed_instances_policy = true + mixed_instances_policy = { + on_demand_allocation_strategy = "lowest-price" + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = 20 # 20% On-Demand Instances, 80% Spot Instances + spot_allocation_strategy = "price-capacity-optimized" + spot_instance_pools = null + spot_max_price = null + } + override = [ + { + instance_type = "c5.4xlarge" + weighted_capacity = "1" + }, + { + instance_type = "c5.2xlarge" + weighted_capacity = "2" + }, + ] + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +# List of fargate profiles +fargate_profiles = {} + +metrics_server = { + node_selector = { service = "monitoring" } +} + +keda = { + node_selector = { service = "monitoring" } +} + +# Object storage +# Uncomment either the `elasticache` or the `s3_os` parameter +elasticache = { + engine = "redis" + engine_version = "6.x" + node_type = 
"cache.r4.large" + num_cache_clusters = 1 +} + +#s3_os = {} + +mq = { + engine_type = "ActiveMQ" + engine_version = "5.17.6" + host_instance_type = "mq.m5.xlarge" +} + +mongodb = { + node_selector = { service = "state-database" } + replicas = 2 + mongodb_resources = { + limits = { + "cpu" = "30" + "memory" = "60Gi" + "ephemeral-storage" = "20Gi" + } + requests = { + "cpu" = "14" + "memory" = "29Gi" + "ephemeral-storage" = "4Gi" + } + } +} + +seq = { + node_selector = { service = "monitoring" } +} + +grafana = { + node_selector = { service = "monitoring" } +} + +node_exporter = { + node_selector = {} +} + +windows_exporter = { + node_selector = { + "plateform" = "windows" + } +} + +prometheus = { + node_selector = { service = "metrics" } +} + +metrics_exporter = { + node_selector = { service = "metrics" } +} + + +fluent_bit = { + is_daemonset = true + node_selector = {} +} + +logging_level = "Information" + +control_plane = { + limits = { + cpu = "2000m" + memory = "4096Mi" + } + requests = { + cpu = "1000m" + memory = "2048Mi" + } + default_partition = "default" + replicas = 12 + node_selector = { service = "control-plane" } +} + +admin_gui = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "100m" + memory = "128Mi" + } + node_selector = { service = "monitoring" } +} + +compute_plane = { + bench = { + node_selector = { service = "workers" } + replicas = 120 + polling_agent = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "256Mi" + } + } + worker = [ + { + image = "dockerhubaneo/armonik_core_bench_test_worker" + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "512Mi" + } + } + ] + }, +} + +ingress = { + tls = false + mtls = false + generate_client_cert = false + node_selector = { service = "control-plane" } +} + +# Job to insert partitions in the database +job_partitions_in_database = { + node_selector = { service = "control-plane" } +} + +# Authentication behavior +authentication = { + node_selector = { service = "control-plane" } +} + +configurations = { + core = { + env = { + Amqp__AllowHostMismatch = false + Amqp__MaxPriority = "10" + Amqp__MaxRetries = "5" + Amqp__QueueStorage__LockRefreshPeriodicity = "00:00:45" + Amqp__QueueStorage__PollPeriodicity = "00:00:10" + Amqp__QueueStorage__LockRefreshExtension = "00:02:00" + MongoDB__TableStorage__PollingDelayMin = "00:00:01" + MongoDB__TableStorage__PollingDelayMax = "00:00:10" + MongoDB__TableStorage__PollingDelay = "00:00:01" + MongoDB__DataRetention = "1.00:00:00" # 1 day retention + MongoDB__AllowInsecureTls = true + Redis__Timeout = 3000 + Redis__SslHost = "" + Redis__TtlTimeSpan = "1.00:00:00" # 1 day retention + Submitter__DeletePayload = true + } + } + control = { + env = { + Submitter__MaxErrorAllowed = 50 + } + } + jobs = { env = { MongoDB__DataRetention = "1.00:00:00" } } +} + +environment_description = { + name = "aws-dev" + version = "0.0.0" + description = "AWS environment" + color = "#80ff80" +} + +upload_images = false diff --git a/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json new file mode 100644 index 000000000..90cb085d7 --- /dev/null +++ b/benchmarking/aws/2-21-0_8e69cdd2f5ca8dd99652ce651bc4ff9eaa8776eb/versions.tfvars.json @@ -0,0 +1,92 @@ +{ + "armonik_versions": { + "armonik": "2.21.0", + "infra": "0.7.0", + "infra_plugins": "0.1.1", + "core": "0.29.1", + "api": "3.21.0", 
+ "gui": "0.13.3", + "extcsharp": "0.12.11", + "samples": "2.21.0" + }, + "armonik_images": { + "armonik": [ + ], + "infra": [ + "https://github.com/aneoconsulting/ArmoniK.Infra.git" + ], + "infra_plugins": [ + "dockerhubaneo/armonik_pdc_update" + ], + "core": [ + "dockerhubaneo/armonik_pollingagent", + "dockerhubaneo/armonik_control_metrics", + "dockerhubaneo/armonik_control_partition_metrics", + "dockerhubaneo/armonik_control", + "dockerhubaneo/armonik_core_stream_test_worker", + "dockerhubaneo/armonik_core_stream_test_client", + "dockerhubaneo/armonik_core_htcmock_test_worker", + "dockerhubaneo/armonik_core_htcmock_test_client", + "dockerhubaneo/armonik_core_bench_test_worker", + "dockerhubaneo/armonik_core_bench_test_client" + ], + "api": [ + ], + "gui": [ + "dockerhubaneo/armonik_admin_app", + "dockerhubaneo/armonik_admin_api" + ], + "extcsharp": [ + "dockerhubaneo/armonik_worker_dll" + ], + "samples": [ + "dockerhubaneo/armonik_demo_helloworld_worker", + "dockerhubaneo/armonik_demo_subtasking_worker", + "dockerhubaneo/armonik_demo_linearsubtasking_worker", + "dockerhubaneo/armonik_demo_multipleresults_worker" + ] + }, + "image_tags": { + "registry.k8s.io/autoscaling/cluster-autoscaler": "v1.31.0", + "registry.k8s.io/metrics-server/metrics-server": "v0.7.2", + "ghcr.io/kedacore/keda": "2.16.0", + "ghcr.io/kedacore/keda-metrics-apiserver": "2.16.0", + "public.ecr.aws/aws-ec2/aws-node-termination-handler": "v1.22.1", + "public.ecr.aws/efs-csi-driver/amazon/aws-efs-csi-driver": "v2.1.0", + "public.ecr.aws/eks-distro/kubernetes-csi/livenessprobe": "v2.14.0-eks-1-31-7", + "public.ecr.aws/eks-distro/kubernetes-csi/node-driver-registrar": "v2.12.0-eks-1-31-7", + "public.ecr.aws/eks-distro/kubernetes-csi/external-provisioner": "v5.1.0-eks-1-31-7", + "symptoma/activemq": "5.18.4", + "mongo": "8.0.3", + "bitnami/mongodb": "8.0.3-debian-12-r0", + "bitnami/mongodb-sharded": "8.0.3-debian-12-r0", + "rtsp/mongosh": "2.3.3", + "redis": "7.4.1-alpine", + "minio/minio": "RELEASE.2024-11-07T00-52-20Z", + "datalust/seq": "2024.3", + "datalust/seqcli": "2024.3", + "grafana/grafana": "11.3.0", + "prom/node-exporter": "v1.8.2", + "prom/prometheus": "v3.0.0", + "fluent/fluent-bit": "3.1.10", + "nginxinc/nginx-unprivileged": "1.27.2-alpine-slim", + "registry.k8s.io/sig-storage/nfs-subdir-external-provisioner": "v4.0.18", + "bitnami/rabbitmq": "4.0.3", + "ghcr.io/chaos-mesh/chaos-mesh": "v2.7.0", + "ghcr.io/chaos-mesh/chaos-daemon": "v2.7.0", + "ghcr.io/chaos-mesh/chaos-dashboard": "v2.7.0", + "ghcr.io/prometheus-community/windows-exporter": "0.29.2-ltsc2022", + "mcr.microsoft.com/windows/nanoserver": "ltsc2022" + }, + "helm_charts" : { + "keda" : { "repository" : "https://kedacore.github.io/charts" , "version" : "2.16.0"}, + "metrics_server" : { "repository" : "https://kubernetes-sigs.github.io/metrics-server/" , "version" :"3.12.2"}, + "cluster_autoscaler" : {"repository" : "https://kubernetes.github.io/autoscaler" , "version" : "9.43.2"}, + "termination_handler" : {"repository" : "https://aws.github.io/eks-charts" , "version" : "0.21.0" }, + "efs_csi_driver" : { "repository" :"https://kubernetes-sigs.github.io/aws-efs-csi-driver/" , "version": "3.0.8" }, + "rabbitmq" : { "repository" : "https://charts.bitnami.com/bitnami" , "version" : "13.0.2"}, + "chaos_mesh" : { "repository" : "https://charts.chaos-mesh.org" , "version" : "2.6.3"}, + "mongodb" : { "repository": "oci://registry-1.docker.io/bitnamicharts", "version" : "16.2.2"}, + "mongodb-sharded" : { "repository": 
"oci://registry-1.docker.io/bitnamicharts", "version" : "9.0.3" } + } +} diff --git a/benchmarking/aws/README.md b/benchmarking/aws/README.md new file mode 100644 index 000000000..e371bfe3b --- /dev/null +++ b/benchmarking/aws/README.md @@ -0,0 +1,16 @@ +# ArmoniK benchmarking on AWS +This folder contains a Terraform parameters file describing the infrastructure that has been chosen as *ArmoniK reference infrastructure on AWS* that is used for regular benchmarks at each ArmoniK release. This file might evolve alongside ArmoniK. + +Thus for reproducibility concerns, this folder also contains subfolders that save dumps of the exact infrastructure configuration used for each version of ArmoniK benchmarked as well as the versions of ArmoniK's components. + +The subfolders are actually named as following, given a version *X.X.X* and a commit SHA *123abc* : **X-X-X_123abc** + +### How to reproduce an ArmoniK infrastructure for benchmarking ? + +1. Choose an ArmoniK version and save the associated subfolder name. + +2. Make sure you are at the root of the ArmoniK folder. + +3. Go to AWS quick-deploy : `cd infrastructure/quick-deploy/aws` + +4. Deploy ArmoniK with the appropriate Terraform parameters files located in the subfolder you selected : `make deploy PARAMETERS_FILE=../../../../benchmarking/aws/{ARMONIK_VERSION}_{COMMIT_SHA}/parameters.tfvars VERSIONS_FILE=../../../../benchmarking/aws/{ARMONIK_VERSION}_{COMMIT_SHA}/versions.tfvars.json` \ No newline at end of file diff --git a/benchmarking/aws/parameters.tfvars b/benchmarking/aws/parameters.tfvars new file mode 100644 index 000000000..143a06817 --- /dev/null +++ b/benchmarking/aws/parameters.tfvars @@ -0,0 +1,408 @@ +# Tags +tags = { + "name" = "bench" + "origin" = "terraform" + "csp" = "aws" + "Terraform" = "true" +} + +vpc = { + enable_private_subnet = false +} + +# AWS EKS +eks = { + cluster_version = "1.25" + node_selector = { service = "monitoring" } + cluster_endpoint_public_access = true + map_roles = [] + map_users = [] +} + +eks_managed_node_groups = { + workers = { + name = "workers" + launch_template_description = "Node group for ArmoniK Compute-plane pods" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = "ON_DEMAND" # "SPOT" + min_size = 8 + desired_size = 8 + max_size = 8 + labels = { + service = "workers" + "node.kubernetes.io/lifecycle" = "ondemand" # "spot" + } + taints = { + dedicated = { + key = "service" + value = "workers" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + + metrics = { + name = "metrics" + launch_template_description = "Node group for metrics: Metrics exporter and Prometheus" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "metrics" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "metrics" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for ArmoniK control-plane: control-plane and Ingress + control_plane = { + name = "control-plane" + launch_template_description = "Node group for ArmoniK Control-plane and Ingress" + ami_type = "AL2_x86_64" + instance_types = ["c5a.4xlarge"] + capacity_type = 
"ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 2 + labels = { + service = "control-plane" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "control-plane" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for monitoring: metrics server, keda, seq, grafana, cluster-autoscaler, coreDNS, termination handler + monitoring = { + name = "monitoring" + launch_template_description = "Node group for monitoring" + ami_type = "AL2_x86_64" + instance_types = ["c5.large"] + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 5 + labels = { + service = "monitoring" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "monitoring" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + # Node group for data-plane + # state_database, inner_storage, task_queue + state_database = { + name = "mongodb" + launch_template_description = "Node group for MongoDB" + ami_type = "AL2_x86_64" + instance_types = ["c5a.8xlarge"] + use_custom_launch_template = true + block_device_mappings = { + xvda = { + device_name = "/dev/xvda" + ebs = { + volume_size = 75 + volume_type = "gp3" + iops = 5000 + throughput = 1000 + encrypted = null + kms_key_id = null + delete_on_termination = true + } + } + } + capacity_type = "ON_DEMAND" + min_size = 1 + desired_size = 1 + max_size = 1 + labels = { + service = "state-database" + "node.kubernetes.io/lifecycle" = "ondemand" + } + taints = { + dedicated = { + key = "service" + value = "state-database" + effect = "NO_SCHEDULE" + } + } + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +self_managed_node_groups = { + others = { + name = "others" + launch_template_description = "Node group for others" + instance_type = "c5.large" + min_size = 0 + desired_size = 0 + max_size = 5 + force_delete = true + force_delete_warm_pool = true + instance_market_options = { + market_type = "spot" + } + bootstrap_extra_args = "--kubelet-extra-args '--node-labels=node.kubernetes.io/lifecycle=spot'" + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } + others_mixed = { + name = "others-mixed" + launch_template_description = "Mixed On demand and SPOT instances for other pods" + min_size = 1 + desired_size = 1 + max_size = 5 + use_mixed_instances_policy = true + mixed_instances_policy = { + on_demand_allocation_strategy = "lowest-price" + on_demand_base_capacity = 0 + on_demand_percentage_above_base_capacity = 20 # 20% On-Demand Instances, 80% Spot Instances + spot_allocation_strategy = "price-capacity-optimized" + spot_instance_pools = null + spot_max_price = null + } + override = [ + { + instance_type = "c5.4xlarge" + weighted_capacity = "1" + }, + { + instance_type = "c5.2xlarge" + weighted_capacity = "2" + }, + ] + iam_role_use_name_prefix = false + iam_role_additional_policies = { + AmazonSSMManagedInstanceCore = "arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore" + } + } +} + +# List of fargate profiles +fargate_profiles = 
{} + +metrics_server = { + node_selector = { service = "monitoring" } +} + +keda = { + node_selector = { service = "monitoring" } +} + +# Object storage +# Uncomment either the `elasticache` or the `s3_os` parameter +elasticache = { + engine = "redis" + engine_version = "6.x" + node_type = "cache.r4.large" + num_cache_clusters = 1 +} + +#s3_os = {} + +mq = { + engine_type = "ActiveMQ" + engine_version = "5.17.6" + host_instance_type = "mq.m5.xlarge" +} + +mongodb = { + node_selector = { service = "state-database" } + replicas = 2 + mongodb_resources = { + limits = { + "cpu" = "30" + "memory" = "60Gi" + "ephemeral-storage" = "20Gi" + } + requests = { + "cpu" = "14" + "memory" = "29Gi" + "ephemeral-storage" = "4Gi" + } + } +} + +seq = { + node_selector = { service = "monitoring" } +} + +grafana = { + node_selector = { service = "monitoring" } +} + +node_exporter = { + node_selector = {} +} + +windows_exporter = { + node_selector = { + "plateform" = "windows" + } +} + +prometheus = { + node_selector = { service = "metrics" } +} + +metrics_exporter = { + node_selector = { service = "metrics" } +} + + +fluent_bit = { + is_daemonset = true + node_selector = {} +} + +logging_level = "Information" + +control_plane = { + limits = { + cpu = "2000m" + memory = "4096Mi" + } + requests = { + cpu = "1000m" + memory = "2048Mi" + } + default_partition = "default" + replicas = 12 + node_selector = { service = "control-plane" } +} + +admin_gui = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "100m" + memory = "128Mi" + } + node_selector = { service = "monitoring" } +} + +compute_plane = { + bench = { + node_selector = { service = "workers" } + replicas = 120 + polling_agent = { + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "256Mi" + } + } + worker = [ + { + image = "dockerhubaneo/armonik_core_bench_test_worker" + limits = { + cpu = "1000m" + memory = "1024Mi" + } + requests = { + cpu = "500m" + memory = "512Mi" + } + } + ] + }, +} + +ingress = { + tls = false + mtls = false + generate_client_cert = false + node_selector = { service = "control-plane" } +} + +# Job to insert partitions in the database +job_partitions_in_database = { + node_selector = { service = "control-plane" } +} + +# Authentication behavior +authentication = { + node_selector = { service = "control-plane" } +} + +configurations = { + core = { + env = { + Amqp__AllowHostMismatch = false + Amqp__MaxPriority = "10" + Amqp__MaxRetries = "5" + Amqp__QueueStorage__LockRefreshPeriodicity = "00:00:45" + Amqp__QueueStorage__PollPeriodicity = "00:00:10" + Amqp__QueueStorage__LockRefreshExtension = "00:02:00" + MongoDB__TableStorage__PollingDelayMin = "00:00:01" + MongoDB__TableStorage__PollingDelayMax = "00:00:10" + MongoDB__TableStorage__PollingDelay = "00:00:01" + MongoDB__DataRetention = "1.00:00:00" # 1 day retention + MongoDB__AllowInsecureTls = true + Redis__Timeout = 3000 + Redis__SslHost = "" + Redis__TtlTimeSpan = "1.00:00:00" # 1 day retention + Submitter__DeletePayload = true + } + } + control = { + env = { + Submitter__MaxErrorAllowed = 50 + } + } + jobs = { env = { MongoDB__DataRetention = "1.00:00:00" } } +} + +environment_description = { + name = "aws-dev" + version = "0.0.0" + description = "AWS environment" + color = "#80ff80" +} + +upload_images = false diff --git a/tools/ci/bench-job-template.yml b/tools/ci/bench-job-template.yml new file mode 100644 index 000000000..919a16066 --- /dev/null +++ b/tools/ci/bench-job-template.yml @@ -0,0 +1,41 @@ 
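+# Kubernetes Job template used by the CI to run the bench client inside the cluster.
+# The @@...@@ tokens (core version, task count, session name, endpoint) are presumably
+# substituted by the workflow before this manifest is applied.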
+apiVersion: batch/v1 +kind: Job +metadata: + name: bench-session + namespace: armonik +spec: + backoffLimit: 0 + ttlSecondsAfterFinished: 1200 + template: + spec: + restartPolicy: Never + containers: + - name: bench-session + image: dockerhubaneo/armonik_core_bench_test_client:@@ARMONIK_CORE_VERSION@@ # version should at least be 0.27.4 + env: + - name: BenchOptions__NTasks + value: "@@NTASKS@@" + - name: BenchOptions__Partition + value: bench + - name: BenchOptions__Options__SessionName + value: "@@SESSION_NAME@@" + - name: BenchOptions__PayloadSize + value: "1" + - name: BenchOptions__ResultSize + value: "1" + - name: BenchOptions__TaskDurationMs + value: "0" + - name: BenchOptions__DegreeOfParallelism + value: "10" + - name: BenchOptions__PauseSessionDuringSubmission + value: "true" + - name: BenchOptions__PurgeData + value: "false" + - name: BenchOptions__DownloadResults + value: "false" + - name: GrpcClient__Endpoint + value: http://@@GRPC_CLIENT_ENDPOINT@@:5001 + resources: + requests: + cpu: "1" + memory: "500Mi" diff --git a/tools/ci/python/program.py b/tools/ci/python/program.py new file mode 100644 index 000000000..019ba2eb0 --- /dev/null +++ b/tools/ci/python/program.py @@ -0,0 +1,326 @@ +import datetime +import time +from typing import Any +import grpc +import argparse +import json +import sys +from logging import Filter, LogRecord +import logging.config +from pathlib import Path +from armonik.client import ArmoniKTasks, ArmoniKSessions +from armonik.common import Task, TaskStatus, Session, SessionStatus, Direction + + +class LogMsgStripFilter(Filter): + """Return a copy of the string with leading and trailing whitespace removed.""" + + def filter(self, record: LogRecord) -> bool: + try: + record.msg = record.msg.strip() + except AttributeError: + pass + return True + + +class ContextFilter(Filter): + """Process context and return and empty dict when not provided""" + + def filter(self, record: Any) -> bool: + try: + _ = record.context + if isinstance(_, dict): + record.context = json.dumps(_) + except AttributeError: + record.context = {} + return True + + +class SessionNotFoundError(Exception): + """Exception raised when a session cannot be found""" + + pass + + +LEVEL = "INFO" +LOGGING = { + "version": 1, + "disable_existing_loggers": False, + "formatters": { + "console": { + "datefmt": "%Y-%m-%dT%H:%M:%S", + "format": "%(asctime)s.%(msecs)03dZ%(levelname)s [%(funcName)s] | {" + '"message": "%(message)s", "filename": "%(filename)s", "line": %(lineno)d, ' + '"context": %(context)s}', + } + }, + "filters": { + "log_msg_strip_filter": {"()": LogMsgStripFilter}, + "context_filter": {"()": ContextFilter}, + }, + "handlers": { + "console": { + "class": "logging.StreamHandler", + "level": LEVEL, + "formatter": "console", + "filters": ["log_msg_strip_filter", "context_filter"], + } + }, + "loggers": {"my_logger": {"handlers": ["console"], "level": "INFO"}}, +} + +logging.config.dictConfig(LOGGING) +logger = logging.getLogger("my_logger") + + +def get_session_id_by_name(session_name: str, grpc_channel) -> str: + """ + Retrieves a session id by its name defined as HtcMock.Options.SessionName . 
+    If multiple sessions have the same name, the one retrieved is the last from the list returned by the API.
+
+    Args:
+        session_name: name of the session
+        grpc_channel: gRPC channel with ArmoniK's control plane
+
+    Returns:
+        Session id
+
+    Raises:
+        SessionNotFoundError: when session_name does not match any session's SessionName
+    """
+
+    sessions_client = ArmoniKSessions(grpc_channel)
+
+    try:
+        session_id = sessions_client.list_sessions(
+            Session.options["SessionName"] == session_name
+        )[1][-1].session_id
+        return session_id
+    except IndexError:
+        raise SessionNotFoundError
+
+
+def get_session_stats(session_id: str, grpc_channel: grpc.Channel) -> dict:
+    """
+    Retrieves stats for a session.
+    For now retrieves the throughput and the number of completed tasks.
+
+    Args:
+        session_id: id of the session
+        grpc_channel: gRPC channel with ArmoniK's control plane
+
+    Returns:
+        Dictionary with metric name as key and metric value as value.
+    """
+
+    tasks_client = ArmoniKTasks(grpc_channel)
+
+    # First completed task to start processing (ascending sort on processed_at)
+    tasks_count, tasks_list = tasks_client.list_tasks(
+        (Task.status == TaskStatus.COMPLETED) & (Task.session_id == session_id),
+        page=0,
+        page_size=1,
+        sort_field=Task.processed_at,
+        sort_direction=Direction.ASC,
+    )
+    first_processed_task = tasks_list[0]
+
+    # Last completed task to finish (descending sort on ended_at)
+    last_ended_task = tasks_client.list_tasks(
+        (Task.status == TaskStatus.COMPLETED) & (Task.session_id == session_id),
+        page=0,
+        page_size=1,
+        sort_field=Task.ended_at,
+        sort_direction=Direction.DESC,
+    )[1][0]
+
+    logger.info(
+        "Session stats summary",
+        extra={
+            "context": {
+                "Task count": tasks_count,
+                "First task started at": first_processed_task.started_at.strftime(
+                    "%m/%d/%Y, %H:%M:%S"
+                ),
+                "Last task to end ended at": last_ended_task.ended_at.strftime(
+                    "%m/%d/%Y, %H:%M:%S"
+                ),
+            }
+        },
+    )
+
+    # Throughput = completed tasks divided by the wall-clock span between the
+    # first task start and the last task end
+    return {
+        "tasks_count": tasks_count,
+        "throughput": tasks_count
+        / (last_ended_task.ended_at - first_processed_task.started_at).total_seconds(),
+    }
+
+
+def poll_session_ending(
+    session_id: str, grpc_channel: grpc.Channel, polling_limit: float
+):
+    """
+    Polls for a session to be completed (CLOSED status).
+
+    Args:
+        session_id: id of the session
+        grpc_channel: gRPC channel with ArmoniK's control plane
+        polling_limit: number of seconds to poll before timeout
+
+    Raises:
+        TimeoutError: if the session isn't completed before polling_limit expires
+    """
+
+    sessions_client = ArmoniKSessions(grpc_channel)
+
+    timeout_date = datetime.datetime.now() + datetime.timedelta(seconds=polling_limit)
+
+    logger.info(
+        "Session polling",
+        extra={
+            "context": {
+                "Session polled": session_id,
+                "Started to poll at": datetime.datetime.now().strftime(
+                    "%m/%d/%Y, %H:%M:%S"
+                ),
+                "Will end polling at": timeout_date.strftime("%m/%d/%Y, %H:%M:%S"),
+            }
+        },
+    )
+
+    while datetime.datetime.now() < timeout_date:
+        session_status = sessions_client.get_session(session_id).status
+        if session_status != SessionStatus.CLOSED:
+            logger.info(
+                "Waiting for session to end",
+                extra={"context": {"Session id": session_id}},
+            )
+            time.sleep(5)
+        else:
+            logger.info(
+                "Session finished", extra={"context": {"Session id": session_id}}
+            )
+            return
+
+    logger.error(
+        "Polling timeout exceeded", extra={"context": {"Session id": session_id}}
+    )
+
+    raise TimeoutError("Polling duration was exceeded.")
+
+
+def main(session_name: str, grpc_endpoint: str, polling_limit: float) -> list[dict]:
+    """
+    Retrieves a session's stats by its name.
+
+    Args:
+        session_name: name of the session
+        grpc_endpoint: host of ArmoniK's control plane (port 5001 is assumed)
+        polling_limit: number of seconds to poll before timeout
+
+    Returns:
+        A list containing a single dict with the session's metadata and metrics
+    """
+
+    with grpc.insecure_channel(f"{grpc_endpoint}:5001") as channel:
+        session_id = get_session_id_by_name(session_name, channel)
+        poll_session_ending(session_id, channel, polling_limit)
+        session_stats = get_session_stats(session_id, channel)
+
+    session_stats_json = [
+        {
+            "metadata": {"session_id": session_id, "session_name": session_name},
+            "metrics": {
+                "throughput": {
+                    "name": "Throughput",
+                    "unit": "Task per second",
+                    "value": session_stats["throughput"],
+                },
+                "tasks_count": {
+                    "name": "Total number of tasks",
+                    "unit": "Task",
+                    "value": session_stats["tasks_count"],
+                },
+            },
+        }
+    ]
+
+    logger.debug(
+        "Session stats",
+        extra={
+            "context": {
+                "Session name": session_name,
+                "Session id": session_id,
+                "Bench Results": session_stats_json,
+            }
+        },
+    )
+
+    return session_stats_json
+
+
+def write_json_output(session_stats_json: list[dict], path: str = "") -> str:
+    """
+    Writes a session's stats to a JSON file.
+
+    Args:
+        session_stats_json: session stats as returned by main
+        path: relative path of the directory where to store the session's stats
+
+    Returns:
+        Absolute path to the JSON file containing the session's stats.
+    """
+
+    file_directory = Path(path)
+    file_directory.mkdir(parents=True, exist_ok=True)
+
+    # Build the file name from the argument rather than from a global
+    file_name = f"session_{session_stats_json[0]['metadata']['session_id']}_benchmark_{session_stats_json[0]['metrics']['tasks_count']['value']}tasks.json"
+
+    absolute_file_path = file_directory.resolve() / file_name
+
+    content = json.dumps(session_stats_json)
+
+    logger.debug(
+        "JSON file to be written",
+        extra={
+            "context": {
+                "directory": file_directory,
+                "filename": file_name,
+                "path": absolute_file_path,
+                "content": content,
+            }
+        },
+    )
+
+    with open(absolute_file_path, "w") as output_file:
+        output_file.write(content)
+
+    return str(absolute_file_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("grpc_endpoint", type=str)
+    parser.add_argument("-n", "--session-name", type=str, default="")
+    parser.add_argument("-l", "--polling-limit", type=float, default=300)
+    parser.add_argument("-p", "--output-path", type=str, default="")
+
+    args = parser.parse_args()
+
+    try:
+        session_stats = main(args.session_name, args.grpc_endpoint, args.polling_limit)
+        output_path = write_json_output(session_stats, args.output_path)
+        print(output_path, file=sys.stdout)
+    except SessionNotFoundError:
+        logger.error(
+            "Session not found",
+            extra={"context": {"Session name provided": args.session_name}},
+        )
+        sys.exit(1)
+    except TimeoutError:
+        logger.error(
+            "Session exceeded polling duration",
+            extra={"context": {"Session name provided": args.session_name}},
+        )
+        sys.exit(1)
\ No newline at end of file
diff --git a/tools/ci/python/requirements.txt b/tools/ci/python/requirements.txt
new file mode 100644
index 000000000..a4aae0b36
--- /dev/null
+++ b/tools/ci/python/requirements.txt
@@ -0,0 +1,2 @@
+armonik==3.21.0
+argparse==1.4.0
\ No newline at end of file

From b7b4d058ca3be85b3276bfab3228747ba74b7f5b Mon Sep 17 00:00:00 2001
From: Trystan Schneider
Date: Tue, 21 Jan 2025 14:38:51 +0100
Subject: [PATCH 15/24] fix: unshard MongoDB in default deployments

---
 infrastructure/quick-deploy/aws/parameters.tfvars | 2 +-

From b7b4d058ca3be85b3276bfab3228747ba74b7f5b Mon Sep 17 00:00:00 2001
From: Trystan Schneider
Date: Tue, 21 Jan 2025 14:38:51 +0100
Subject: [PATCH 15/24] fix: unshard MongoDB in default deployments

---
 infrastructure/quick-deploy/aws/parameters.tfvars       | 2 +-
 infrastructure/quick-deploy/gcp/parameters.tfvars       | 2 +-
 infrastructure/quick-deploy/localhost/parameters.tfvars | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/infrastructure/quick-deploy/aws/parameters.tfvars b/infrastructure/quick-deploy/aws/parameters.tfvars
index 48927709f..c78e55c3e 100644
--- a/infrastructure/quick-deploy/aws/parameters.tfvars
+++ b/infrastructure/quick-deploy/aws/parameters.tfvars
@@ -303,7 +303,7 @@ mongodb = {
 }
 
 # Nullify to disable sharding, each nullification of subobject will result in the use of default values
-mongodb_sharding = {}
+# mongodb_sharding = {}
 
 seq = {
   node_selector = { service = "monitoring" }
diff --git a/infrastructure/quick-deploy/gcp/parameters.tfvars b/infrastructure/quick-deploy/gcp/parameters.tfvars
index 3eb585594..02bd60973 100644
--- a/infrastructure/quick-deploy/gcp/parameters.tfvars
+++ b/infrastructure/quick-deploy/gcp/parameters.tfvars
@@ -255,7 +255,7 @@ mongodb = {
 }
 
 # Nullify to disable sharding, each nullification of subobject will result in the use of default values
-mongodb_sharding = {}
+# mongodb_sharding = {}
 
 #memorystore = {
 #  memory_size_gb = 20
diff --git a/infrastructure/quick-deploy/localhost/parameters.tfvars b/infrastructure/quick-deploy/localhost/parameters.tfvars
index 5f5e0fbc4..9bd08f7cb 100644
--- a/infrastructure/quick-deploy/localhost/parameters.tfvars
+++ b/infrastructure/quick-deploy/localhost/parameters.tfvars
@@ -345,4 +345,4 @@ mongodb = {
 }
 
 # Nullify to disable sharding
-mongodb_sharding = {}
+# mongodb_sharding = {}

From 92cb7d07143c8af06c22bb82c88cbcf88038ec98 Mon Sep 17 00:00:00 2001
From: Mohamed Khairallah Gharbi
Date: Wed, 22 Jan 2025 10:49:41 +0100
Subject: [PATCH 16/24] update active mq version

---
 infrastructure/quick-deploy/aws/parameters.tfvars | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infrastructure/quick-deploy/aws/parameters.tfvars b/infrastructure/quick-deploy/aws/parameters.tfvars
index 48927709f..a7cd8f020 100644
--- a/infrastructure/quick-deploy/aws/parameters.tfvars
+++ b/infrastructure/quick-deploy/aws/parameters.tfvars
@@ -294,7 +294,7 @@ elasticache = {
 
 mq = {
   engine_type        = "ActiveMQ"
-  engine_version     = "5.17.6"
+  engine_version     = "5.18.6"
   host_instance_type = "mq.m5.xlarge"
 }
 
From a1dc3bb7692519edac11dde78e005fb1862ba17a Mon Sep 17 00:00:00 2001
From: Mohamed Khairallah Gharbi
Date: Wed, 22 Jan 2025 14:15:19 +0100
Subject: [PATCH 17/24] correct version

---
 infrastructure/quick-deploy/aws/parameters.tfvars | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/infrastructure/quick-deploy/aws/parameters.tfvars b/infrastructure/quick-deploy/aws/parameters.tfvars
index a7cd8f020..d612548f6 100644
--- a/infrastructure/quick-deploy/aws/parameters.tfvars
+++ b/infrastructure/quick-deploy/aws/parameters.tfvars
@@ -294,7 +294,7 @@ elasticache = {
 
 mq = {
   engine_type        = "ActiveMQ"
-  engine_version     = "5.18.6"
+  engine_version     = "5.18"
   host_instance_type = "mq.m5.xlarge"
 }
 
From d660ac7ef063f0bc84f02b1e5857fb0d0347616f Mon Sep 17 00:00:00 2001
From: Trystan Schneider
Date: Mon, 22 Jul 2024 17:12:45 +0200
Subject: [PATCH 18/24] Parametrize persistence for GCP

---
 infrastructure/quick-deploy/gcp/parameters.tfvars |  2 ++
 infrastructure/quick-deploy/gcp/storage.tf        |  2 +-
 infrastructure/quick-deploy/gcp/variables.tf      | 15 +++++++++++++++
 3 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/infrastructure/quick-deploy/gcp/parameters.tfvars b/infrastructure/quick-deploy/gcp/parameters.tfvars
index 02bd60973..b1e7ae793 100644
--- 
a/infrastructure/quick-deploy/gcp/parameters.tfvars +++ b/infrastructure/quick-deploy/gcp/parameters.tfvars @@ -252,6 +252,8 @@ keda = { mongodb = { node_selector = { service = "state-database" } + # Uncomment the line below to enable persistence, comment to disable + # persistent_volume = {} } # Nullify to disable sharding, each nullification of subobject will result in the use of default values diff --git a/infrastructure/quick-deploy/gcp/storage.tf b/infrastructure/quick-deploy/gcp/storage.tf index eb3c0eabf..1e8815b5c 100644 --- a/infrastructure/quick-deploy/gcp/storage.tf +++ b/infrastructure/quick-deploy/gcp/storage.tf @@ -28,7 +28,7 @@ module "mongodb" { } mongodb_resources = var.mongodb.mongodb_resources arbiter_resources = var.mongodb.arbiter_resources - persistent_volume = null + persistent_volume = var.mongodb.persistent_volume } module "mongodb_sharded" { diff --git a/infrastructure/quick-deploy/gcp/variables.tf b/infrastructure/quick-deploy/gcp/variables.tf index 7b02f6c84..c1d49f4a5 100644 --- a/infrastructure/quick-deploy/gcp/variables.tf +++ b/infrastructure/quick-deploy/gcp/variables.tf @@ -128,6 +128,21 @@ variable "mongodb" { limits = optional(map(string)) requests = optional(map(string)) })) + + persistent_volume = optional(object({ + storage_provisioner = optional(string) + volume_binding_mode = optional(string, "Immediate") + parameters = optional(map(string), {}) + #Resources for PVC + resources = optional(object({ + limits = optional(object({ + storage = string + })) + requests = optional(object({ + storage = string + })) + })) + })) }) default = {} } From 680ed499109881249b35d4598991ff58fc34b073 Mon Sep 17 00:00:00 2001 From: Trystan Schneider Date: Mon, 22 Jul 2024 17:13:51 +0200 Subject: [PATCH 19/24] Parametrize persistence for localhost --- infrastructure/quick-deploy/gcp/variables.tf | 2 +- .../quick-deploy/localhost/parameters.tfvars | 3 +++ infrastructure/quick-deploy/localhost/storage.tf | 2 +- .../quick-deploy/localhost/variables.tf | 15 +++++++++++++++ 4 files changed, 20 insertions(+), 2 deletions(-) diff --git a/infrastructure/quick-deploy/gcp/variables.tf b/infrastructure/quick-deploy/gcp/variables.tf index c1d49f4a5..663a48a68 100644 --- a/infrastructure/quick-deploy/gcp/variables.tf +++ b/infrastructure/quick-deploy/gcp/variables.tf @@ -128,7 +128,7 @@ variable "mongodb" { limits = optional(map(string)) requests = optional(map(string)) })) - + persistent_volume = optional(object({ storage_provisioner = optional(string) volume_binding_mode = optional(string, "Immediate") diff --git a/infrastructure/quick-deploy/localhost/parameters.tfvars b/infrastructure/quick-deploy/localhost/parameters.tfvars index 9bd08f7cb..65319afb9 100644 --- a/infrastructure/quick-deploy/localhost/parameters.tfvars +++ b/infrastructure/quick-deploy/localhost/parameters.tfvars @@ -342,6 +342,9 @@ mongodb = { # "ephemeral-storage" = "500Mi" # } # } + + # Uncomment the line below to enable persistence, comment to disable + # persistent_volume = {} } # Nullify to disable sharding diff --git a/infrastructure/quick-deploy/localhost/storage.tf b/infrastructure/quick-deploy/localhost/storage.tf index d0c39efd0..8a1d1bf24 100644 --- a/infrastructure/quick-deploy/localhost/storage.tf +++ b/infrastructure/quick-deploy/localhost/storage.tf @@ -37,7 +37,7 @@ module "mongodb" { } mongodb_resources = var.mongodb.mongodb_resources arbiter_resources = var.mongodb.arbiter_resources - persistent_volume = null + persistent_volume = var.mongodb.persistent_volume } module "mongodb_sharded" { diff 
--git a/infrastructure/quick-deploy/localhost/variables.tf b/infrastructure/quick-deploy/localhost/variables.tf index fa879fc98..a04cae214 100644 --- a/infrastructure/quick-deploy/localhost/variables.tf +++ b/infrastructure/quick-deploy/localhost/variables.tf @@ -152,6 +152,21 @@ variable "mongodb" { limits = optional(map(string)) requests = optional(map(string)) })) + + persistent_volume = optional(object({ + storage_provisioner = optional(string) + volume_binding_mode = optional(string, "Immediate") + parameters = optional(map(string), {}) + #Resources for PVC + resources = optional(object({ + limits = optional(object({ + storage = string + })) + requests = optional(object({ + storage = string + })) + })) + })) }) default = {} } From da400d5bf239371b101e938fd2d446dbcc3b5638 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 27 Jan 2025 15:29:43 +0100 Subject: [PATCH 20/24] customize active mq in localhost folder --- .../quick-deploy/localhost/parameters.tfvars | 14 +++++++++++++- infrastructure/quick-deploy/localhost/storage.tf | 3 +++ infrastructure/quick-deploy/localhost/variables.tf | 9 +++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/infrastructure/quick-deploy/localhost/parameters.tfvars b/infrastructure/quick-deploy/localhost/parameters.tfvars index 65319afb9..c23b7e4b2 100644 --- a/infrastructure/quick-deploy/localhost/parameters.tfvars +++ b/infrastructure/quick-deploy/localhost/parameters.tfvars @@ -17,7 +17,19 @@ redis = {} # Queue # Uncomment either the `activemq` or the `rabbitmq` parameter -activemq = {} +activemq = { + node_selector = { service = "state-database" } + limits = { + cpu = "4000m" + memory = "16Gi" + } + requests = { + cpu = "4000m" + memory = "16Gi" + } + activemq_opts_memory = "-Xms1g -Xmx3g" +} + #rabbitmq = {} /*parition_metrics_exporter = { diff --git a/infrastructure/quick-deploy/localhost/storage.tf b/infrastructure/quick-deploy/localhost/storage.tf index 8a1d1bf24..9c48e77be 100644 --- a/infrastructure/quick-deploy/localhost/storage.tf +++ b/infrastructure/quick-deploy/localhost/storage.tf @@ -8,6 +8,9 @@ module "activemq" { tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) node_selector = var.activemq.node_selector image_pull_secrets = var.activemq.image_pull_secrets + limits = var.activemq.limits + requests = var.activemq.requests + activemq_opts_memory = var.activemq.activemq_opts_memory } } diff --git a/infrastructure/quick-deploy/localhost/variables.tf b/infrastructure/quick-deploy/localhost/variables.tf index a04cae214..b9150034d 100644 --- a/infrastructure/quick-deploy/localhost/variables.tf +++ b/infrastructure/quick-deploy/localhost/variables.tf @@ -114,6 +114,15 @@ variable "activemq" { image_tag = optional(string) node_selector = optional(any, {}) image_pull_secrets = optional(string, "") + limits = optional(object({ + cpu = optional(string) + memory = optional(string) + })) + requests = optional(object({ + cpu = optional(string) + memory = optional(string) + })) + activemq_opts_memory = optional(string, "-Xms1g -Xmx3g") }) default = null } From a7d2296029c3f3fec266c10ec41d63cbdd1f9247 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 27 Jan 2025 15:30:00 +0100 Subject: [PATCH 21/24] add and customize active mq module in aws --- infrastructure/quick-deploy/aws/armonik.tf | 2 +- .../quick-deploy/aws/parameters.tfvars | 14 +++++++++++++ infrastructure/quick-deploy/aws/storage.tf | 17 +++++++++++++++ infrastructure/quick-deploy/aws/variables.tf | 21 
+++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/infrastructure/quick-deploy/aws/armonik.tf b/infrastructure/quick-deploy/aws/armonik.tf index 95ff1fda4..eb061557d 100644 --- a/infrastructure/quick-deploy/aws/armonik.tf +++ b/infrastructure/quick-deploy/aws/armonik.tf @@ -4,7 +4,7 @@ module "armonik" { logging_level = var.logging_level configurations = merge(var.configurations, { - core = [module.mq, module.elasticache, module.mongodb, module.mongodb_sharded, var.configurations.core] + core = [module.activemq, module.mq, module.elasticache, module.mongodb, module.mongodb_sharded, var.configurations.core] }) fluent_bit = module.fluent_bit diff --git a/infrastructure/quick-deploy/aws/parameters.tfvars b/infrastructure/quick-deploy/aws/parameters.tfvars index 55c5dfa1b..53fa6f54c 100644 --- a/infrastructure/quick-deploy/aws/parameters.tfvars +++ b/infrastructure/quick-deploy/aws/parameters.tfvars @@ -292,6 +292,20 @@ elasticache = { #s3_os = {} +# activemq = { +# node_selector = { service = "state-database" } +# limits = { +# cpu = "4000m" +# memory = "16Gi" +# } +# requests = { +# cpu = "4000m" +# memory = "16Gi" +# } +# activemq_opts_memory = "-Xms1g -Xmx3g" +# } + + mq = { engine_type = "ActiveMQ" engine_version = "5.18" diff --git a/infrastructure/quick-deploy/aws/storage.tf b/infrastructure/quick-deploy/aws/storage.tf index e28be80cc..07fc5c7a6 100644 --- a/infrastructure/quick-deploy/aws/storage.tf +++ b/infrastructure/quick-deploy/aws/storage.tf @@ -123,6 +123,7 @@ resource "kubernetes_secret" "elasticache" { # Amazon MQ module "mq" { + count = var.activemq == null ? 1 : 0 source = "./generated/infra-modules/storage/aws/mq" tags = local.tags name = "${local.prefix}-mq" @@ -144,6 +145,22 @@ module "mq" { kms_key_id = local.kms_key } +# ActiveMQ +module "activemq" { + count = var.activemq != null ? 
1 : 0 + source = "./generated/infra-modules/storage/onpremise/activemq" + namespace = local.namespace + activemq = { + image = var.activemq.image_name + tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) + node_selector = var.activemq.node_selector + image_pull_secrets = var.activemq.image_pull_secrets + limits = var.activemq.limits + requests = var.activemq.requests + activemq_opts_memory = var.activemq.activemq_opts_memory + } +} + module "aws_service_account" { namespace = local.namespace source = "./generated/infra-modules/service-account/aws" diff --git a/infrastructure/quick-deploy/aws/variables.tf b/infrastructure/quick-deploy/aws/variables.tf index 0de972938..eb6ed8273 100644 --- a/infrastructure/quick-deploy/aws/variables.tf +++ b/infrastructure/quick-deploy/aws/variables.tf @@ -301,6 +301,27 @@ variable "mq_credentials" { } } +# Parameters for ActiveMQ - on premise +variable "activemq" { + description = "Parameters of ActiveMQ" + type = object({ + image_name = optional(string, "symptoma/activemq") + image_tag = optional(string) + node_selector = optional(any, {}) + image_pull_secrets = optional(string, "") + limits = optional(object({ + cpu = optional(string) + memory = optional(string) + })) + requests = optional(object({ + cpu = optional(string) + memory = optional(string) + })) + activemq_opts_memory = optional(string, "-Xms1g -Xmx3g") + }) + default = null +} + # Parameters for MongoDB variable "mongodb" { description = "Parameters of MongoDB" From 0a6428198a09233e9b7d9ca43797f225f9fb1cd6 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 27 Jan 2025 15:30:34 +0100 Subject: [PATCH 22/24] customize active mq in gcp --- infrastructure/quick-deploy/gcp/parameters.tfvars | 15 +++++++++++---- infrastructure/quick-deploy/gcp/storage.tf | 3 +++ infrastructure/quick-deploy/gcp/variables.tf | 9 +++++++++ 3 files changed, 23 insertions(+), 4 deletions(-) diff --git a/infrastructure/quick-deploy/gcp/parameters.tfvars b/infrastructure/quick-deploy/gcp/parameters.tfvars index b1e7ae793..bff891cbc 100644 --- a/infrastructure/quick-deploy/gcp/parameters.tfvars +++ b/infrastructure/quick-deploy/gcp/parameters.tfvars @@ -244,12 +244,19 @@ keda = { } # activemq = { -# node_selector = {} -# image_name = "symptoma/activemq" -# image_tag = "latest" -# image_pull_secrets = "" +# node_selector = { service = "state-database" } +# limits = { +# cpu = "4000m" +# memory = "16Gi" +# } +# requests = { +# cpu = "4000m" +# memory = "16Gi" +# } +# activemq_opts_memory = "-Xms1g -Xmx3g" # } + mongodb = { node_selector = { service = "state-database" } # Uncomment the line below to enable persistence, comment to disable diff --git a/infrastructure/quick-deploy/gcp/storage.tf b/infrastructure/quick-deploy/gcp/storage.tf index 1e8815b5c..9131a9eec 100644 --- a/infrastructure/quick-deploy/gcp/storage.tf +++ b/infrastructure/quick-deploy/gcp/storage.tf @@ -231,5 +231,8 @@ module "activemq" { tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) node_selector = var.activemq.node_selector image_pull_secrets = var.activemq.image_pull_secrets + limits = var.activemq.limits + requests = var.activemq.requests + activemq_opts_memory = var.activemq.activemq_opts_memory } } diff --git a/infrastructure/quick-deploy/gcp/variables.tf b/infrastructure/quick-deploy/gcp/variables.tf index 663a48a68..613fd09e7 100644 --- a/infrastructure/quick-deploy/gcp/variables.tf +++ b/infrastructure/quick-deploy/gcp/variables.tf @@ -610,6 +610,15 @@ 
variable "activemq" { image_tag = optional(string) node_selector = optional(any, {}) image_pull_secrets = optional(string, "") + limits = optional(object({ + cpu = optional(string) + memory = optional(string) + })) + requests = optional(object({ + cpu = optional(string) + memory = optional(string) + })) + activemq_opts_memory = optional(string, "-Xms1g -Xmx3g") }) default = null } From ed25868b94fd3df55340cb500d668b05d139c744 Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Mon, 27 Jan 2025 15:36:48 +0100 Subject: [PATCH 23/24] format modified files --- infrastructure/quick-deploy/aws/storage.tf | 12 ++++++------ infrastructure/quick-deploy/gcp/storage.tf | 12 ++++++------ .../quick-deploy/localhost/parameters.tfvars | 4 ++-- infrastructure/quick-deploy/localhost/storage.tf | 12 ++++++------ 4 files changed, 20 insertions(+), 20 deletions(-) diff --git a/infrastructure/quick-deploy/aws/storage.tf b/infrastructure/quick-deploy/aws/storage.tf index 07fc5c7a6..45ecf8d2f 100644 --- a/infrastructure/quick-deploy/aws/storage.tf +++ b/infrastructure/quick-deploy/aws/storage.tf @@ -151,12 +151,12 @@ module "activemq" { source = "./generated/infra-modules/storage/onpremise/activemq" namespace = local.namespace activemq = { - image = var.activemq.image_name - tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) - node_selector = var.activemq.node_selector - image_pull_secrets = var.activemq.image_pull_secrets - limits = var.activemq.limits - requests = var.activemq.requests + image = var.activemq.image_name + tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) + node_selector = var.activemq.node_selector + image_pull_secrets = var.activemq.image_pull_secrets + limits = var.activemq.limits + requests = var.activemq.requests activemq_opts_memory = var.activemq.activemq_opts_memory } } diff --git a/infrastructure/quick-deploy/gcp/storage.tf b/infrastructure/quick-deploy/gcp/storage.tf index 9131a9eec..64720f6e9 100644 --- a/infrastructure/quick-deploy/gcp/storage.tf +++ b/infrastructure/quick-deploy/gcp/storage.tf @@ -227,12 +227,12 @@ module "activemq" { source = "./generated/infra-modules/storage/onpremise/activemq" namespace = local.namespace activemq = { - image = var.activemq.image_name - tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) - node_selector = var.activemq.node_selector - image_pull_secrets = var.activemq.image_pull_secrets - limits = var.activemq.limits - requests = var.activemq.requests + image = var.activemq.image_name + tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) + node_selector = var.activemq.node_selector + image_pull_secrets = var.activemq.image_pull_secrets + limits = var.activemq.limits + requests = var.activemq.requests activemq_opts_memory = var.activemq.activemq_opts_memory } } diff --git a/infrastructure/quick-deploy/localhost/parameters.tfvars b/infrastructure/quick-deploy/localhost/parameters.tfvars index c23b7e4b2..1b3a1c811 100644 --- a/infrastructure/quick-deploy/localhost/parameters.tfvars +++ b/infrastructure/quick-deploy/localhost/parameters.tfvars @@ -20,11 +20,11 @@ redis = {} activemq = { node_selector = { service = "state-database" } limits = { - cpu = "4000m" + cpu = "4000m" memory = "16Gi" } requests = { - cpu = "4000m" + cpu = "4000m" memory = "16Gi" } activemq_opts_memory = "-Xms1g -Xmx3g" diff --git a/infrastructure/quick-deploy/localhost/storage.tf 
b/infrastructure/quick-deploy/localhost/storage.tf index 9c48e77be..ab2a64158 100644 --- a/infrastructure/quick-deploy/localhost/storage.tf +++ b/infrastructure/quick-deploy/localhost/storage.tf @@ -4,12 +4,12 @@ module "activemq" { source = "./generated/infra-modules/storage/onpremise/activemq" namespace = local.namespace activemq = { - image = var.activemq.image_name - tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) - node_selector = var.activemq.node_selector - image_pull_secrets = var.activemq.image_pull_secrets - limits = var.activemq.limits - requests = var.activemq.requests + image = var.activemq.image_name + tag = try(coalesce(var.activemq.image_tag), local.default_tags[var.activemq.image_name]) + node_selector = var.activemq.node_selector + image_pull_secrets = var.activemq.image_pull_secrets + limits = var.activemq.limits + requests = var.activemq.requests activemq_opts_memory = var.activemq.activemq_opts_memory } } From b8ea77c5f6356c5c450110e564dfbf82be7acaec Mon Sep 17 00:00:00 2001 From: Mohamed Khairallah Gharbi Date: Tue, 28 Jan 2025 11:09:40 +0100 Subject: [PATCH 24/24] modify version of armonik.infra to 0.8.0 --- infrastructure/quick-deploy/localhost/parameters.tfvars | 9 --------- versions.tfvars.json | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/infrastructure/quick-deploy/localhost/parameters.tfvars b/infrastructure/quick-deploy/localhost/parameters.tfvars index 1b3a1c811..31c2c5804 100644 --- a/infrastructure/quick-deploy/localhost/parameters.tfvars +++ b/infrastructure/quick-deploy/localhost/parameters.tfvars @@ -18,15 +18,6 @@ redis = {} # Queue # Uncomment either the `activemq` or the `rabbitmq` parameter activemq = { - node_selector = { service = "state-database" } - limits = { - cpu = "4000m" - memory = "16Gi" - } - requests = { - cpu = "4000m" - memory = "16Gi" - } activemq_opts_memory = "-Xms1g -Xmx3g" } diff --git a/versions.tfvars.json b/versions.tfvars.json index 90cb085d7..b03595640 100644 --- a/versions.tfvars.json +++ b/versions.tfvars.json @@ -1,7 +1,7 @@ { "armonik_versions": { "armonik": "2.21.0", - "infra": "0.7.0", + "infra": "0.8.0", "infra_plugins": "0.1.1", "core": "0.29.1", "api": "3.21.0",