diff --git a/e2e-tests/installer/auto_install.py b/e2e-tests/installer/auto_install.py index fe163816b..8ece89ecf 100644 --- a/e2e-tests/installer/auto_install.py +++ b/e2e-tests/installer/auto_install.py @@ -975,7 +975,7 @@ def makefile_upgrade(self): self.installer_session.expect("orchestrator-admin:~") self.installer_session.sendline(f"{self.internal_makefile_params} make upgrade") - self.installer_session.expect("orchestrator-admin:~", timeout=1800) + self.installer_session.expect("orchestrator-admin:~", timeout=2700) def deprovision(self): """ diff --git a/installer/fix-external-secrets.sh b/installer/fix-external-secrets.sh index 9521b28ee..c225b9246 100755 --- a/installer/fix-external-secrets.sh +++ b/installer/fix-external-secrets.sh @@ -92,9 +92,6 @@ EOF echo "Syncing root app" kubectl patch -n "$TARGET_ENV" application root-app --patch-file /tmp/argo-cd/sync-patch.yaml --type merge -# argo has trouble replacing this seceret so manually remove it -echo "Deleting TLS Boots..." -kubectl delete secret tls-boots -n orch-boots # force vault to reload echo "Deleting Vault..." @@ -114,3 +111,16 @@ kubectl patch -n "$TARGET_ENV" application root-app --patch-file /tmp/argo-cd/sy echo "Deleting and Syncing for Cluster Templates" restart_and_wait_pod "orch-cluster" "cluster-manager" restart_and_wait_pod "orch-cluster" "cluster-manager-template-controller" + +echo "Sleep for 10 minutes to allow all apps to sync and come up" +sleep 600 + +# Delete infra-external vault related jobs +kubectl delete jobs -n orch-infra --field-selector status.successful=1 + +# delete ns-label related jobs +kubectl delete jobs -n ns-label --field-selector status.successful=1 + +# argo has trouble replacing this seceret so manually remove it +echo "Deleting TLS Boots..." +kubectl delete secret tls-boots -n orch-boots diff --git a/pod-configs/utils/upgrades/10-upgrade-to-2025.2.0.sh b/pod-configs/utils/upgrades/10-upgrade-to-2025.2.0.sh index 921251825..0f6484d4c 100644 --- a/pod-configs/utils/upgrades/10-upgrade-to-2025.2.0.sh +++ b/pod-configs/utils/upgrades/10-upgrade-to-2025.2.0.sh @@ -79,6 +79,15 @@ get_eks_node_ami() { echo $ami } +get_eks_vol_size() { + +read LT_ID LT_VER <<< "$(aws eks describe-nodegroup --cluster-name ${ENV_NAME} --nodegroup-name nodegroup-${ENV_NAME} --query "nodegroup.launchTemplate.[id,version]" --output text)" +aws ec2 describe-launch-template-versions \ + --launch-template-id "$LT_ID" \ + --versions "$LT_VER" \ + --query 'LaunchTemplateVersions[0].LaunchTemplateData.BlockDeviceMappings[0].Ebs.VolumeSize' \ + --output text +} get_aurora_ins_azs() { aurora_azs=($1) @@ -124,10 +133,10 @@ vpc_terraform_backend_key = "${VPC_TERRAFORM_BACKEND_KEY}" vpc_terraform_backend_region = "${BUCKET_REGION}" # region of the S3 bucket to store the TF state eks_cluster_name = "$ENV_NAME" aws_account_number = "$AWS_ACCOUNT" -eks_volume_size = 128 -eks_desired_size = $EKS_DESIRED_SIZE +eks_volume_size = $(get_eks_vol_size) +eks_desired_size = $((EKS_DESIRED_SIZE + 1)) eks_min_size = $EKS_MIN_SIZE -eks_max_size = $EKS_MAX_SIZE +eks_max_size = $((EKS_MAX_SIZE + 1)) eks_node_ami_id = "$(get_eks_node_ami)" eks_volume_type = "gp3" aws_region = "${AWS_REGION}" @@ -238,6 +247,16 @@ else exit 1 fi +echo "Applying changes for EKS module..." +if terraform apply -target=module.eks -var-file="environments/${ENV_NAME}/variable.tfvar" -auto-approve; then + echo " ^|^e Terraform apply for EKS module succeeded." + wait_for_nodegroup_ready_nodes +else + echo " ^}^l Terraform apply for EKS module failed!" + exit 1 +fi + + echo "Applying changes for KMS module..." if terraform apply -target=module.kms -var-file="environments/${ENV_NAME}/variable.tfvar" -auto-approve; then echo "✅ Terraform apply for KMS module succeeded." @@ -288,6 +307,32 @@ aws ec2 describe-security-groups --group-ids "$LB_SG_ID_T3" --query "SecurityGro return 0 } +wait_for_nodegroup_ready_nodes() { + TARGET=$((EKS_DESIRED_SIZE + 1)) + TIMEOUT=900 # 15 minutes = 900 seconds + INTERVAL=15 # check every 15 seconds + ELAPSED=0 + + while [ "$ELAPSED" -lt "$TIMEOUT" ]; do + READY=$(kubectl get nodes \ + -A \ + --no-headers 2>/dev/null | grep -c " Ready") + + echo "[$ELAPSED sec] Ready nodes: $READY / Waiting for: $TARGET" + + if [ "$READY" -ge "$TARGET" ]; then + echo " ^|^e Node group reached $TARGET Ready nodes" + return 0 + fi + + sleep "$INTERVAL" + ELAPSED=$((ELAPSED + INTERVAL)) + done + + echo " ^}^l Timeout after 15 minutes. Node group did not reach $TARGET Ready nodes." + return 1 +} + # Main if [[ ${COMMAND:-""} != upgrade ]]; then