keycloak · mhajas · Jul 1, 2024 · Jun 27, 2024 · Jun 27, 2024 · Jun 27, 2024
diff --git a/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml b/.github/workflows/rosa-cluster-auto-provision-on-schedule.yml
@@ -115,8 +115,16 @@ jobs:
       activeActive: true
     secrets: inherit
 
-  run-scaling-benchmark-active-active:
+  run-functional-tests-active-active:
     needs: keycloak-deploy-active-active
+    uses: ./.github/workflows/rosa-run-crossdc-func-tests.yml
+    with:
+      activeActive: true
+      clusterPrefix: gh-keycloak # ${{ env.CLUSTER_PREFIX }} -- unfortunately 'env.' doesn't work here
+    secrets: inherit
+
+  run-scaling-benchmark-active-active:
+    needs: run-functional-tests-active-active
     uses: ./.github/workflows/rosa-scaling-benchmark.yml
     with:
       clusterName: gh-keycloak-a # ${{ env.CLUSTER_PREFIX }}-a -- unfortunately 'env.' doesn't work here

diff --git a/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc b/doc/kubernetes/modules/ROOT/pages/running/loadbalancing.adoc
@@ -28,7 +28,8 @@ image::accelerator/accelerator-multi-az.dio.svg[]
 
 An AWS Network Load Balancer (NLB) is created on both ROSA clusters in order to make the Keycloak
 pods available as Endpoints to an AWS Global Accelerator instance. Each cluster endpoint is assigned a weight of
-50 to ensure that accelerator traffic is routed equally to both availability-zones when both clusters are healthy.
+128 (half of the maximum weight 255) to ensure that accelerator traffic is routed equally to both availability-zones
+when both clusters are healthy.
 
 == Prerequisites
 
@@ -196,12 +197,12 @@ CLUSTER_2_ENDPOINT_ARN=$(aws elbv2 describe-load-balancers \
 ENDPOINTS='[
   {
     "EndpointId": "'${CLUSTER_1_ENDPOINT_ARN}'",
-    "Weight": 50,
+    "Weight": 128,
     "ClientIPPreservationEnabled": false
   },
   {
     "EndpointId": "'${CLUSTER_2_ENDPOINT_ARN}'",
-    "Weight": 50,
+    "Weight": 128,
     "ClientIPPreservationEnabled": false
   }
 ]'
@@ -226,13 +227,13 @@ aws globalaccelerator create-endpoint-group \
         "EndpointDescriptions": [
             {
                 "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/abab80a363ce8479ea9c4349d116bce2/6b65e8b4272fa4b5",
-                "Weight": 50,
+                "Weight": 128,
                 "HealthState": "HEALTHY",
                 "ClientIPPreservationEnabled": false
             },
             {
                 "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a1c76566e3c334e4ab7b762d9f8dcbcf/985941f9c8d108d4",
-                "Weight": 50,
+                "Weight": 128,
                 "HealthState": "HEALTHY",
                 "ClientIPPreservationEnabled": false
             }

diff --git a/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc b/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc
@@ -33,13 +33,13 @@ include::partial$accelerator-endpoint-group.adoc[]
             "EndpointDescriptions": [
                 {
                     "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a49e56e51e16843b9a3bc686327c907b/9b786f80ed4eba3d",
-                    "Weight": 50,
+                    "Weight": 128,
                     "HealthState": "HEALTHY",
                     "ClientIPPreservationEnabled": false
                 },
                 {
                     "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a3c75f239541c4a6e9c48cf8d48d602f/5ba333e87019ccf0",
-                    "Weight": 50,
+                    "Weight": 128,
                     "HealthState": "HEALTHY",
                     "ClientIPPreservationEnabled": false
                 }
@@ -66,7 +66,7 @@ aws globalaccelerator update-endpoint-group \
   [
     {
         "EndpointId": "arn:aws:elasticloadbalancing:eu-west-1:606671647913:loadbalancer/net/a49e56e51e16843b9a3bc686327c907b/9b786f80ed4eba3d",
-        "Weight": 50,
+        "Weight": 128,
         "ClientIPPreservationEnabled": false
     }
   ]

diff --git a/provision/infinispan/ispn-helm/templates/infinispan-alerts.yaml b/provision/infinispan/ispn-helm/templates/infinispan-alerts.yaml
@@ -18,6 +18,10 @@ metadata:
 spec:
   route:
     receiver: default
+    groupBy:
+      - accelerator
+    groupInterval: 90s
+    groupWait: 0s
     matchers:
       - matchType: =
         name: alertname

diff --git a/provision/openshift/cluster-monitoring-config.yaml b/provision/openshift/cluster-monitoring-config.yaml
@@ -8,3 +8,4 @@ data:
     alertmanager:
       enabled: true
       enableAlertmanagerConfig: true
+      logLevel: debug
diff --git a/provision/opentofu/modules/aws/accelerator/main.tf b/provision/opentofu/modules/aws/accelerator/main.tf
@@ -30,11 +30,11 @@ module "global_accelerator" {
           {
             client_ip_preservation_enabled = false
             endpoint_id                    = data.aws_lb.site_a.arn
-            weight                         = 50
+            weight                         = 128
           }, {
             client_ip_preservation_enabled = false
             endpoint_id                    = data.aws_lb.site_b.arn
-            weight                         = 50
+            weight                         = 128
           }
         ]
       }

diff --git a/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml b/provision/rosa-cross-dc/keycloak-benchmark-crossdc-tests/pom.xml
@@ -18,6 +18,7 @@
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
         <aws.java.sdk.version>2.20.43</aws.java.sdk.version>
         <fabric8.version>6.13.0</fabric8.version>
+        <jsonpath.version>2.9.0</jsonpath.version>
     </properties>
 
     <dependencies>
@@ -99,6 +100,11 @@
             <artifactId>keycloak-model-infinispan</artifactId>
             <scope>test</scope>
         </dependency>
+        <dependency>
+            <groupId>com.jayway.jsonpath</groupId>
+            <artifactId>json-path</artifactId>
+            <version>${jsonpath.version}</version>
+        </dependency>
     </dependencies>
 
     <profiles>

diff --git a/...hmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/AbstractCrossDCTest.java b/...hmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/AbstractCrossDCTest.java
@@ -24,18 +24,22 @@
 import org.junit.jupiter.api.TestInstance;
 import org.keycloak.admin.client.Keycloak;
 import org.keycloak.admin.client.resource.RealmResource;
+import org.keycloak.benchmark.crossdc.client.AWSClient;
 import org.keycloak.benchmark.crossdc.client.DatacenterInfo;
 import org.keycloak.benchmark.crossdc.client.KeycloakClient;
 import org.keycloak.benchmark.crossdc.junit.tags.ActivePassive;
 import org.keycloak.benchmark.crossdc.util.HttpClientUtils;
 import org.keycloak.benchmark.crossdc.util.InfinispanUtils;
+import org.keycloak.benchmark.crossdc.util.K8sUtils;
 import org.keycloak.benchmark.crossdc.util.PropertyUtils;
 import org.keycloak.representations.idm.ClientRepresentation;
 import org.keycloak.representations.idm.CredentialRepresentation;
 import org.keycloak.representations.idm.RealmRepresentation;
 import org.keycloak.representations.idm.UserRepresentation;
 
+import io.fabric8.kubernetes.client.KubernetesClient;
 import jakarta.ws.rs.NotFoundException;
+import software.amazon.awssdk.services.cloudwatch.model.StateValue;
 
 @TestInstance(TestInstance.Lifecycle.PER_CLASS)
 public abstract class AbstractCrossDCTest {
@@ -60,7 +64,8 @@ public AbstractCrossDCTest() {
     }
 
     @BeforeEach
-    public void setUpTestEnvironment() throws UnknownHostException {
+    public void setUpTestEnvironment() throws URISyntaxException, IOException, InterruptedException, UnknownHostException {
+        failbackLoadBalancers();
         assertTrue(DC_1.kc().isActive(LOAD_BALANCER_KEYCLOAK));
 
         Keycloak adminClient = DC_1.kc().adminClient();
@@ -181,6 +186,14 @@ protected void assertCacheSize(String cache, int size) {
         assertEquals(size, DC_2.ispn().cache(cache).size(), () -> "External cache " + cache + " in DC2 has " + DC_2.ispn().cache(cache).size() + " entries");
     }
 
+    protected void waitForAcceleratorEndpointCount(int count) {
+        eventually(
+              () -> String.format("Expected the Accelerator EndpointGroup size to be %d", count),
+              () -> AWSClient.getAcceleratorEndpoints(DC_1.getLoadbalancerURL()).size() == count,
+              2, TimeUnit.MINUTES
+        );
+    }
+
     protected void eventually(Supplier<String> messageSupplier, Supplier<Boolean> condition) {
         eventually(messageSupplier, condition, 30, TimeUnit.SECONDS);
     }

diff --git a/...ak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/FailoverTest.java b/...ak-benchmark-crossdc-tests/src/test/java/org/keycloak/benchmark/crossdc/FailoverTest.java
@@ -1,28 +1,32 @@
 package org.keycloak.benchmark.crossdc;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.keycloak.benchmark.crossdc.util.InfinispanUtils.SESSIONS;
 
 import java.io.IOException;
 import java.net.URISyntaxException;
+import java.time.Instant;
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Supplier;
 
+import org.infinispan.commons.util.ByRef;
 import org.junit.jupiter.api.Test;
 import org.keycloak.benchmark.crossdc.client.AWSClient;
 import org.keycloak.benchmark.crossdc.client.DatacenterInfo;
+import org.keycloak.benchmark.crossdc.client.PrometheusClient;
 import org.keycloak.benchmark.crossdc.junit.tags.ActiveActive;
 import org.keycloak.benchmark.crossdc.junit.tags.ActivePassive;
+import org.keycloak.benchmark.crossdc.util.K8sUtils;
 
-import io.fabric8.kubernetes.client.KubernetesClient;
 import software.amazon.awssdk.services.cloudwatch.model.StateValue;
 
 public class FailoverTest extends AbstractCrossDCTest {
 
-    static final String OPERATORS_NS = "openshift-operators";
+    static final String SITE_OFFLINE_ALERT = "SiteOffline";
 
     @Override
     protected void failbackLoadBalancers() throws URISyntaxException, IOException, InterruptedException {
@@ -39,11 +43,18 @@ protected void failbackLoadBalancers() throws URISyntaxException, IOException, I
             assertTrue(route53HealthCheckPath.endsWith("/lb-check"), "Health check path was supposed to end with /lb-check but was " + route53HealthCheckPath);
         } else {
             // Heal split-brain if previously initiated
-            scaleUpGossipRouter(DC_1);
-            scaleUpGossipRouter(DC_2);
+            scaleGossipRouter(DC_1, 1);
+            scaleGossipRouter(DC_2, 1);
             // Wait for JGroups site view to contain both sites
+            waitForSitesViewCount(2);
+            // Ensure that the SiteOffline alert is no longer firing
+            eventually(
+                  () -> String.format("Alert '%s' still firing on DC", SITE_OFFLINE_ALERT),
+                  () -> !DC_1.prometheus().isAlertFiring(SITE_OFFLINE_ALERT) && !DC_2.prometheus().isAlertFiring(SITE_OFFLINE_ALERT),
+                  5, TimeUnit.MINUTES
+            );
+            // Add both sites to the Accelerator EndpointGroup
             AWSClient.acceleratorFallback(LOAD_BALANCER_KEYCLOAK.getKeycloakServerUrl());
-            // Assert that both sites are part of the Accelerator EndpointGroup
             waitForAcceleratorEndpointCount(2);
         }
     }
@@ -81,58 +92,53 @@ public void logoutUserWithFailoverTest() throws IOException, URISyntaxException,
 
     @Test
     @ActiveActive
-    public void ensureAcceleratorUpdatedOnSplitBrainTest() throws Exception {
+    public void ensureAcceleratorUpdatedOnSplitBrainTest() {
+        // Minus one minute to allow for difference in local and AWS clocks
+        var startTime = Instant.now().minusSeconds(60);
+        var acceleratorMeta = AWSClient.getAcceleratorMeta(DC_1.getLoadbalancerURL());
+        var region = acceleratorMeta.endpointGroup().endpointGroupRegion();
+
+        // Ensure that no SiteOffline events are firing on either site
+        assertFalse(DC_1.prometheus().isAlertFiring(SITE_OFFLINE_ALERT));
+        assertFalse(DC_2.prometheus().isAlertFiring(SITE_OFFLINE_ALERT));
+
+        // Assert that the Lambda has not been executed
+        assertEquals(0, AWSClient.getLambdaInvocationCount(acceleratorMeta.name(), region, startTime));
+
         // Assert that both sites are part of the Accelerator EndpointGroup
         assertEquals(2, AWSClient.getAcceleratorEndpoints(DC_1.getLoadbalancerURL()).size());
 
         // Trigger a split-brain by scaling down the GossipRouter in both sites
-        scaleDownGossipRouter(DC_1);
-        scaleDownGossipRouter(DC_2);
+        scaleGossipRouter(DC_1, 0);
+        scaleGossipRouter(DC_2, 0);
 
         // Wait for both sites to detect split-brain
         waitForSitesViewCount(1);
 
         // Assert that the AWS Lambda was executed and that only one site LB remains in the Accelerator EndpointGroup
         waitForAcceleratorEndpointCount(1);
-    }
 
-    private void waitForAcceleratorEndpointCount(int count) {
+        // Assert that the Lambda has been triggered by both sites
+        ByRef.Long count = new ByRef.Long(0);
         eventually(
-              () -> String.format("Expected the Accelerator EndpointGroup size to be %d", count),
-              () -> AWSClient.getAcceleratorEndpoints(DC_1.getLoadbalancerURL()).size() == count,
-              2, TimeUnit.MINUTES
+              () -> String.format("Expected %d Lambda invocations, got %d", 2, count.get()),
+              () -> {
+                  count.set(AWSClient.getLambdaInvocationCount(acceleratorMeta.name(), region, startTime));
+                  return count.get() == 2;
+              },
+              10, TimeUnit.MINUTES
         );
     }
 
     private void waitForSitesViewCount(int count) {
         Supplier<String> msg = () -> "Timedout waiting for cross-site view to reform";
-        eventually(msg, () -> DC_1.ispn().getSiteView().size() == count);
-        eventually(msg, () -> DC_2.ispn().getSiteView().size() == count);
+        eventually(msg, () -> DC_1.ispn().getSiteView().size() == count, 5, TimeUnit.MINUTES);
+        eventually(msg, () -> DC_2.ispn().getSiteView().size() == count, 5, TimeUnit.MINUTES);
     }
 
-    private void scaleDownGossipRouter(DatacenterInfo datacenter) throws InterruptedException {
+    protected void scaleGossipRouter(DatacenterInfo datacenter, int replicas) {
         var oc = datacenter.oc();
-        scaleDeployment(oc, "infinispan-operator-controller-manager", OPERATORS_NS, 0);
-        scaleDeployment(oc, "infinispan-router", datacenter.namespace(), 0);
-    }
-
-    private void scaleUpGossipRouter(DatacenterInfo datacenter) throws InterruptedException {
-        var oc = datacenter.oc();
-        scaleDeployment(oc, "infinispan-operator-controller-manager", OPERATORS_NS, 1);
-        scaleDeployment(oc, "infinispan-router", datacenter.namespace(), 1);
-    }
-
-    private void scaleDeployment(KubernetesClient k8s, String name, String namespace, int replicas) throws InterruptedException {
-        k8s.apps()
-              .deployments()
-              .inNamespace(namespace)
-              .withName(name)
-              .scale(replicas);
-
-        k8s.apps()
-              .deployments()
-              .inNamespace(namespace)
-              .withName(name)
-              .waitUntilReady(30, TimeUnit.SECONDS);
+        K8sUtils.scaleDeployment(oc, "infinispan-operator-controller-manager", "openshift-operators", replicas);
+        K8sUtils.scaleDeployment(oc, "infinispan-router", datacenter.namespace(), replicas);
     }
 }