From daa240330c16d1e557563b69f1375aa6fd46ae6b Mon Sep 17 00:00:00 2001 From: Ryan Emerson Date: Tue, 2 Jul 2024 15:21:08 +0100 Subject: [PATCH] Create Active/Active operational procedure for resynchronizing sites Closes #876 Signed-off-by: Ryan Emerson Signed-off-by: Alexander Schwartz Co-authored-by: Alexander Schwartz --- .../running/bring-active-site-online.adoc | 2 +- .../modules/ROOT/pages/running/index.adoc | 1 + .../ROOT/pages/running/synchronize-sites.adoc | 75 +++++++++++ .../running/take-active-site-offline.adoc | 2 +- .../infinispan-cli-clear-caches.adoc | 99 +++++++++++++++ .../infinispan/infinispan-cli-connect.adoc | 22 ++++ .../infinispan-cli-state-transfer.adoc | 120 ++++++++++++++++++ 7 files changed, 319 insertions(+), 2 deletions(-) create mode 100644 doc/kubernetes/modules/ROOT/pages/running/synchronize-sites.adoc create mode 100644 doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-clear-caches.adoc create mode 100644 doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-connect.adoc create mode 100644 doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-state-transfer.adoc diff --git a/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc b/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc index dc193f112..356013a60 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/bring-active-site-online.adoc @@ -3,7 +3,7 @@ {description} -== When to use procedure +== When to use this procedure This procedure describes how to re-add a Keycloak site to the Global Accelerator, after it has previously been taken offline, so that it can once again service client requests. 
diff --git a/doc/kubernetes/modules/ROOT/pages/running/index.adoc b/doc/kubernetes/modules/ROOT/pages/running/index.adoc index 395a34aec..c4a37f0b7 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/index.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/index.adoc @@ -23,6 +23,7 @@ These guides will eventually be published Keycloak's main web page. [#operational-procedures] == Operational procedures +* xref:running/synchronize-sites.adoc[] * xref:running/take-active-site-offline.adoc[] * xref:running/bring-active-site-online.adoc[] diff --git a/doc/kubernetes/modules/ROOT/pages/running/synchronize-sites.adoc b/doc/kubernetes/modules/ROOT/pages/running/synchronize-sites.adoc new file mode 100644 index 000000000..20b305207 --- /dev/null +++ b/doc/kubernetes/modules/ROOT/pages/running/synchronize-sites.adoc @@ -0,0 +1,75 @@ +:project_name: Keycloak +:jdgserver_name: Infinispan +:stale-site: offline +:keep-site: active +:site-a-cr: site-a +:site-b-cr: site-b +:keep-site-name: {site-a-cr} +:stale-site-name: {site-b-cr} +:infinispan-xsite-docs: https://infinispan.org/docs/stable/titles/xsite/xsite.html +:ns: keycloak +:cluster-name: infinispan + += Resynchronize sites +:description: This guide describes the procedures required to synchronize an offline site with an active site. + +{description} + +== When to use this procedure + +Use this procedure when the {jdgserver_name} clusters of two sites have become disconnected and the contents of their caches are out of sync. +Perform it, for example, after a split-brain or when one site has been taken offline for maintenance. + +At the end of the procedure, the session contents on the offline site have been discarded and replaced by the session +contents of the active site. All caches in the offline site are cleared to prevent invalid cache contents. 
+ +== Procedures + +=== {jdgserver_name} Cluster + +For the context of this guide, `{keep-site-name}` is the currently active site and `{stale-site-name}` is an offline site that is not part +of the AWS Global Accelerator EndpointGroup and is therefore not receiving user requests. + +WARNING: Transferring state may impact {jdgserver_name} cluster performance by increasing the response time and/or resource usage. + +The first procedure is to delete the stale data from the offline site. + +. Log in to the offline site. + +. Shut down {project_name}. +This clears all {project_name} caches and prevents the {project_name} state from being out-of-sync with {jdgserver_name}. ++ +When deploying {project_name} using the {project_name} Operator, change the number of {project_name} instances in the {project_name} Custom Resource to 0. + +include::partial$infinispan/infinispan-cli-connect.adoc[] +include::partial$infinispan/infinispan-cli-clear-caches.adoc[] + +Now we are ready to transfer the state from the active site to the offline site. + +. Log in to the active site. + +include::partial$infinispan/infinispan-cli-connect.adoc[] + +include::partial$infinispan/infinispan-cli-state-transfer.adoc[] + +Now that the state is available in the offline site, {project_name} can be started again: + +. Log in to the offline site. + +. Start up {project_name}. ++ +When deploying {project_name} using the {project_name} Operator, change the number of {project_name} instances in the {project_name} Custom Resource to the original value. + +=== AWS Aurora Database + +No action required. + +=== AWS Global Accelerator + +Once the two sites have been synchronized, it is safe to add the previously offline site back to the Global Accelerator +EndpointGroup following the steps in the xref:running/bring-active-site-online.adoc[] guide. 
+ +== Further reading + +See https://www.keycloak.org/high-availability/concepts-infinispan-cli-batch[Concepts to automate {jdgserver_name} CLI commands] +on how to automate {jdgserver_name} CLI commands. diff --git a/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc b/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc index a404c73bb..b09a2f433 100644 --- a/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc +++ b/doc/kubernetes/modules/ROOT/pages/running/take-active-site-offline.adoc @@ -3,7 +3,7 @@ {description} -== When to use procedure +== When to use this procedure During the deployment lifecycle it might be required that one of the Active/Active sites is temporarily taken offline for maintenance or to allow for software upgrades. diff --git a/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-clear-caches.adoc b/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-clear-caches.adoc new file mode 100644 index 000000000..287541568 --- /dev/null +++ b/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-clear-caches.adoc @@ -0,0 +1,99 @@ +. Disable the replication from the {stale-site} site to the {keep-site} site by running the following command. +This prevents the clear request from reaching the {keep-site} site and deleting all the correct cached data. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site take-offline --all-caches --site={keep-site-name} +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "offlineClientSessions" : "ok", + "authenticationSessions" : "ok", + "sessions" : "ok", + "clientSessions" : "ok", + "work" : "ok", + "offlineSessions" : "ok", + "loginFailures" : "ok", + "actionTokens" : "ok" +} +---- + +. Check that the replication status is `offline`. 
++ +.Command: +[source,bash,subs="+attributes"] +---- +site status --all-caches --site={keep-site-name} +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "status" : "offline" +} +---- ++ +If the status is not `offline`, repeat the previous step. ++ +WARNING: Make sure the replication is `offline`, otherwise clearing the data will clear both sites. + +. Clear all the cached data in the {stale-site} site using the following commands: ++ +.Command: +[source,bash,subs="+attributes"] +---- +clearcache actionTokens +clearcache authenticationSessions +clearcache clientSessions +clearcache loginFailures +clearcache offlineClientSessions +clearcache offlineSessions +clearcache sessions +clearcache work +---- ++ +These commands do not print any output. + +. Re-enable the cross-site replication from the {stale-site} site to the {keep-site} site. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site bring-online --all-caches --site={keep-site-name} +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "offlineClientSessions" : "ok", + "authenticationSessions" : "ok", + "sessions" : "ok", + "clientSessions" : "ok", + "work" : "ok", + "offlineSessions" : "ok", + "loginFailures" : "ok", + "actionTokens" : "ok" +} +---- + +. Check that the replication status is `online`. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site status --all-caches --site={keep-site-name} +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "status" : "online" +} +---- diff --git a/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-connect.adoc b/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-connect.adoc new file mode 100644 index 000000000..167379818 --- /dev/null +++ b/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-connect.adoc @@ -0,0 +1,22 @@ +. 
Connect to the {jdgserver_name} cluster using the {jdgserver_name} CLI tool: ++ +.Command: +[source,bash,subs="+attributes"] +---- +kubectl -n {ns} exec -it pods/{cluster-name}-0 -- ./bin/cli.sh --trustall --connect https://127.0.0.1:11222 +---- ++ +It asks for the username and password for the {jdgserver_name} cluster. +These credentials are the ones set in the https://www.keycloak.org/high-availability/deploy-infinispan-kubernetes-crossdc[Deploy Infinispan for HA with the Infinispan Operator] +guide in the configuring credentials section. ++ +.Output: +[source,bash,subs="+attributes"] +---- +Username: developer +Password: +[{cluster-name}-0-29897@ISPN//containers/default]> +---- ++ +NOTE: The pod name depends on the cluster name defined in the {jdgserver_name} CR. +The connection can be done with any pod in the {jdgserver_name} cluster. diff --git a/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-state-transfer.adoc b/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-state-transfer.adoc new file mode 100644 index 000000000..1fc7cbaec --- /dev/null +++ b/doc/kubernetes/modules/ROOT/partials/infinispan/infinispan-cli-state-transfer.adoc @@ -0,0 +1,120 @@ +. Trigger the state transfer from the {keep-site} site to the {stale-site} site. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site push-site-state --all-caches --site={stale-site-name} +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "offlineClientSessions" : "ok", + "authenticationSessions" : "ok", + "sessions" : "ok", + "clientSessions" : "ok", + "work" : "ok", + "offlineSessions" : "ok", + "loginFailures" : "ok", + "actionTokens" : "ok" +} +---- + +. Check that the replication status is `online` for all caches. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site status --all-caches --site={stale-site-name} +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "status" : "online" +} +---- + +. 
Wait for the state transfer to complete by checking the output of the `push-site-status` command for all caches. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site push-site-status --cache=actionTokens +site push-site-status --cache=authenticationSessions +site push-site-status --cache=clientSessions +site push-site-status --cache=loginFailures +site push-site-status --cache=offlineClientSessions +site push-site-status --cache=offlineSessions +site push-site-status --cache=sessions +site push-site-status --cache=work +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +{ + "{stale-site-name}" : "OK" +} +---- ++ +Check the table in {infinispan-xsite-docs}#rest_v2_xsite_state_push_cross-site-operations-rest[this section of the Cross-Site Documentation] for the possible status values. ++ +If an error is reported, repeat the state transfer for that specific cache. ++ +.Command: +[source,bash,subs="+attributes"] +---- +site push-site-state --cache=<cache-name> --site={stale-site-name} +---- + +. Clear/reset the state transfer status with the following command: ++ +.Command: +[source,bash,subs="+attributes"] +---- +site clear-push-site-status --cache=actionTokens +site clear-push-site-status --cache=authenticationSessions +site clear-push-site-status --cache=clientSessions +site clear-push-site-status --cache=loginFailures +site clear-push-site-status --cache=offlineClientSessions +site clear-push-site-status --cache=offlineSessions +site clear-push-site-status --cache=sessions +site clear-push-site-status --cache=work +---- ++ +.Output: +[source,bash,subs="+attributes"] +---- +"ok" +"ok" +"ok" +"ok" +"ok" +"ok" +"ok" +"ok" +----