-
Notifications
You must be signed in to change notification settings - Fork 169
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add etcdRecovery maintenance type for admin update - ARO-1534
In the event that a master node changes IP addresses (or NICs), the etcd quorum will become degraded. The node with the change will then have its etcd pod in a crashloop. This is due to the hardcoded etcd spec. This PR adds the EtcdRecovery maintenance task type to remediate this issue. How it works: 1. Verify this is the issue by comparing etcd's env variables to the node's IP address. A degradedEtcd object is returned with relevant information. 1. Create a batch job to back up etcd's data directory and move the etcd manifest to stop the pod from crash looping. 1. A batch job is created to run a pod that SSHes into the peer etcd containers to remove the failing node from its member list. 1. Secrets for the failing pod are deleted. 1. Etcd is patched. There is currently no endpoint to access this recovery task; an endpoint will be added in a later PR. Additional scenarios handled: - Sometimes the etcd deployment can remediate itself after an IP address change, but there is still data present from the previous IP address's member. This results in 4/5 containers running in the pod with the etcd container failing, but no IP address conflicts to use for remediation. Added code to find the failing member based on the conditions if no conflict is found. - Check for multiple etcd pods with IP mismatches. - Wait for jobs to reach a succeeded state, which happens when the shell script exits with code 0. If this never happens, the context is cancelled. - Return container log files to the user from jobs.
- Loading branch information
1 parent
19751c9
commit 2d5491a
Showing
11 changed files
with
1,607 additions
and
10 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
package frontend | ||
|
||
// Copyright (c) Microsoft Corporation. | ||
// Licensed under the Apache License 2.0. | ||
|
||
import ( | ||
"context" | ||
"net/http" | ||
"path/filepath" | ||
"strings" | ||
|
||
"github.com/go-chi/chi/v5" | ||
operatorclient "github.com/openshift/client-go/operator/clientset/versioned" | ||
"github.com/sirupsen/logrus" | ||
|
||
"github.com/Azure/ARO-RP/pkg/api" | ||
"github.com/Azure/ARO-RP/pkg/database/cosmosdb" | ||
"github.com/Azure/ARO-RP/pkg/frontend/middleware" | ||
"github.com/Azure/ARO-RP/pkg/util/restconfig" | ||
) | ||
|
||
func (f *frontend) postAdminOpenShiftClusterEtcdRecovery(w http.ResponseWriter, r *http.Request) { | ||
ctx := r.Context() | ||
log := ctx.Value(middleware.ContextKeyLog).(*logrus.Entry) | ||
r.URL.Path = filepath.Dir(r.URL.Path) | ||
|
||
b, err := f._postAdminOpenShiftClusterEtcdRecovery(ctx, r, log) | ||
|
||
adminReply(log, w, nil, b, err) | ||
} | ||
|
||
func (f *frontend) _postAdminOpenShiftClusterEtcdRecovery(ctx context.Context, r *http.Request, log *logrus.Entry) ([]byte, error) { | ||
resType, resName, resGroupName := chi.URLParam(r, "resourceType"), chi.URLParam(r, "resourceName"), chi.URLParam(r, "resourceGroupName") | ||
resourceID := strings.TrimPrefix(r.URL.Path, "/admin") | ||
|
||
doc, err := f.dbOpenShiftClusters.Get(ctx, resourceID) | ||
switch { | ||
case cosmosdb.IsErrorStatusCode(err, http.StatusNotFound): | ||
return []byte{}, api.NewCloudError(http.StatusNotFound, api.CloudErrorCodeResourceNotFound, "", "The Resource '%s/%s' under resource group '%s' was not found.", resType, resName, resGroupName) | ||
case err != nil: | ||
return []byte{}, err | ||
} | ||
kubeActions, err := f.kubeActionsFactory(log, f.env, doc.OpenShiftCluster) | ||
if err != nil { | ||
return []byte{}, err | ||
} | ||
|
||
gvr, err := kubeActions.ResolveGVR("Etcd") | ||
if err != nil { | ||
return []byte{}, err | ||
} | ||
|
||
err = validateAdminKubernetesObjects(r.Method, gvr, namespaceEtcds, "cluster") | ||
if err != nil { | ||
return []byte{}, err | ||
} | ||
|
||
restConfig, err := restconfig.RestConfig(f.env, doc.OpenShiftCluster) | ||
if err != nil { | ||
return []byte{}, err | ||
} | ||
|
||
operatorcli, err := operatorclient.NewForConfig(restConfig) | ||
if err != nil { | ||
return []byte{}, err | ||
} | ||
|
||
return f.fixEtcd(ctx, log, f.env, doc, kubeActions, operatorcli.OperatorV1().Etcds()) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.