diff --git a/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go b/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go new file mode 100644 index 00000000000..fd89292edb5 --- /dev/null +++ b/pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go @@ -0,0 +1,367 @@ +package frontend + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "encoding/json" + "fmt" + "net/http" + "path/filepath" + "strings" + "time" + + "github.com/go-chi/chi/v5" + configv1 "github.com/openshift/api/config/v1" + operatorv1 "github.com/openshift/api/operator/v1" + "github.com/sirupsen/logrus" + "github.com/ugorji/go/codec" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + kruntime "k8s.io/apimachinery/pkg/runtime" + + "github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/database/cosmosdb" + "github.com/Azure/ARO-RP/pkg/frontend/adminactions" + "github.com/Azure/ARO-RP/pkg/frontend/middleware" + utilcert "github.com/Azure/ARO-RP/pkg/util/cert" + utilpem "github.com/Azure/ARO-RP/pkg/util/pem" + "github.com/Azure/ARO-RP/pkg/util/steps" + "github.com/Azure/ARO-RP/pkg/util/version" +) + +type etcdrenew struct { + log *logrus.Entry + k adminactions.KubeActions + secretNames []string + mode string + backupSecrets map[string][]byte +} + +var etcdOperatorControllerConditionsExpected = map[string]operatorv1.ConditionStatus{ + "EtcdCertSignerControllerDegraded": operatorv1.ConditionFalse, + "EtcdMembersAvailable": operatorv1.ConditionTrue, + "NodeInstallerProgressing": operatorv1.ConditionFalse, + "NodeControllerDegraded": operatorv1.ConditionFalse, + "EtcdMembersProgressing": operatorv1.ConditionFalse, +} + +var etcdOperatorConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{ + configv1.OperatorAvailable: configv1.ConditionTrue, + configv1.OperatorProgressing: configv1.ConditionFalse, + configv1.OperatorDegraded: configv1.ConditionFalse, +} + 
+func (f *frontend) postAdminOpenShiftClusterEtcdCertificateRenew(w http.ResponseWriter, r *http.Request) { + ctx := r.Context() + log := ctx.Value(middleware.ContextKeyLog).(*logrus.Entry) + r.URL.Path = filepath.Dir(r.URL.Path) + + err := f._postAdminOpenShiftClusterEtcdCertificateRenew(ctx, r, log) + + adminReply(log, w, nil, nil, err) +} + +// validate cluster is <4.9 and etcd is in expected state +func (e *etcdrenew) validate(ctx context.Context) error { + s := []steps.Step{ + steps.Action(e.validateEtcdOperatorControllersState), + steps.Action(e.validateEtcdOperatorState), + steps.Action(e.validateEtcdCertsExistsAndExpiry), + } + _, err := steps.Run(ctx, e.log, 10*time.Second, s, nil) + if err != nil { + return err + } + return nil +} + +func (e *etcdrenew) isRenewed(ctx context.Context) error { + s := []steps.Step{ + steps.Condition(e.isRevisied, 30*time.Minute, true), + } + _, err := steps.Run(ctx, e.log, 10*time.Second, s, nil) + if err != nil { + return err + } + return nil +} + +func (e *etcdrenew) backupAndDelete(ctx context.Context) error { + s := []steps.Step{ + steps.Action(e.backupEtcdSecrets), + steps.Action(e.deleteEtcdSecrets), + } + _, err := steps.Run(ctx, e.log, 10*time.Second, s, nil) + if err != nil { + return err + } + return nil +} + +func (f *frontend) _postAdminOpenShiftClusterEtcdCertificateRenew(ctx context.Context, r *http.Request, log *logrus.Entry) error { + resourceName := chi.URLParam(r, "resourceName") + resourceType := chi.URLParam(r, "resourceType") + resourceGroupName := chi.URLParam(r, "resourceGroupName") + + resourceID := strings.TrimPrefix(r.URL.Path, "/admin") + + doc, err := f.dbOpenShiftClusters.Get(ctx, resourceID) + switch { + case cosmosdb.IsErrorStatusCode(err, http.StatusNotFound): + return api.NewCloudError(http.StatusNotFound, api.CloudErrorCodeResourceNotFound, "", "The Resource '%s/%s' under resource group '%s' was not found.", resourceType, resourceName, resourceGroupName) + case err != nil: + return err + } + 
+ k, err := f.kubeActionsFactory(log, f.env, doc.OpenShiftCluster) + if err != nil { + return err + } + e := &etcdrenew{ + log: log, + k: k, + secretNames: nil, + mode: "renew", + } + + if err = e.validateClusterVersion(ctx); err != nil { + return err + } + if err = e.validate(ctx); err != nil { + return err + } + + // Fetch secretNames using nodeNames + masterNodeNames, err := fetchNodeNames(ctx, k, log) + if err != nil { + return err + } + if len(masterNodeNames) != 3 { + return api.NewCloudError(http.StatusForbidden, api.CloudErrorCodeForbidden, "", "The cluster doesn't have 3 master nodes") + } + + for _, nodeName := range masterNodeNames { + for _, prefix := range []string{"etcd-peer-", "etcd-serving-", "etcd-serving-metrics-"} { + e.secretNames = append(e.secretNames, prefix+nodeName) + } + } + + // backup and delete etcd secrets + if err = e.backupAndDelete(ctx); err != nil { + return err + } + + // Calling Sleep method + e.log.Infoln("Entering sleep... 3mins") + time.Sleep(3 * time.Minute) + + if err = e.isRenewed(ctx); err != nil { + e.mode = "recovery" + } else { + e.mode = "renewed" + } + + if e.mode == "renewed" { + if err = e.validate(ctx); err != nil { + return err + } + e.log.Infoln("Done") + } + + if e.mode == "recovery" { + e.log.Println("Attempting to recover from backup") + if err = e.recoverEtcdSecrets(ctx); err != nil { + return err + } + e.log.Infoln("Recovered") + } + + return nil +} + +func (e *etcdrenew) validateClusterVersion(ctx context.Context) error { + e.log.Infoln("validating cluster version now") + rawCV, err := e.k.KubeGet(ctx, "clusterversion", "", "version") + if err != nil { + return err + } + cv := &configv1.ClusterVersion{} + err = codec.NewDecoderBytes(rawCV, &codec.JsonHandle{}).Decode(cv) + if err != nil { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode clusterversion, %s", err.Error())) + } + clusterVersion, err := 
version.GetClusterVersion(cv) + if err != nil { + return err + } + // ETCD ceritificates are autorotated by the operator when close to expiry for cluster running 4.9+ + if clusterVersion.Lt(version.NewVersion(4, 9)) { + return api.NewCloudError(http.StatusForbidden, api.CloudErrorCodeForbidden, "", "etcd certificate renewal is not needed for cluster running version 4.9+") + } + return nil +} + +func (e *etcdrenew) validateEtcdOperatorControllersState(ctx context.Context) error { + e.log.Infoln("validating etcdOperator Controllers state now") + rawEtcd, err := e.k.KubeGet(ctx, "Etcd", "", "cluster") + if err != nil { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error()) + } + etcd := &operatorv1.Etcd{} + err = codec.NewDecoderBytes(rawEtcd, &codec.JsonHandle{}).Decode(etcd) + if err != nil { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode etcd object, %s", err.Error())) + } + for _, c := range etcd.Status.Conditions { + if _, ok := etcdOperatorControllerConditionsExpected[c.Type]; !ok { + continue + } + if etcdOperatorControllerConditionsExpected[c.Type] != c.Status && e.mode == "renewed" { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "%s is in state %s, quiting.", c.Type, c.Status) + } + } + return nil +} + +func (e *etcdrenew) isRevisied(ctx context.Context) (bool, error) { + isAtRevision := true + rawEtcd, err := e.k.KubeGet(ctx, "Etcd", "", "cluster") + if err != nil { + return false, api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error()) + } + etcd := &operatorv1.Etcd{} + err = codec.NewDecoderBytes(rawEtcd, &codec.JsonHandle{}).Decode(etcd) + if err != nil { + return false, api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode etcd object, %s", 
err.Error())) + } + for _, s := range etcd.Status.NodeStatuses { + if s.CurrentRevision != etcd.Status.LatestAvailableRevision { + isAtRevision = false + } + } + return isAtRevision, nil +} + +func (e *etcdrenew) validateEtcdOperatorState(ctx context.Context) error { + e.log.Infoln("validating Etcd Operator state") + rawEtcdOperator, err := e.k.KubeGet(ctx, "clusteroperator", "", "etcd") + if err != nil { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error()) + } + etcdOperator := &configv1.ClusterOperator{} + err = codec.NewDecoderBytes(rawEtcdOperator, &codec.JsonHandle{}).Decode(etcdOperator) + if err != nil { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode etcd operator, %s", err.Error())) + } + for _, c := range etcdOperator.Status.Conditions { + if _, ok := etcdOperatorConditionsExpected[c.Type]; !ok { + continue + } + if etcdOperatorConditionsExpected[c.Type] != c.Status && e.mode == "renewed" { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "Etcd Operator is not in expected state, quiting.") + } + } + return nil +} + +func fetchNodeNames(ctx context.Context, k adminactions.KubeActions, log *logrus.Entry) ([]string, error) { + var masterNodeNames []string + var u unstructured.Unstructured + var nodes corev1.NodeList + + nodeList, err := k.KubeList(ctx, "node", "") + if err != nil { + return nil, err + } + if err = json.Unmarshal(nodeList, &u); err != nil { + return nil, err + } + err = kruntime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &nodes) + if err != nil { + return nil, err + } + + for _, node := range nodes.Items { + if _, ok := node.ObjectMeta.Labels["node-role.kubernetes.io/master"]; ok { + masterNodeNames = append(masterNodeNames, node.ObjectMeta.Name) + continue + } + } + return masterNodeNames, nil +} + +func (e *etcdrenew) 
validateEtcdCertsExistsAndExpiry(ctx context.Context) error { + e.log.Infoln("validating etcd certs exists, not expired but are not close to expiry") + for _, secretname := range e.secretNames { + cert, err := e.k.KubeGet(ctx, "Secret", namespaceEtcds, secretname) + if err != nil { + return err + } + + var u unstructured.Unstructured + var secret corev1.Secret + if err = json.Unmarshal(cert, &u); err != nil { + return err + } + err = kruntime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &secret) + if err != nil { + return err + } + _, certData, err := utilpem.Parse(secret.Data[corev1.TLSCertKey]) + if err != nil { + return err + } + if !utilcert.IsLessThanMinimumDuration(certData[0], utilcert.DefaultMinDurationPercent) && e.mode == "renewed" { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "secret %s is not near expiry, quitting", secretname) + } + if utilcert.IsCertExpired(certData[0]) { + return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "secret %s is already expired, quitting", secretname) + } + } + + return nil +} + +func (e *etcdrenew) backupEtcdSecrets(ctx context.Context) error { + e.log.Infoln("backing up etcd secrets now") + for _, secretname := range e.secretNames { + e.log.Infof("Backing up secret %s", secretname) + data, err := e.k.KubeGet(ctx, "Secret", namespaceEtcds, secretname) + if err != nil { + return err + } + e.backupSecrets[secretname] = data + } + return nil +} + +func (e *etcdrenew) deleteEtcdSecrets(ctx context.Context) error { + e.log.Infoln("deleting etcd secrets now") + for _, secretname := range e.secretNames { + e.log.Infof("Deleting secret %s", secretname) + err := e.k.KubeDelete(ctx, "Secret", namespaceEtcds, secretname, false, nil) + if err != nil { + return err + } + } + return nil +} + +func (e *etcdrenew) recoverEtcdSecrets(ctx context.Context) error { + e.log.Infoln("recovering etcd secrets now") + for secretname, 
data := range e.backupSecrets { + e.log.Infof("Recovering secret %s", secretname) + obj := &unstructured.Unstructured{} + err := obj.UnmarshalJSON(data) + if err != nil { + return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidRequestContent, "", "The request content was invalid and could not be deserialized: %q.", err) + } + err = e.k.KubeCreateOrUpdate(ctx, obj) + if err != nil { + return err + } + } + return nil +} diff --git a/pkg/frontend/frontend.go b/pkg/frontend/frontend.go index e7d1d720c03..d5b5f46ef75 100644 --- a/pkg/frontend/frontend.go +++ b/pkg/frontend/frontend.go @@ -334,6 +334,8 @@ func (f *frontend) chiAuthenticatedRoutes(router chi.Router) { r.With(f.maintenanceMiddleware.UnplannedMaintenanceSignal).Post("/cordonnode", f.postAdminOpenShiftClusterCordonNode) r.With(f.maintenanceMiddleware.UnplannedMaintenanceSignal).Post("/drainnode", f.postAdminOpenShiftClusterDrainNode) + + r.With(f.maintenanceMiddleware.UnplannedMaintenanceSignal).Post("/etcdcertificaterenew", f.postAdminOpenShiftClusterEtcdCertificateRenew) }) }) diff --git a/pkg/util/cert/cert.go b/pkg/util/cert/cert.go new file mode 100644 index 00000000000..8d434fd5465 --- /dev/null +++ b/pkg/util/cert/cert.go @@ -0,0 +1,27 @@ +package cert + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "crypto/x509" + "time" +) + +const DefaultMinDurationPercent = 0.20 + +// IsLessThanMinimumDuration indicates whether the provided cert has less +// than the provided minimum percentage of its duration remaining. 
// The threshold is minDurationPercent, expressed as a fraction of the
// certificate's total validity period (NotAfter - NotBefore): for example,
// 0.20 yields true once less than 20% of that period remains. A certificate
// that is already expired yields true for any non-negative fraction.
func IsLessThanMinimumDuration(cert *x509.Certificate, minDurationPercent float64) bool {
	duration := cert.NotAfter.Sub(cert.NotBefore)
	// Use the caller-supplied fraction; the package default is only what
	// callers pass when they have no preference of their own.
	minDuration := time.Duration(float64(duration.Nanoseconds()) * minDurationPercent)
	return time.Now().After(cert.NotAfter.Add(-minDuration))
}

// IsCertExpired reports whether the current time is past the certificate's
// NotAfter timestamp.
func IsCertExpired(cert *x509.Certificate) bool {
	return time.Now().After(cert.NotAfter)
}

// DaysUntilExpiration returns the number of whole 24-hour periods left until
// the certificate's NotAfter timestamp. The value is truncated toward zero
// and is negative for a certificate that expired more than a day ago.
func DaysUntilExpiration(cert *x509.Certificate) int {
	return int(time.Until(cert.NotAfter) / (24 * time.Hour))
}