Skip to content

Commit

Permalink
GenevaAction for etcd Certificate Renewal
Browse files Browse the repository at this point in the history
  • Loading branch information
SrinivasAtmakuri committed Aug 31, 2023
1 parent e4d4c73 commit ebf86b3
Show file tree
Hide file tree
Showing 3 changed files with 397 additions and 0 deletions.
368 changes: 368 additions & 0 deletions pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,368 @@
package frontend

Check failure on line 1 in pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go

View workflow job for this annotation

GitHub Actions / validate-go

group 2: mixed import type

Check failure on line 1 in pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go

View workflow job for this annotation

GitHub Actions / validate-go

group 3: duplicate group or invalid group ordering

// Copyright (c) Microsoft Corporation.
// Licensed under the Apache License 2.0.

import (
"context"
"encoding/json"
"fmt"
"net/http"
"path/filepath"
"strings"
"time"

"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/database/cosmosdb"
"github.com/Azure/ARO-RP/pkg/frontend/adminactions"
"github.com/Azure/ARO-RP/pkg/frontend/middleware"
"github.com/Azure/ARO-RP/pkg/util/steps"
"github.com/go-chi/chi/v5"
configv1 "github.com/openshift/api/config/v1"
operatorv1 "github.com/openshift/api/operator/v1"
"github.com/sirupsen/logrus"
"github.com/ugorji/go/codec"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
kruntime "k8s.io/apimachinery/pkg/runtime"

utilcert "github.com/Azure/ARO-RP/pkg/util/cert"
utilpem "github.com/Azure/ARO-RP/pkg/util/pem"
"github.com/Azure/ARO-RP/pkg/util/version"
)

// etcdrenew carries the state shared by the individual steps of one etcd
// certificate renewal request.
type etcdrenew struct {
	// log is the logger every step writes its progress to.
	log *logrus.Entry

	// k issues the get/list/delete/create-or-update calls against the cluster.
	k adminactions.KubeActions

	// secretNames lists the etcd secrets that are validated, backed up and
	// deleted.
	secretNames []string

	// mode is the current phase of the request; the code in this file sets it
	// to "renew", "renewed" or "recovery".
	mode string

	// backupSecrets maps a secret name to the raw bytes returned by KubeGet
	// for it, so that the secret can be recreated if renewal fails.
	backupSecrets map[string][]byte
}

var (
	// etcdOperatorControllerConditionsExpected maps condition types found on
	// the "cluster" Etcd resource to the status each one is expected to have.
	// Condition types that are not listed here are ignored by the validation.
	etcdOperatorControllerConditionsExpected = map[string]operatorv1.ConditionStatus{
		"EtcdCertSignerControllerDegraded": operatorv1.ConditionFalse,
		"EtcdMembersAvailable":             operatorv1.ConditionTrue,
		"NodeInstallerProgressing":         operatorv1.ConditionFalse,
		"NodeControllerDegraded":           operatorv1.ConditionFalse,
		"EtcdMembersProgressing":           operatorv1.ConditionFalse,
	}

	// etcdOperatorConditionsExpected maps condition types found on the "etcd"
	// ClusterOperator to the status each one is expected to have.
	etcdOperatorConditionsExpected = map[configv1.ClusterStatusConditionType]configv1.ConditionStatus{
		configv1.OperatorAvailable:   configv1.ConditionTrue,
		configv1.OperatorProgressing: configv1.ConditionFalse,
		configv1.OperatorDegraded:    configv1.ConditionFalse,
	}
)

// postAdminOpenShiftClusterEtcdCertificateRenew is the HTTP handler for the
// admin etcd certificate renewal action. It pulls the logger out of the
// request context, strips the last element from the request path, runs the
// renewal and reports the outcome through adminReply.
func (f *frontend) postAdminOpenShiftClusterEtcdCertificateRenew(w http.ResponseWriter, r *http.Request) {
	reqCtx := r.Context()
	logger := reqCtx.Value(middleware.ContextKeyLog).(*logrus.Entry)

	// Drop the trailing path element; the remaining path is what the renewal
	// uses to derive the cluster resource ID.
	r.URL.Path = filepath.Dir(r.URL.Path)

	adminReply(logger, w, nil, nil, f._postAdminOpenShiftClusterEtcdCertificateRenew(reqCtx, r, logger))
}

// validate runs the etcd health checks as a step sequence: the Etcd operator
// controller conditions, the etcd ClusterOperator conditions and the etcd
// certificate secrets. The cluster version is checked separately by
// validateClusterVersion. It returns whatever error steps.Run reports.
func (e *etcdrenew) validate(ctx context.Context) error {
	checks := []steps.Step{
		steps.Action(e.validateEtcdOperatorControllersState),
		steps.Action(e.validateEtcdOperatorState),
		steps.Action(e.validateEtcdCertsExistsAndExpiry),
	}

	_, err := steps.Run(ctx, e.log, 10*time.Second, checks, nil)
	return err
}

// isRenewed runs a single steps.Condition built from isRevisied with a
// 30-minute duration and returns the error, if any, reported by steps.Run.
func (e *etcdrenew) isRenewed(ctx context.Context) error {
	waitForRevision := []steps.Step{
		steps.Condition(e.isRevisied, 30*time.Minute, true),
	}

	_, err := steps.Run(ctx, e.log, 10*time.Second, waitForRevision, nil)
	return err
}

// backupAndDelete first stores a copy of every etcd secret and then deletes
// them, as two sequential steps. It returns the error, if any, reported by
// steps.Run.
func (e *etcdrenew) backupAndDelete(ctx context.Context) error {
	actions := []steps.Step{
		steps.Action(e.backupEtcdSecrets),
		steps.Action(e.deleteEtcdSecrets),
	}

	_, err := steps.Run(ctx, e.log, 10*time.Second, actions, nil)
	return err
}

// _postAdminOpenShiftClusterEtcdCertificateRenew drives the etcd certificate
// renewal for the cluster addressed by the request.
//
// The sequence is:
//  1. load the cluster document and build the kube actions client;
//  2. check the cluster version;
//  3. derive the etcd secret names from the master node names;
//  4. validate the etcd operator state and the certificate secrets;
//  5. back up and delete the secrets;
//  6. wait, then poll until every etcd node reports the latest revision;
//  7. on success validate again, otherwise recreate the secrets from backup.
//
// It returns a 404 CloudError when the cluster document does not exist, a 403
// CloudError when the cluster does not have exactly three master nodes, and
// otherwise the first error returned by any step.
func (f *frontend) _postAdminOpenShiftClusterEtcdCertificateRenew(ctx context.Context, r *http.Request, log *logrus.Entry) error {
	resourceName := chi.URLParam(r, "resourceName")
	resourceType := chi.URLParam(r, "resourceType")
	resourceGroupName := chi.URLParam(r, "resourceGroupName")

	resourceID := strings.TrimPrefix(r.URL.Path, "/admin")

	doc, err := f.dbOpenShiftClusters.Get(ctx, resourceID)
	switch {
	case cosmosdb.IsErrorStatusCode(err, http.StatusNotFound):
		return api.NewCloudError(http.StatusNotFound, api.CloudErrorCodeResourceNotFound, "", "The Resource '%s/%s' under resource group '%s' was not found.", resourceType, resourceName, resourceGroupName)
	case err != nil:
		return err
	}

	k, err := f.kubeActionsFactory(log, f.env, doc.OpenShiftCluster)
	if err != nil {
		return err
	}
	e := &etcdrenew{
		log:  log,
		k:    k,
		mode: "renew",
		// Must be non-nil: backupEtcdSecrets stores into this map and a write
		// to a nil map panics.
		backupSecrets: make(map[string][]byte),
	}

	if err = e.validateClusterVersion(ctx); err != nil {
		return err
	}

	// Fetch secretNames using nodeNames
	masterNodeNames, err := fetchNodeNames(ctx, k, log)
	if err != nil {
		return err
	}
	if len(masterNodeNames) != 3 {
		return api.NewCloudError(http.StatusForbidden, api.CloudErrorCodeForbidden, "", "The cluster doesn't have 3 master nodes")
	}

	secretPrefixes := []string{"etcd-peer-", "etcd-serving-", "etcd-serving-metrics-"}
	e.secretNames = make([]string, 0, len(masterNodeNames)*len(secretPrefixes))
	for _, nodeName := range masterNodeNames {
		for _, prefix := range secretPrefixes {
			e.secretNames = append(e.secretNames, prefix+nodeName)
		}
	}

	// Validate only once secretNames is populated; otherwise the certificate
	// check iterates over an empty list and verifies nothing before the
	// secrets are deleted.
	if err = e.validate(ctx); err != nil {
		return err
	}

	// backup and delete etcd secrets
	if err = e.backupAndDelete(ctx); err != nil {
		return err
	}

	// Wait before polling for the new revision, but stop waiting as soon as
	// the request context is cancelled.
	e.log.Infoln("Entering sleep... 3mins")
	select {
	case <-ctx.Done():
		return ctx.Err()
	case <-time.After(3 * time.Minute):
	}

	if renewErr := e.isRenewed(ctx); renewErr != nil {
		e.mode = "recovery"
		// Record why renewal was abandoned instead of silently dropping it.
		e.log.Warnf("etcd did not reach the latest revision: %v", renewErr)
		e.log.Infoln("Attempting to recover from backup")
		if err = e.recoverEtcdSecrets(ctx); err != nil {
			return err
		}
		e.log.Infoln("Recovered")
		// NOTE(review): a successful recovery still reports success to the
		// caller even though the certificates were not renewed — confirm this
		// is the intended contract.
		return nil
	}

	e.mode = "renewed"
	if err = e.validate(ctx); err != nil {
		return err
	}
	e.log.Infoln("Done")

	return nil
}

// validateClusterVersion fetches the "version" ClusterVersion object and
// rejects the request with a 403 CloudError when the cluster runs 4.9 or
// later, because manual renewal is only meant for older clusters.
//
// It returns the KubeGet or version-parsing error unchanged, and a 500
// CloudError when the ClusterVersion object cannot be decoded.
func (e *etcdrenew) validateClusterVersion(ctx context.Context) error {
	e.log.Infoln("validating cluster version now")
	rawCV, err := e.k.KubeGet(ctx, "clusterversion", "", "version")
	if err != nil {
		return err
	}

	cv := &configv1.ClusterVersion{}
	if err = codec.NewDecoderBytes(rawCV, &codec.JsonHandle{}).Decode(cv); err != nil {
		// Pass the error text as an argument, not as the format string, so a
		// '%' inside it cannot be misread as a formatting directive.
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "failed to decode clusterversion, %s", err.Error())
	}

	clusterVersion, err := version.GetClusterVersion(cv)
	if err != nil {
		return err
	}

	// etcd certificates are auto-rotated by the operator when close to expiry
	// on clusters running 4.9+, so only versions below 4.9 may proceed.
	if !clusterVersion.Lt(version.NewVersion(4, 9)) {
		return api.NewCloudError(http.StatusForbidden, api.CloudErrorCodeForbidden, "", "etcd certificate renewal is not needed for cluster running version 4.9+")
	}
	return nil
}

// validateEtcdOperatorControllersState reads the "cluster" Etcd resource and
// compares its status conditions against
// etcdOperatorControllerConditionsExpected. Conditions that are not listed in
// that map are ignored, and a mismatch is only reported while the mode is
// "renewed".
//
// It returns a 500 CloudError when the resource cannot be fetched or decoded,
// or when a tracked condition has an unexpected status.
func (e *etcdrenew) validateEtcdOperatorControllersState(ctx context.Context) error {
	e.log.Infoln("validating etcdOperator Controllers state now")

	raw, err := e.k.KubeGet(ctx, "Etcd", "", "cluster")
	if err != nil {
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error())
	}

	etcd := &operatorv1.Etcd{}
	if err = codec.NewDecoderBytes(raw, &codec.JsonHandle{}).Decode(etcd); err != nil {
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode etcd object, %s", err.Error()))
	}

	for _, cond := range etcd.Status.Conditions {
		want, tracked := etcdOperatorControllerConditionsExpected[cond.Type]
		if !tracked || want == cond.Status || e.mode != "renewed" {
			continue
		}
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "%s is in state %s, quiting.", cond.Type, cond.Status)
	}
	return nil
}

func (e *etcdrenew) isRevisied(ctx context.Context) (bool, error) {
isAtRevision := true
rawEtcd, err := e.k.KubeGet(ctx, "Etcd", "", "cluster")
if err != nil {
return false, api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error())
}
etcd := &operatorv1.Etcd{}
err = codec.NewDecoderBytes(rawEtcd, &codec.JsonHandle{}).Decode(etcd)
if err != nil {
return false, api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode etcd object, %s", err.Error()))
}
for _, s := range etcd.Status.NodeStatuses {
fmt.Println("Latest Revision is %s", etcd.Status.LatestAvailableRevision)

Check failure on line 240 in pkg/frontend/admin_openshiftcluster_etcdcertificaterenew.go

View workflow job for this annotation

GitHub Actions / golangci-lint

printf: fmt.Println call has possible formatting directive %s (govet)
if s.CurrentRevision != etcd.Status.LatestAvailableRevision {
isAtRevision = false
}
}
return isAtRevision, nil
}

// validateEtcdOperatorState reads the "etcd" ClusterOperator and compares its
// status conditions against etcdOperatorConditionsExpected. Conditions that
// are not listed in that map are ignored, and a mismatch is only reported
// while the mode is "renewed".
//
// It returns a 500 CloudError when the operator cannot be fetched or decoded,
// or when a tracked condition has an unexpected status.
func (e *etcdrenew) validateEtcdOperatorState(ctx context.Context) error {
	e.log.Infoln("validating Etcd Operator state")

	raw, err := e.k.KubeGet(ctx, "clusteroperator", "", "etcd")
	if err != nil {
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", err.Error())
	}

	operator := &configv1.ClusterOperator{}
	if err = codec.NewDecoderBytes(raw, &codec.JsonHandle{}).Decode(operator); err != nil {
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", fmt.Sprintf("failed to decode etcd operator, %s", err.Error()))
	}

	for _, cond := range operator.Status.Conditions {
		want, tracked := etcdOperatorConditionsExpected[cond.Type]
		if !tracked || want == cond.Status || e.mode != "renewed" {
			continue
		}
		return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "Etcd Operator is not in expected state, quiting.")
	}
	return nil
}

// fetchNodeNames lists the cluster's nodes and returns the names of those
// carrying the "node-role.kubernetes.io/master" label. The result is nil when
// no node carries the label. The log parameter is currently unused.
//
// Errors from listing, JSON decoding or conversion to a NodeList are returned
// unchanged.
func fetchNodeNames(ctx context.Context, k adminactions.KubeActions, log *logrus.Entry) ([]string, error) {
	rawNodes, err := k.KubeList(ctx, "node", "")
	if err != nil {
		return nil, err
	}

	var obj unstructured.Unstructured
	if err = json.Unmarshal(rawNodes, &obj); err != nil {
		return nil, err
	}

	var nodeList corev1.NodeList
	if err = kruntime.DefaultUnstructuredConverter.FromUnstructured(obj.Object, &nodeList); err != nil {
		return nil, err
	}

	var masters []string
	for i := range nodeList.Items {
		node := &nodeList.Items[i]
		if _, isMaster := node.Labels["node-role.kubernetes.io/master"]; isMaster {
			masters = append(masters, node.Name)
		}
	}
	return masters, nil
}

// validateEtcdCertsExistsAndExpiry checks every secret in e.secretNames: the
// secret must exist, must hold at least one parsable certificate under the
// TLS certificate key, and that certificate must not be expired. While the
// mode is "renew" (before the secrets are deleted) the certificate must also
// be near expiry, otherwise there is nothing to renew.
//
// Errors from fetching, decoding or parsing a secret are returned unchanged;
// failed checks are reported as 500 CloudErrors.
func (e *etcdrenew) validateEtcdCertsExistsAndExpiry(ctx context.Context) error {
	e.log.Infoln("validating etcd certs exists, not expired but are not close to expiry")
	for _, secretname := range e.secretNames {
		cert, err := e.k.KubeGet(ctx, "Secret", namespaceEtcds, secretname)
		if err != nil {
			return err
		}

		var u unstructured.Unstructured
		if err = json.Unmarshal(cert, &u); err != nil {
			return err
		}

		var secret corev1.Secret
		if err = kruntime.DefaultUnstructuredConverter.FromUnstructured(u.Object, &secret); err != nil {
			return err
		}

		_, certData, err := utilpem.Parse(secret.Data[corev1.TLSCertKey])
		if err != nil {
			return err
		}
		// Guard the index below: a secret without certificate data must
		// produce an error, not an index-out-of-range panic.
		if len(certData) == 0 {
			return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "secret %s does not contain a certificate, quitting", secretname)
		}

		// Only the pre-renewal pass requires the certificate to be near
		// expiry. Freshly issued certificates are never near expiry, so
		// applying this check after renewal would fail every successful run.
		if e.mode == "renew" && !utilcert.IsLessThanMinimumDuration(certData[0], utilcert.DefaultMinDurationPercent) {
			return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "secret %s is not near expiry, quitting", secretname)
		}
		if utilcert.IsCertExpired(certData[0]) {
			return api.NewCloudError(http.StatusInternalServerError, api.CloudErrorCodeInternalServerError, "", "secret %s is already expired, quitting", secretname)
		}
	}

	return nil
}

// backupEtcdSecrets fetches every secret in e.secretNames and stores the raw
// bytes returned by KubeGet in e.backupSecrets, keyed by secret name. It
// stops at, and returns, the first KubeGet error.
func (e *etcdrenew) backupEtcdSecrets(ctx context.Context) error {
	e.log.Infoln("backing up etcd secrets now")

	// Writing to a nil map panics, so allocate it here when the caller did
	// not initialise it.
	if e.backupSecrets == nil {
		e.backupSecrets = make(map[string][]byte, len(e.secretNames))
	}

	for _, secretname := range e.secretNames {
		e.log.Infof("Backing up secret %s", secretname)
		data, err := e.k.KubeGet(ctx, "Secret", namespaceEtcds, secretname)
		if err != nil {
			return err
		}
		e.backupSecrets[secretname] = data
	}
	return nil
}

// deleteEtcdSecrets deletes every secret in e.secretNames from the etcd
// namespace, stopping at and returning the first KubeDelete error.
func (e *etcdrenew) deleteEtcdSecrets(ctx context.Context) error {
	e.log.Infoln("deleting etcd secrets now")
	for _, name := range e.secretNames {
		e.log.Infof("Deleting secret %s", name)
		if err := e.k.KubeDelete(ctx, "Secret", namespaceEtcds, name, false, nil); err != nil {
			return err
		}
	}
	return nil
}

// recoverEtcdSecrets recreates every secret held in e.backupSecrets through
// KubeCreateOrUpdate. It stops at the first failure, returning a 400
// CloudError when a backup cannot be deserialized and the KubeCreateOrUpdate
// error otherwise.
func (e *etcdrenew) recoverEtcdSecrets(ctx context.Context) error {
	e.log.Infoln("recovering etcd secrets now")
	for secretname, data := range e.backupSecrets {
		e.log.Infof("Recovering secret %s", secretname)
		obj := &unstructured.Unstructured{}
		if err := obj.UnmarshalJSON(data); err != nil {
			return api.NewCloudError(http.StatusBadRequest, api.CloudErrorCodeInvalidRequestContent, "", "The request content was invalid and could not be deserialized: %q.", err)
		}

		// The backup is the raw GET output and still carries the identity of
		// the deleted object. The Kubernetes API rejects creates that set a
		// resourceVersion, and a stale resourceVersion or uid conflicts on
		// update, so clear both before writing the secret back.
		obj.SetResourceVersion("")
		obj.SetUID("")

		if err := e.k.KubeCreateOrUpdate(ctx, obj); err != nil {
			return err
		}
	}
	return nil
}
2 changes: 2 additions & 0 deletions pkg/frontend/frontend.go
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,8 @@ func (f *frontend) chiAuthenticatedRoutes(router chi.Router) {
r.With(f.maintenanceMiddleware.UnplannedMaintenanceSignal).Post("/cordonnode", f.postAdminOpenShiftClusterCordonNode)

r.With(f.maintenanceMiddleware.UnplannedMaintenanceSignal).Post("/drainnode", f.postAdminOpenShiftClusterDrainNode)

r.With(f.maintenanceMiddleware.UnplannedMaintenanceSignal).Post("/etcdcertificaterenew", f.postAdminOpenShiftClusterEtcdCertificateRenew)
})
})

Expand Down
Loading

0 comments on commit ebf86b3

Please sign in to comment.