diff --git a/pkg/monitor/cluster/cluster.go b/pkg/monitor/cluster/cluster.go index 209a550ead9..65e3bd532f0 100644 --- a/pkg/monitor/cluster/cluster.go +++ b/pkg/monitor/cluster/cluster.go @@ -175,6 +175,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) { mon.emitHiveRegistrationStatus, mon.emitOperatorFlagsAndSupportBanner, mon.emitPucmState, + mon.emitEtcdCertificateExpiry, mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable } { err = f(ctx) diff --git a/pkg/monitor/cluster/etcdcertificateexpiry.go b/pkg/monitor/cluster/etcdcertificateexpiry.go new file mode 100644 index 00000000000..dd61b4e3e9d --- /dev/null +++ b/pkg/monitor/cluster/etcdcertificateexpiry.go @@ -0,0 +1,70 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "fmt" + "math" + "strings" + + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + utilcert "github.com/Azure/ARO-RP/pkg/util/certificate" + utilpem "github.com/Azure/ARO-RP/pkg/util/pem" + "github.com/Azure/ARO-RP/pkg/util/version" +) + +func (mon *Monitor) emitEtcdCertificateExpiry(ctx context.Context) error { + cv, err := mon.getClusterVersion(ctx) + if err != nil { + return err + } + v, err := version.ParseVersion(actualVersion(cv)) + if err != nil { + return err + } + // etcd certificates are auto-rotated by the operator when close to expiry on clusters running 4.9+, so only pre-4.9 clusters are checked + if !v.Lt(version.NewVersion(4, 9)) { + return nil + } + + secretList, err := mon.cli.CoreV1().Secrets("openshift-etcd").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("type=%s", corev1.SecretTypeTLS)}) + if err != nil { + return err + } + + certNearExpiry := false + minDaysUntilExpiration := math.MaxInt + for _, secret := range secretList.Items { + if strings.Contains(secret.ObjectMeta.Name, "etcd-peer") || strings.Contains(secret.ObjectMeta.Name, "etcd-serving") { + _, certs, err := 
utilpem.Parse(secret.Data[corev1.TLSCertKey]) + if err != nil { + return err + } + if utilcert.IsLessThanMinimumDuration(certs[0], utilcert.DefaultMinDurationPercent) { + certNearExpiry = true + minDaysUntilExpiration = min(utilcert.DaysUntilExpiration(certs[0]), minDaysUntilExpiration) + } + } + } + + if certNearExpiry { + mon.emitGauge("certificate.expirationdate", 1, map[string]string{ + "daysUntilExpiration": fmt.Sprintf("%d", minDaysUntilExpiration), + "namespace": "openshift-etcd", + "name": "openshift-etcd-certificate", + }) + } + + return nil +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/pkg/monitor/cluster/etcdcertificateexpiry_test.go b/pkg/monitor/cluster/etcdcertificateexpiry_test.go new file mode 100644 index 00000000000..75a58bc8da0 --- /dev/null +++ b/pkg/monitor/cluster/etcdcertificateexpiry_test.go @@ -0,0 +1,100 @@ +package cluster + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "context" + "crypto/x509" + "encoding/pem" + "fmt" + "testing" + "time" + + "github.com/golang/mock/gomock" + configv1 "github.com/openshift/api/config/v1" + configfake "github.com/openshift/client-go/config/clientset/versioned/fake" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" + + mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics" + utiltls "github.com/Azure/ARO-RP/pkg/util/tls" +) + +func TestEtcdCertificateExpiry(t *testing.T) { + ctx := context.Background() + expiration := time.Now().Add(time.Microsecond * 60) + _, cert, err := utiltls.GenerateTestKeyAndCertificate("etcd-cert", nil, nil, false, false, tweakTemplateFn(expiration)) + if err != nil { + t.Fatal(err) + } + + for _, tt := range []struct { + name string + configcli *configfake.Clientset + cli *fake.Clientset + minDaysUntilExpiration int + }{ + { + name: "emit etcd certificate expiry", + configcli: configfake.NewSimpleClientset( + 
&configv1.ClusterVersion{ + ObjectMeta: metav1.ObjectMeta{ + Name: "version", + }, + Status: configv1.ClusterVersionStatus{ + History: []configv1.UpdateHistory{ + { + State: configv1.CompletedUpdate, + Version: "4.8.1", + }, + }, + }, + }, + ), + cli: fake.NewSimpleClientset( + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-peer-master-0", + Namespace: "openshift-etcd", + }, + Data: map[string][]byte{ + corev1.TLSCertKey: pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: cert[0].Raw}), + }, + Type: corev1.SecretTypeTLS, + }, + ), + minDaysUntilExpiration: 0, + }, + } { + t.Run(tt.name, func(t *testing.T) { + controller := gomock.NewController(t) + defer controller.Finish() + + m := mock_metrics.NewMockEmitter(controller) + mon := &Monitor{ + cli: tt.cli, + configcli: tt.configcli, + m: m, + } + + m.EXPECT().EmitGauge("certificate.expirationdate", int64(1), map[string]string{ + "daysUntilExpiration": fmt.Sprintf("%d", tt.minDaysUntilExpiration), + "namespace": "openshift-etcd", + "name": "openshift-etcd-certificate", + }) + + err = mon.emitEtcdCertificateExpiry(ctx) + if err != nil { + t.Fatal(err) + } + }) + } +} + +func tweakTemplateFn(expiration time.Time) func(*x509.Certificate) { + return func(template *x509.Certificate) { + template.NotAfter = expiration + } +} diff --git a/pkg/util/certificate/certificate.go b/pkg/util/certificate/certificate.go new file mode 100644 index 00000000000..94701b29b09 --- /dev/null +++ b/pkg/util/certificate/certificate.go @@ -0,0 +1,27 @@ +package certificate + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "crypto/x509" + "time" +) + +const DefaultMinDurationPercent = 0.20 + +// IsLessThanMinimumDuration indicates whether the provided cert has less +// than the provided minimum percentage of its duration remaining. 
// IsLessThanMinimumDuration reports whether cert has less than
// minDurationPercent of its total validity period (NotBefore..NotAfter)
// remaining, measured against the current wall-clock time.
//
// minDurationPercent is a fraction rather than a percentage: 0.20 means
// "less than 20% of the validity period is left".
func IsLessThanMinimumDuration(cert *x509.Certificate, minDurationPercent float64) bool {
	validity := cert.NotAfter.Sub(cert.NotBefore)
	// Use the caller-supplied fraction. The threshold used to be computed from
	// DefaultMinDurationPercent, which silently ignored the argument.
	minRemaining := time.Duration(float64(validity.Nanoseconds()) * minDurationPercent)
	return time.Now().After(cert.NotAfter.Add(-minRemaining))
}

// IsCertExpired reports whether the current time is after cert.NotAfter.
func IsCertExpired(cert *x509.Certificate) bool {
	return time.Now().After(cert.NotAfter)
}

// DaysUntilExpiration returns the number of whole 24-hour periods between now
// and cert.NotAfter. The integer division truncates toward zero, so a
// certificate that expires in under a day yields 0.
func DaysUntilExpiration(cert *x509.Certificate) int {
	return int(time.Until(cert.NotAfter) / (24 * time.Hour))
}