diff --git a/pkg/monitor/cluster/certificateexpirationstatuses.go b/pkg/monitor/cluster/certificateexpirationstatuses.go index 120710f835c..fe18949f1bf 100644 --- a/pkg/monitor/cluster/certificateexpirationstatuses.go +++ b/pkg/monitor/cluster/certificateexpirationstatuses.go @@ -4,18 +4,22 @@ import ( "context" "crypto/x509" "fmt" + "math" "strings" "time" operatorv1 "github.com/openshift/api/operator/v1" corev1 "k8s.io/api/core/v1" kerrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "sigs.k8s.io/controller-runtime/pkg/client" "github.com/Azure/ARO-RP/pkg/operator" "github.com/Azure/ARO-RP/pkg/operator/controllers/genevalogging" + utilcert "github.com/Azure/ARO-RP/pkg/util/cert" "github.com/Azure/ARO-RP/pkg/util/dns" "github.com/Azure/ARO-RP/pkg/util/pem" + "github.com/Azure/ARO-RP/pkg/util/version" ) // Copyright (c) Microsoft Corporation. @@ -94,3 +98,55 @@ func secretMissingMetric(namespace, name string) map[string]string { "name": name, } } + +func (mon *Monitor) emitEtcdCertificateExpiry(ctx context.Context) error { + cv, err := mon.getClusterVersion(ctx) + if err != nil { + return err + } + v, err := version.ParseVersion(actualVersion(cv)) + if err != nil { + return err + } + // ETCD ceritificates are autorotated by the operator when close to expiry for cluster running 4.9+ + if !v.Lt(version.NewVersion(4, 9)) { + return nil + } + + secretList, err := mon.cli.CoreV1().Secrets("openshift-etcd").List(ctx, metav1.ListOptions{FieldSelector: fmt.Sprintf("type=%s", corev1.SecretTypeTLS)}) + if err != nil { + return err + } + + certNearExpiry := false + minDaysUntilExpiration := math.MaxInt + for _, secret := range secretList.Items { + if strings.Contains(secret.ObjectMeta.Name, "etcd-peer") || strings.Contains(secret.ObjectMeta.Name, "etcd-serving") { + _, certs, err := pem.Parse(secret.Data[corev1.TLSCertKey]) + if err != nil { + return err + } + if utilcert.IsLessThanMinimumDuration(certs[0], utilcert.DefaultMinDurationPercent) { + certNearExpiry = true + minDaysUntilExpiration = min(utilcert.DaysUntilExpiration(certs[0]), minDaysUntilExpiration) + } + } + } + + if certNearExpiry { + mon.emitGauge("certificate.expirationdate", 1, map[string]string{ + "daysUntilExpiration": fmt.Sprintf("%d", minDaysUntilExpiration), + "namespace": "openshift-etcd", + "name": "openshift-etcd-certificate", + }) + } + + return nil +} + +func min(a, b int) int { + if a < b { + return a + } + return b +} diff --git a/pkg/monitor/cluster/certificateexpirationstatuses_test.go b/pkg/monitor/cluster/certificateexpirationstatuses_test.go index 850e3df9aae..74ff04c1991 100644 --- a/pkg/monitor/cluster/certificateexpirationstatuses_test.go +++ b/pkg/monitor/cluster/certificateexpirationstatuses_test.go @@ -4,13 +4,17 @@ import ( "context" "crypto/x509" "encoding/pem" + "fmt" "testing" "time" "github.com/golang/mock/gomock" + configv1 "github.com/openshift/api/config/v1" operatorv1 "github.com/openshift/api/operator/v1" + configfake "github.com/openshift/client-go/config/clientset/versioned/fake" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + fakeClient "k8s.io/client-go/kubernetes/fake" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/fake" @@ -231,3 +235,74 @@ func buildMonitor(m *mock_metrics.MockEmitter, domain, id string, secrets ...cli }, } } + +func TestEtcdCertificateExpiry(t *testing.T) { + ctx := context.Background() + expiration := time.Now().Add(time.Microsecond * 60) + _, cert, err := utiltls.GenerateTestKeyAndCertificate("etcd-cert", nil, nil, false, false, tweakTemplateFn(expiration)) + if err != nil { + t.Fatal(err) + } + + for _, tt := range []struct { + name string + configcli *configfake.Clientset + cli *fakeClient.Clientset + minDaysUntilExpiration int + }{ + { + name: "emit etcd certificate expiry", + configcli: configfake.NewSimpleClientset( + &configv1.ClusterVersion{ + ObjectMeta: metav1.ObjectMeta{ + Name: "version", + }, + Status: configv1.ClusterVersionStatus{ + History: []configv1.UpdateHistory{ + { + State: configv1.CompletedUpdate, + Version: "4.8.1", + }, + }, + }, + }, + ), + cli: fakeClient.NewSimpleClientset( + &corev1.Secret{ + ObjectMeta: metav1.ObjectMeta{ + Name: "etcd-peer-master-0", + Namespace: "openshift-etcd", + }, + Data: map[string][]byte{ + corev1.TLSCertKey: pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: cert[0].Raw}), + }, + Type: corev1.SecretTypeTLS, + }, + ), + minDaysUntilExpiration: 0, + }, + } { + t.Run(tt.name, func(t *testing.T) { + controller := gomock.NewController(t) + defer controller.Finish() + + m := mock_metrics.NewMockEmitter(controller) + mon := &Monitor{ + cli: tt.cli, + configcli: tt.configcli, + m: m, + } + + m.EXPECT().EmitGauge("certificate.expirationdate", int64(1), map[string]string{ + "daysUntilExpiration": fmt.Sprintf("%d", tt.minDaysUntilExpiration), + "namespace": "openshift-etcd", + "name": "openshift-etcd-certificate", + }) + + err = mon.emitEtcdCertificateExpiry(ctx) + if err != nil { + t.Fatal(err) + } + }) + } +} diff --git a/pkg/monitor/cluster/cluster.go b/pkg/monitor/cluster/cluster.go index 89a320ebd9f..06bd66558a3 100644 --- a/pkg/monitor/cluster/cluster.go +++ b/pkg/monitor/cluster/cluster.go @@ -183,6 +183,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) { mon.emitOperatorFlagsAndSupportBanner, mon.emitPucmState, mon.emitCertificateExpirationStatuses, + mon.emitEtcdCertificateExpiry, mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable } { err = f(ctx) diff --git a/pkg/util/cert/cert.go b/pkg/util/cert/cert.go new file mode 100644 index 00000000000..8d434fd5465 --- /dev/null +++ b/pkg/util/cert/cert.go @@ -0,0 +1,27 @@ +package cert + +// Copyright (c) Microsoft Corporation. +// Licensed under the Apache License 2.0. + +import ( + "crypto/x509" + "time" +) + +const DefaultMinDurationPercent = 0.20 + +// IsLessThanMinimumDuration indicates whether the provided cert has less +// than the provided minimum percentage of its duration remaining. +func IsLessThanMinimumDuration(cert *x509.Certificate, minDurationPercent float64) bool { + duration := cert.NotAfter.Sub(cert.NotBefore) + minDuration := time.Duration(float64(duration.Nanoseconds()) * DefaultMinDurationPercent) + return time.Now().After(cert.NotAfter.Add(-minDuration)) +} + +func IsCertExpired(cert *x509.Certificate) bool { + return time.Now().After(cert.NotAfter) +} + +func DaysUntilExpiration(cert *x509.Certificate) int { + return int(time.Until(cert.NotAfter) / (24 * time.Hour)) +}