diff --git a/pkg/monitor/cluster/certificateexpirationstatuses.go b/pkg/monitor/cluster/certificateexpirationstatuses.go
new file mode 100644
index 00000000000..120710f835c
--- /dev/null
+++ b/pkg/monitor/cluster/certificateexpirationstatuses.go
@@ -0,0 +1,120 @@
+package cluster
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"context"
+	"crypto/x509"
+	"fmt"
+	"strings"
+	"time"
+
+	operatorv1 "github.com/openshift/api/operator/v1"
+	corev1 "k8s.io/api/core/v1"
+	kerrors "k8s.io/apimachinery/pkg/api/errors"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"github.com/Azure/ARO-RP/pkg/operator"
+	"github.com/Azure/ARO-RP/pkg/operator/controllers/genevalogging"
+	"github.com/Azure/ARO-RP/pkg/util/dns"
+	"github.com/Azure/ARO-RP/pkg/util/pem"
+)
+
+// Metric names emitted by emitCertificateExpirationStatuses, and the
+// namespace/name of the IngressController whose default certificate is read.
+const (
+	certificateExpirationMetricName = "certificate.expirationdate"
+	secretMissingMetricName         = "certificate.secretnotfound"
+	ingressNamespace                = "openshift-ingress-operator"
+	ingressName                     = "default"
+)
+
+// emitCertificateExpirationStatuses emits one certificateExpirationMetricName
+// gauge per certificate found, carrying the certificate subject, its NotAfter
+// date and the whole days left until expiration as dimensions. The Geneva
+// (MDSD) certificate is always checked; the ingress and API server
+// certificates are checked only when the cluster uses a managed domain.
+//
+// A missing secret is not treated as an error: it is reported through a
+// secretMissingMetricName gauge. Any other failure is returned before
+// any expiration gauge is emitted.
+func (mon *Monitor) emitCertificateExpirationStatuses(ctx context.Context) error {
+	var certs []*x509.Certificate
+
+	mdsdCert, err := mon.getCertificate(ctx, operator.Namespace, operator.SecretName, genevalogging.GenevaCertName)
+	switch {
+	case kerrors.IsNotFound(err):
+		mon.emitGauge(secretMissingMetricName, int64(1), secretMissingMetric(operator.Namespace, operator.SecretName))
+	case err != nil:
+		return err
+	default:
+		certs = append(certs, mdsdCert)
+	}
+
+	if dns.IsManagedDomain(mon.oc.Properties.ClusterProfile.Domain) {
+		ic := &operatorv1.IngressController{}
+		err := mon.ocpclientset.Get(ctx, client.ObjectKey{
+			Namespace: ingressNamespace,
+			Name:      ingressName,
+		}, ic)
+		if err != nil {
+			return err
+		}
+
+		// DefaultCertificate is an optional pointer field; dereferencing it
+		// unchecked would panic the monitor when it is unset.
+		if ic.Spec.DefaultCertificate == nil {
+			return fmt.Errorf("ingresscontroller %s/%s has no default certificate configured", ingressNamespace, ingressName)
+		}
+		ingressSecretName := ic.Spec.DefaultCertificate.Name
+
+		// secret with managed certificates is uuid + "-ingress" or "-apiserver"
+		for _, secretName := range []string{ingressSecretName, strings.Replace(ingressSecretName, "-ingress", "-apiserver", 1)} {
+			certificate, err := mon.getCertificate(ctx, operator.Namespace, secretName, corev1.TLSCertKey)
+			switch {
+			case kerrors.IsNotFound(err):
+				mon.emitGauge(secretMissingMetricName, int64(1), secretMissingMetric(operator.Namespace, secretName))
+			case err != nil:
+				return err
+			default:
+				certs = append(certs, certificate)
+			}
+		}
+	}
+
+	for _, cert := range certs {
+		// integer division of durations: whole days, truncated toward zero
+		daysUntilExpiration := time.Until(cert.NotAfter) / (24 * time.Hour)
+		mon.emitGauge(certificateExpirationMetricName, 1, map[string]string{
+			"subject":             cert.Subject.CommonName,
+			"expirationDate":      cert.NotAfter.UTC().Format(time.RFC3339),
+			"daysUntilExpiration": fmt.Sprintf("%d", daysUntilExpiration),
+		})
+	}
+	return nil
+}
+
+// getCertificate fetches the named secret and returns the first certificate
+// parsed from the PEM data stored under secretKey. Errors from the client
+// (including not-found) and from PEM parsing are returned unchanged.
+func (mon *Monitor) getCertificate(ctx context.Context, secretNamespace, secretName, secretKey string) (*x509.Certificate, error) {
+	secret := &corev1.Secret{}
+	err := mon.ocpclientset.Get(ctx, client.ObjectKey{
+		Namespace: secretNamespace,
+		Name:      secretName,
+	}, secret)
+	if err != nil {
+		return nil, err
+	}
+
+	return pem.ParseFirstCertificate(secret.Data[secretKey])
+}
+
+// secretMissingMetric builds the dimensions for a secretMissingMetricName gauge.
+func secretMissingMetric(namespace, name string) map[string]string {
+	return map[string]string{
+		"namespace": namespace,
+		"name":      name,
+	}
+}
diff --git a/pkg/monitor/cluster/certificateexpirationstatuses_test.go b/pkg/monitor/cluster/certificateexpirationstatuses_test.go
new file mode 100644
index 00000000000..850e3df9aae
--- /dev/null
+++ b/pkg/monitor/cluster/certificateexpirationstatuses_test.go
@@ -0,0 +1,272 @@
+package cluster
+
+// Copyright (c) Microsoft Corporation.
+// Licensed under the Apache License 2.0.
+
+import (
+	"context"
+	"crypto/x509"
+	"encoding/pem"
+	"testing"
+	"time"
+
+	"github.com/golang/mock/gomock"
+	operatorv1 "github.com/openshift/api/operator/v1"
+	corev1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	"sigs.k8s.io/controller-runtime/pkg/client/fake"
+
+	"github.com/Azure/ARO-RP/pkg/api"
+	mock_metrics "github.com/Azure/ARO-RP/pkg/util/mocks/metrics"
+	utiltls "github.com/Azure/ARO-RP/pkg/util/tls"
+	"github.com/Azure/ARO-RP/pkg/util/uuid"
+	utilerror "github.com/Azure/ARO-RP/test/util/error"
+)
+
+// certInfo describes a test secret: its name and the subject of the
+// certificate generated into it.
+type certInfo struct {
+	secretName, certSubject string
+}
+
+const (
+	managedDomainName   = "contoso.aroapp.io"
+	unmanagedDomainName = "aro.contoso.com"
+)
+
+func TestEmitCertificateExpirationStatuses(t *testing.T) {
+	expiration := time.Now().Add(time.Hour * 24 * 5)
+	expirationString := expiration.UTC().Format(time.RFC3339)
+	clusterID := uuid.DefaultGenerator.Generate()
+
+	for _, tt := range []struct {
+		name            string
+		domain          string
+		certsPresent    []certInfo
+		wantExpirations []map[string]string
+		wantWarning     []map[string]string
+		wantErr         string
+	}{
+		{
+			name:         "only emits MDSD status for unmanaged domain",
+			domain:       unmanagedDomainName,
+			certsPresent: []certInfo{{"cluster", "geneva.certificate"}},
+			wantExpirations: []map[string]string{
+				{
+					"subject":             "geneva.certificate",
+					"expirationDate":      expirationString,
+					"daysUntilExpiration": "4",
+				},
+			},
+		},
+		{
+			name:   "includes ingress and API status for managed domain",
+			domain: managedDomainName,
+			certsPresent: []certInfo{
+				{"cluster", "geneva.certificate"},
+				{clusterID + "-ingress", managedDomainName},
+				{clusterID + "-apiserver", "api." + managedDomainName},
+			},
+			wantExpirations: []map[string]string{
+				{
+					"subject":             "geneva.certificate",
+					"expirationDate":      expirationString,
+					"daysUntilExpiration": "4",
+				},
+				{
+					"subject":             "contoso.aroapp.io",
+					"expirationDate":      expirationString,
+					"daysUntilExpiration": "4",
+				},
+				{
+					"subject":             "api.contoso.aroapp.io",
+					"expirationDate":      expirationString,
+					"daysUntilExpiration": "4",
+				},
+			},
+		},
+		{
+			name:   "emits warning metric when cluster secret has been deleted",
+			domain: unmanagedDomainName,
+			wantWarning: []map[string]string{
+				{
+					"namespace": "openshift-azure-operator",
+					"name":      "cluster",
+				},
+			},
+		},
+		{
+			name:   "emits warning metric when managed domain secret has been deleted",
+			domain: managedDomainName,
+			certsPresent: []certInfo{
+				{"cluster", "geneva.certificate"},
+				{clusterID + "-ingress", managedDomainName},
+			},
+			wantExpirations: []map[string]string{
+				{
+					"subject":             "geneva.certificate",
+					"expirationDate":      expirationString,
+					"daysUntilExpiration": "4",
+				},
+				{
+					"subject":             "contoso.aroapp.io",
+					"expirationDate":      expirationString,
+					"daysUntilExpiration": "4",
+				},
+			},
+			wantWarning: []map[string]string{
+				{
+					"namespace": "openshift-azure-operator",
+					"name":      clusterID + "-apiserver",
+				},
+			},
+		},
+	} {
+		t.Run(tt.name, func(t *testing.T) {
+			ctx := context.Background()
+
+			var secrets []client.Object
+			secretsFromCertInfo, err := generateTestSecrets(tt.certsPresent, tweakTemplateFn(expiration))
+			if err != nil {
+				t.Fatal(err)
+			}
+			secrets = append(secrets, secretsFromCertInfo...)
+
+			m := mock_metrics.NewMockEmitter(gomock.NewController(t))
+			for _, w := range tt.wantWarning {
+				m.EXPECT().EmitGauge(secretMissingMetricName, int64(1), w)
+			}
+			for _, g := range tt.wantExpirations {
+				m.EXPECT().EmitGauge(certificateExpirationMetricName, int64(1), g)
+			}
+
+			mon := buildMonitor(m, tt.domain, clusterID, secrets...)
+
+			err = mon.emitCertificateExpirationStatuses(ctx)
+
+			utilerror.AssertErrorMessage(t, err, tt.wantErr)
+		})
+	}
+
+	t.Run("returns error when secret is present but certificate data has been deleted", func(t *testing.T) {
+		var secrets []client.Object
+		data := map[string][]byte{}
+		s := buildSecret("cluster", data)
+		secrets = append(secrets, s)
+
+		ctx := context.Background()
+		m := mock_metrics.NewMockEmitter(gomock.NewController(t))
+		mon := buildMonitor(m, managedDomainName, clusterID, secrets...)
+
+		wantErr := "unable to find certificate"
+		err := mon.emitCertificateExpirationStatuses(ctx)
+		utilerror.AssertErrorMessage(t, err, wantErr)
+	})
+
+	t.Run("returns error when ingress controller has no default certificate", func(t *testing.T) {
+		secrets, err := generateTestSecrets([]certInfo{{"cluster", "geneva.certificate"}}, tweakTemplateFn(expiration))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		ctx := context.Background()
+		m := mock_metrics.NewMockEmitter(gomock.NewController(t))
+		mon := buildMonitor(m, managedDomainName, clusterID, secrets...)
+
+		// clear the default certificate reference on the fake IngressController
+		ic := &operatorv1.IngressController{}
+		err = mon.ocpclientset.Get(ctx, client.ObjectKey{Namespace: ingressNamespace, Name: ingressName}, ic)
+		if err != nil {
+			t.Fatal(err)
+		}
+		ic.Spec.DefaultCertificate = nil
+		err = mon.ocpclientset.Update(ctx, ic)
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		wantErr := "ingresscontroller openshift-ingress-operator/default has no default certificate configured"
+		err = mon.emitCertificateExpirationStatuses(ctx)
+		utilerror.AssertErrorMessage(t, err, wantErr)
+	})
+}
+
+// tweakTemplateFn returns a certificate template hook that sets NotAfter to expiration.
+func tweakTemplateFn(expiration time.Time) func(*x509.Certificate) {
+	return func(template *x509.Certificate) {
+		template.NotAfter = expiration
+	}
+}
+
+// generateTestSecrets creates one secret per certInfo, each holding a freshly
+// generated PEM-encoded certificate. The "cluster" secret stores it under
+// "gcscert.pem"; every other secret uses "tls.crt".
+func generateTestSecrets(certsInfo []certInfo, tweakTemplateFn func(*x509.Certificate)) ([]client.Object, error) {
+	var secrets []client.Object
+	for _, sec := range certsInfo {
+		_, cert, err := utiltls.GenerateTestKeyAndCertificate(sec.certSubject, nil, nil, false, false, tweakTemplateFn)
+		if err != nil {
+			return nil, err
+		}
+		certKey := "tls.crt"
+		if sec.secretName == "cluster" {
+			certKey = "gcscert.pem"
+		}
+		data := map[string][]byte{
+			certKey: pem.EncodeToMemory(&pem.Block{
+				Type:  "CERTIFICATE",
+				Bytes: cert[0].Raw,
+			}),
+		}
+		s := buildSecret(sec.secretName, data)
+		secrets = append(secrets, s)
+	}
+	return secrets, nil
+}
+
+// buildSecret returns a secret with the given name and data in the
+// "openshift-azure-operator" namespace.
+func buildSecret(secretName string, data map[string][]byte) *corev1.Secret {
+	return &corev1.Secret{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      secretName,
+			Namespace: "openshift-azure-operator",
+		},
+		Data: data,
+	}
+}
+
+// buildMonitor returns a Monitor backed by a fake client that holds the
+// given secrets and a default IngressController whose default certificate is
+// named id + "-ingress".
+func buildMonitor(m *mock_metrics.MockEmitter, domain, id string, secrets ...client.Object) *Monitor {
+	ingressController := &operatorv1.IngressController{
+		ObjectMeta: metav1.ObjectMeta{
+			Name:      "default",
+			Namespace: "openshift-ingress-operator",
+		},
+		Spec: operatorv1.IngressControllerSpec{
+			DefaultCertificate: &corev1.LocalObjectReference{
+				Name: id + "-ingress",
+			},
+		},
+	}
+
+	ocpclientset := fake.
+		NewClientBuilder().
+		WithObjects(ingressController).
+		WithObjects(secrets...).
+		Build()
+	return &Monitor{
+		ocpclientset: ocpclientset,
+		m:            m,
+		oc: &api.OpenShiftCluster{
+			Properties: api.OpenShiftClusterProperties{
+				ClusterProfile: api.ClusterProfile{
+					Domain: domain,
+				},
+			},
+		},
+	}
+}
diff --git a/pkg/monitor/cluster/cluster.go b/pkg/monitor/cluster/cluster.go
index 209a550ead9..89a320ebd9f 100644
--- a/pkg/monitor/cluster/cluster.go
+++ b/pkg/monitor/cluster/cluster.go
@@ -41,6 +41,7 @@ type Monitor struct {
 	m          metrics.Emitter
 	arocli     aroclient.Interface
 
+	ocpclientset  client.Client
 	hiveclientset client.Client
 
 	// access below only via the helper functions in cache.go
@@ -91,6 +92,11 @@ func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftClu
 		return nil, err
 	}
 
+	ocpclientset, err := client.New(restConfig, client.Options{})
+	if err != nil {
+		return nil, err
+	}
+
 	hiveclientset, err := getHiveClientSet(hiveRestConfig)
 	if err != nil {
 		log.Error(err)
@@ -110,6 +116,7 @@ func NewMonitor(log *logrus.Entry, restConfig *rest.Config, oc *api.OpenShiftClu
 		mcocli:        mcocli,
 		arocli:        arocli,
 		m:             m,
+		ocpclientset:  ocpclientset,
 		hiveclientset: hiveclientset,
 	}, nil
 }
@@ -175,6 +182,7 @@ func (mon *Monitor) Monitor(ctx context.Context) (errs []error) {
 		mon.emitHiveRegistrationStatus,
 		mon.emitOperatorFlagsAndSupportBanner,
 		mon.emitPucmState,
+		mon.emitCertificateExpirationStatuses,
 		mon.emitPrometheusAlerts, // at the end for now because it's the slowest/least reliable
 	} {
 		err = f(ctx)