From 6a08642d7d02896fa11bbd518140fb13926748ef Mon Sep 17 00:00:00 2001 From: nont Date: Tue, 12 Sep 2023 22:05:39 +0700 Subject: [PATCH] Plug NSG monitoring to mon.WorkOne --- cmd/aro/monitor.go | 4 ++-- pkg/monitor/monitor.go | 5 ++++- pkg/monitor/worker.go | 41 +++++++++++++++++++++++++++++++++++++++-- 3 files changed, 45 insertions(+), 5 deletions(-) diff --git a/cmd/aro/monitor.go b/cmd/aro/monitor.go index e178a848627..fe0e32e3b04 100644 --- a/cmd/aro/monitor.go +++ b/cmd/aro/monitor.go @@ -25,7 +25,7 @@ import ( ) func monitor(ctx context.Context, log *logrus.Entry) error { - _env, err := env.NewCore(ctx, log) + _env, err := env.NewEnv(ctx, log) if err != nil { return err } @@ -126,7 +126,7 @@ func monitor(ctx context.Context, log *logrus.Entry) error { return err } - mon := pkgmonitor.NewMonitor(log.WithField("component", "monitor"), dialer, dbMonitors, dbOpenShiftClusters, dbSubscriptions, m, clusterm, liveConfig) + mon := pkgmonitor.NewMonitor(log.WithField("component", "monitor"), dialer, dbMonitors, dbOpenShiftClusters, dbSubscriptions, m, clusterm, liveConfig, _env) return mon.Run(ctx) } diff --git a/pkg/monitor/monitor.go b/pkg/monitor/monitor.go index 89716d758a7..f03005171e6 100644 --- a/pkg/monitor/monitor.go +++ b/pkg/monitor/monitor.go @@ -16,6 +16,7 @@ import ( "github.com/Azure/ARO-RP/pkg/api" "github.com/Azure/ARO-RP/pkg/database" "github.com/Azure/ARO-RP/pkg/database/cosmosdb" + "github.com/Azure/ARO-RP/pkg/env" "github.com/Azure/ARO-RP/pkg/metrics" "github.com/Azure/ARO-RP/pkg/proxy" "github.com/Azure/ARO-RP/pkg/util/bucket" @@ -36,6 +37,7 @@ type monitor struct { mu sync.RWMutex docs map[string]*cacheDoc subs map[string]*api.SubscriptionDocument + env env.Interface isMaster bool bucketCount int @@ -54,7 +56,7 @@ type Runnable interface { Run(context.Context) error } -func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbMonitors database.Monitors, dbOpenShiftClusters database.OpenShiftClusters, dbSubscriptions database.Subscriptions, m, clusterm metrics.Emitter, liveConfig liveconfig.Manager) Runnable { +func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbMonitors database.Monitors, dbOpenShiftClusters database.OpenShiftClusters, dbSubscriptions database.Subscriptions, m, clusterm metrics.Emitter, liveConfig liveconfig.Manager, e env.Interface) Runnable { return &monitor{ baseLog: log, dialer: dialer, @@ -67,6 +69,7 @@ func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbMonitors database.Moni clusterm: clusterm, docs: map[string]*cacheDoc{}, subs: map[string]*api.SubscriptionDocument{}, + env: e, bucketCount: bucket.Buckets, buckets: map[int]struct{}{}, diff --git a/pkg/monitor/worker.go b/pkg/monitor/worker.go index feb632edf73..3592ceba9c8 100644 --- a/pkg/monitor/worker.go +++ b/pkg/monitor/worker.go @@ -14,6 +14,7 @@ import ( "k8s.io/client-go/rest" "github.com/Azure/ARO-RP/pkg/api" + "github.com/Azure/ARO-RP/pkg/monitor/azure/nsg" "github.com/Azure/ARO-RP/pkg/monitor/cluster" utillog "github.com/Azure/ARO-RP/pkg/util/log" "github.com/Azure/ARO-RP/pkg/util/recover" @@ -212,7 +213,7 @@ out: // cached metrics in the remaining minutes if sub != nil && sub.Subscription != nil && sub.Subscription.State != api.SubscriptionStateSuspended && sub.Subscription.State != api.SubscriptionStateWarned { - mon.workOne(context.Background(), log, v.doc, newh != h) + mon.workOne(context.Background(), log, v.doc, sub, newh != h) } select { @@ -228,7 +229,7 @@ out: } // workOne checks the API server health of a cluster -func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, hourlyRun bool) { +func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, sub *api.SubscriptionDocument, hourlyRun bool) { ctx, cancel := context.WithTimeout(ctx, 50*time.Second) defer cancel() @@ -246,6 +247,24 @@ func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.Ope log.Warnf("no hiveShardConfigs set for shard %d", shard) } + var nsgMon *nsg.NSGMonitor + if doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled && hourlyRun { + fpAuthorizer, err := mon.env.FPAuthorizer(sub.Subscription.Properties.TenantID, mon.env.Environment().ResourceManagerEndpoint) + if err != nil { + // Not stopping here just because can't monitor NSG + log.Error("Unable to create FP Authorizer for NSG monitoring.", err) + mon.m.EmitGauge(nsg.MetricUnsuccessfulFPCreation, int64(1), map[string]string{ + nsg.DimClusterResourceID: doc.ID, + nsg.DimLocation: doc.OpenShiftCluster.Location, + nsg.DimSubscriptionID: sub.ID, + nsg.DimTenantID: sub.Subscription.Properties.TenantID, + }) + } else { + nsgMon = nsg.NewNSGMonitor(log, doc.OpenShiftCluster, sub.ID, mon.env.Environment(), fpAuthorizer, mon.m) + go nsgMon.Monitor(ctx) + } + } + c, err := cluster.NewMonitor(log, restConfig, doc.OpenShiftCluster, mon.clusterm, hiveRestConfig, hourlyRun) if err != nil { log.Error(err) @@ -253,4 +272,22 @@ func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.Ope } c.Monitor(ctx) + + // if doing nsg monitoring, wait until timed out + if nsgMon != nil { + select { + case err := <-nsgMon.Done(): + if err != nil { + log.Error("Error occurred during NSG monitoring", err) + } + case <-ctx.Done(): + log.Info("NSG Monitoring timed out") + mon.m.EmitGauge(nsg.MetricNSGMonitoringTimedOut, int64(1), map[string]string{ + nsg.DimClusterResourceID: doc.ID, + nsg.DimLocation: doc.OpenShiftCluster.Location, + nsg.DimSubscriptionID: sub.ID, + nsg.DimTenantID: sub.Subscription.Properties.TenantID, + }) + } + } }