Skip to content

Commit

Permalink
Plug NSG monitoring to mon.WorkOne
Browse files Browse the repository at this point in the history
  • Loading branch information
nwnt committed Sep 13, 2023
1 parent 81332dc commit 6a08642
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 5 deletions.
4 changes: 2 additions & 2 deletions cmd/aro/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ import (
)

func monitor(ctx context.Context, log *logrus.Entry) error {
_env, err := env.NewCore(ctx, log)
_env, err := env.NewEnv(ctx, log)
if err != nil {
return err
}
Expand Down Expand Up @@ -126,7 +126,7 @@ func monitor(ctx context.Context, log *logrus.Entry) error {
return err
}

mon := pkgmonitor.NewMonitor(log.WithField("component", "monitor"), dialer, dbMonitors, dbOpenShiftClusters, dbSubscriptions, m, clusterm, liveConfig)
mon := pkgmonitor.NewMonitor(log.WithField("component", "monitor"), dialer, dbMonitors, dbOpenShiftClusters, dbSubscriptions, m, clusterm, liveConfig, _env)

return mon.Run(ctx)
}
5 changes: 4 additions & 1 deletion pkg/monitor/monitor.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/database"
"github.com/Azure/ARO-RP/pkg/database/cosmosdb"
"github.com/Azure/ARO-RP/pkg/env"
"github.com/Azure/ARO-RP/pkg/metrics"
"github.com/Azure/ARO-RP/pkg/proxy"
"github.com/Azure/ARO-RP/pkg/util/bucket"
Expand All @@ -36,6 +37,7 @@ type monitor struct {
mu sync.RWMutex
docs map[string]*cacheDoc
subs map[string]*api.SubscriptionDocument
env env.Interface

isMaster bool
bucketCount int
Expand All @@ -54,7 +56,7 @@ type Runnable interface {
Run(context.Context) error
}

func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbMonitors database.Monitors, dbOpenShiftClusters database.OpenShiftClusters, dbSubscriptions database.Subscriptions, m, clusterm metrics.Emitter, liveConfig liveconfig.Manager) Runnable {
func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbMonitors database.Monitors, dbOpenShiftClusters database.OpenShiftClusters, dbSubscriptions database.Subscriptions, m, clusterm metrics.Emitter, liveConfig liveconfig.Manager, e env.Interface) Runnable {
return &monitor{
baseLog: log,
dialer: dialer,
Expand All @@ -67,6 +69,7 @@ func NewMonitor(log *logrus.Entry, dialer proxy.Dialer, dbMonitors database.Moni
clusterm: clusterm,
docs: map[string]*cacheDoc{},
subs: map[string]*api.SubscriptionDocument{},
env: e,

bucketCount: bucket.Buckets,
buckets: map[int]struct{}{},
Expand Down
41 changes: 39 additions & 2 deletions pkg/monitor/worker.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"k8s.io/client-go/rest"

"github.com/Azure/ARO-RP/pkg/api"
"github.com/Azure/ARO-RP/pkg/monitor/azure/nsg"
"github.com/Azure/ARO-RP/pkg/monitor/cluster"
utillog "github.com/Azure/ARO-RP/pkg/util/log"
"github.com/Azure/ARO-RP/pkg/util/recover"
Expand Down Expand Up @@ -212,7 +213,7 @@ out:
// cached metrics in the remaining minutes

if sub != nil && sub.Subscription != nil && sub.Subscription.State != api.SubscriptionStateSuspended && sub.Subscription.State != api.SubscriptionStateWarned {
mon.workOne(context.Background(), log, v.doc, newh != h)
mon.workOne(context.Background(), log, v.doc, sub, newh != h)
}

select {
Expand All @@ -228,7 +229,7 @@ out:
}

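// workOne checks the API server health of a cluster and, on hourly runs for
// clusters with a preconfigured NSG, also runs NSG monitoring alongside it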
func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, hourlyRun bool) {
func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.OpenShiftClusterDocument, sub *api.SubscriptionDocument, hourlyRun bool) {
ctx, cancel := context.WithTimeout(ctx, 50*time.Second)
defer cancel()

Expand All @@ -246,11 +247,47 @@ func (mon *monitor) workOne(ctx context.Context, log *logrus.Entry, doc *api.Ope
log.Warnf("no hiveShardConfigs set for shard %d", shard)
}

var nsgMon *nsg.NSGMonitor
if doc.OpenShiftCluster.Properties.NetworkProfile.PreconfiguredNSG == api.PreconfiguredNSGEnabled && hourlyRun {
fpAuthorizer, err := mon.env.FPAuthorizer(sub.Subscription.Properties.TenantID, mon.env.Environment().ResourceManagerEndpoint)
if err != nil {
// Don't stop here: being unable to monitor the NSG should not prevent cluster monitoring
log.Error("Unable to create FP Authorizer for NSG monitoring.", err)
mon.m.EmitGauge(nsg.MetricUnsuccessfulFPCreation, int64(1), map[string]string{
nsg.DimClusterResourceID: doc.ID,
nsg.DimLocation: doc.OpenShiftCluster.Location,
nsg.DimSubscriptionID: sub.ID,
nsg.DimTenantID: sub.Subscription.Properties.TenantID,
})
} else {
nsgMon = nsg.NewNSGMonitor(log, doc.OpenShiftCluster, sub.ID, mon.env.Environment(), fpAuthorizer, mon.m)
go nsgMon.Monitor(ctx)
}
}

c, err := cluster.NewMonitor(log, restConfig, doc.OpenShiftCluster, mon.clusterm, hiveRestConfig, hourlyRun)
if err != nil {
log.Error(err)
return
}

c.Monitor(ctx)

// if doing NSG monitoring, wait until it finishes or the context times out
if nsgMon != nil {
select {
case err := <-nsgMon.Done():
if err != nil {
log.Error("Error occurred during NSG monitoring", err)
}
case <-ctx.Done():
log.Info("NSG Monitoring timed out")
mon.m.EmitGauge(nsg.MetricNSGMonitoringTimedOut, int64(1), map[string]string{
nsg.DimClusterResourceID: doc.ID,
nsg.DimLocation: doc.OpenShiftCluster.Location,
nsg.DimSubscriptionID: sub.ID,
nsg.DimTenantID: sub.Subscription.Properties.TenantID,
})
}
}
}

0 comments on commit 6a08642

Please sign in to comment.