Skip to content

Commit 5900edd

Browse files
milan-zededa and eriknordmark
authored and committed
Add flowlog metrics
These metrics make it possible to see how many flow records were published or dropped, and to check for issues such as flowlog queue congestion. Signed-off-by: Milan Lenco <[email protected]>
1 parent 731200d commit 5900edd

File tree

11 files changed

+4758
-4142
lines changed

11 files changed

+4758
-4142
lines changed

api/go/metrics/metrics.pb.go

Lines changed: 822 additions & 619 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

api/images/devconfig.dot

Lines changed: 174 additions & 174 deletions
Large diffs are not rendered by default.

api/images/devconfig.dot.svg

Lines changed: 2664 additions & 2664 deletions
Loading

api/images/devconfig.png

-38.9 KB
Loading

api/proto/metrics/metrics.proto

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,7 @@ message deviceMetric {
207207
google.protobuf.Timestamp last_processed_config = 19;
208208

209209
repeated CellularMetric cellular = 20;
210+
FlowlogMetric flowlog = 21;
210211
}
211212

212213
message AclMetric {
@@ -560,3 +561,31 @@ message vlanInfo {
560561
uint32 num_trunk_ports = 1; // Number of ports attached to this network instance that are designated trunk
561562
map <uint32, uint32> vlan_counts = 2; // map from VLAN ID to its usage count
562563
}
564+
565+
// Flowlog stats: counters kept at three granularities (whole FlowMessage
// instances, individual flows, and individual DNS requests).
566+
message FlowlogMetric {
567+
// Counters for whole FlowMessage instances.
568+
// Note that a FlowMessage packages and carries a list of flows and DNS requests.
569+
FlowlogCounters messages = 1;
570+
// Counters for the entries of FlowMessage.Flows.
571+
FlowlogCounters flows = 2;
572+
// Counters for the entries of FlowMessage.DnsReqs.
573+
FlowlogCounters dns_requests = 3;
574+
}
575+
576+
// Counters for published/dropped flowlog messages, flows or DNS requests.
577+
// Every record is eventually either successfully published or dropped.
578+
// While publishing a flowlog record, one or more attempts may fail before that outcome.
579+
// Therefore the total number of fully processed records (i.e. no longer queued)
580+
// equals the sum of "success" and "drops", while "failed_attempts" is an orthogonal metric.
581+
message FlowlogCounters {
582+
// Flow records successfully published to zedcloud.
583+
uint64 success = 1;
584+
// Flow records dropped because the flowlog queue had no room for them.
585+
// Either the queue was already full when the record was created,
586+
// or publish attempts kept failing while the queue was nearing its capacity.
587+
uint64 drops = 2;
588+
// The number of failed attempts to publish a flow record.
589+
uint64 failed_attempts = 3;
590+
}
591+

api/python/metrics/metrics_pb2.py

Lines changed: 185 additions & 66 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/pillar/cmd/zedagent/handlemetrics.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -616,6 +616,15 @@ func publishMetrics(ctx *zedagentContext, iteration int) {
616616
}
617617
}
618618

619+
// Report flowlog metrics.
620+
ctx.flowLogMetrics.Lock()
621+
ReportDeviceMetric.Flowlog = &metrics.FlowlogMetric{
622+
Messages: protoEncodeFlowlogCounters(ctx.flowLogMetrics.Messages),
623+
Flows: protoEncodeFlowlogCounters(ctx.flowLogMetrics.Flows),
624+
DnsRequests: protoEncodeFlowlogCounters(ctx.flowLogMetrics.DNSReqs),
625+
}
626+
ctx.flowLogMetrics.Unlock()
627+
619628
ReportMetrics.MetricContent = new(metrics.ZMetricMsg_Dm)
620629
if x, ok := ReportMetrics.GetMetricContent().(*metrics.ZMetricMsg_Dm); ok {
621630
x.Dm = ReportDeviceMetric
@@ -1561,3 +1570,11 @@ func protoEncodeNetworkInstanceMetricProto(status types.NetworkInstanceMetrics)
15611570

15621571
return metric
15631572
}
1573+
1574+
// protoEncodeFlowlogCounters converts the internal types.FlowlogCounters
// into a newly allocated metrics.FlowlogCounters protobuf message, copying
// the Success, Drops and FailedAttempts counters one-to-one.
// The counters are passed by value, so the result is a snapshot of whatever
// the caller handed in.
func protoEncodeFlowlogCounters(counters types.FlowlogCounters) *metrics.FlowlogCounters {
1575+
return &metrics.FlowlogCounters{
1576+
Success: counters.Success,
1577+
Drops: counters.Drops,
1578+
FailedAttempts: counters.FailedAttempts,
1579+
}
1580+
}

pkg/pillar/cmd/zedagent/handlenetworkinstance.go

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -444,6 +444,11 @@ func handleAppFlowMonitorImpl(ctxArg interface{}, key string,
444444
case ctx.FlowlogQueue <- pflows:
445445
default:
446446
log.Errorf("Flowlog queue is full, dropping flowlog entry: %+v", pflows.Scope)
447+
ctx.flowLogMetrics.Lock()
448+
ctx.flowLogMetrics.Messages.Drops++
449+
ctx.flowLogMetrics.Flows.Drops += uint64(len(pflows.Flows))
450+
ctx.flowLogMetrics.DNSReqs.Drops += uint64(len(pflows.DnsReqs))
451+
ctx.flowLogMetrics.Unlock()
447452
}
448453
}
449454

@@ -545,11 +550,26 @@ func flowlogTask(ctx *zedagentContext, flowlogQueue <-chan *flowlog.FlowMessage)
545550
err := publishFlowMessage(msg, iteration)
546551
if err == nil {
547552
iteration++
553+
ctx.flowLogMetrics.Lock()
554+
ctx.flowLogMetrics.Messages.Success++
555+
ctx.flowLogMetrics.Flows.Success += uint64(len(msg.Flows))
556+
ctx.flowLogMetrics.DNSReqs.Success += uint64(len(msg.DnsReqs))
557+
ctx.flowLogMetrics.Unlock()
548558
} else {
549559
log.Error(err)
560+
ctx.flowLogMetrics.Lock()
561+
ctx.flowLogMetrics.Messages.FailedAttempts++
562+
ctx.flowLogMetrics.Flows.FailedAttempts += uint64(len(msg.Flows))
563+
ctx.flowLogMetrics.DNSReqs.FailedAttempts += uint64(len(msg.DnsReqs))
564+
ctx.flowLogMetrics.Unlock()
550565
if (100*len(flowlogQueue))/cap(flowlogQueue) > 90 {
551566
// More than 90% of the queue is used, start dropping instead of retrying.
552567
log.Warnf("flowlogTask: dropped flow message: %+v", msg.Scope)
568+
ctx.flowLogMetrics.Lock()
569+
ctx.flowLogMetrics.Messages.Drops++
570+
ctx.flowLogMetrics.Flows.Drops += uint64(len(msg.Flows))
571+
ctx.flowLogMetrics.DNSReqs.Drops += uint64(len(msg.DnsReqs))
572+
ctx.flowLogMetrics.Unlock()
553573
} else {
554574
retry = true
555575
}

pkg/pillar/cmd/zedagent/zedagent.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,7 @@ type zedagentContext struct {
148148
globalConfig types.ConfigItemValueMap
149149
specMap types.ConfigItemSpecMap
150150
globalStatus types.GlobalStatus
151+
flowLogMetrics types.FlowlogMetrics
151152
appContainerStatsTime time.Time // last time the App Container stats uploaded
152153
// The MaintenanceMode can come from GlobalConfig and from the config
153154
// API. Those are merged into maintenanceMode

pkg/pillar/types/flowlogmetrics.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
/*
2+
* Copyright (c) 2021. Zededa, Inc.
3+
* SPDX-License-Identifier: Apache-2.0
4+
*/
5+
6+
package types
7+
8+
import "sync"
9+
10+
// FlowlogCounters encapsulates counters for published/dropped flowlog
11+
// messages/flows or DNS requests.
12+
type FlowlogCounters struct {
13+
// Success counts records that were published without error.
Success uint64
14+
// Drops counts records discarded instead of published, either because
// the flowlog queue was full on enqueue or because a publish attempt
// failed while the queue was more than 90% used.
Drops uint64
15+
// FailedAttempts counts publish attempts that returned an error; it is
// incremented per attempt, independently of Success and Drops.
FailedAttempts uint64
16+
}
17+
18+
// FlowlogMetrics contains flowlog metrics as collected by flowlogTask of zedagent.
19+
type FlowlogMetrics struct {
20+
// Embedded mutex; zedagent holds it (Lock/Unlock) around every update
// of the counters below and while encoding them for the metrics report.
sync.Mutex
21+
// Messages counts whole flowlog messages (one per FlowMessage).
Messages FlowlogCounters
22+
// Flows counts individual flow entries carried inside the messages.
Flows FlowlogCounters
23+
// DNSReqs counts individual DNS request entries carried inside the messages.
DNSReqs FlowlogCounters
24+
}

0 commit comments

Comments
 (0)