-
Notifications
You must be signed in to change notification settings - Fork 818
feat(reexecute/c): decouple metrics server and collector #4415
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 4 commits
0178776
4ad3f2b
fc83c89
848c6ad
be761ac
ca0b993
a7cb056
d36d4ca
c7f3185
9e319d0
2cc0a79
5160738
1832b58
915ae5d
d846616
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -42,7 +42,18 @@ export AWS_REGION=us-east-2 | |
|
|
||
| ### Metrics Collection | ||
|
|
||
| If running with metrics collection, enabled in CI and configured locally with `METRICS_ENABLED=true`, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables. | ||
| If running locally, there are three options for metrics collection: | ||
|
|
||
| - `METRICS_MODE=disabled`: no metrics are available. | ||
| - `METRICS_MODE=server-only`: starts a Prometheus server exporting VM metrics. A | ||
| link to the metrics endpoint is logged during execution. | ||
| - `METRICS_MODE=full`: starts both a Prometheus server exporting VM metrics and | ||
|
||
| a Prometheus collector. A link to the corresponding Grafana dashboard is | ||
| logged during execution. | ||
|
|
||
| When using the `full` option, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables. | ||
|
|
||
| Running the re-execution test in CI will always set `METRICS_MODE=full`. | ||
|
|
||
| ## Quick Start | ||
|
|
||
|
|
@@ -230,7 +241,7 @@ The `CONFIG` parameter currently only supports pre-defined configs and not passi | |
|
|
||
| The C-Chain benchmarks export VM metrics to the same Grafana instance as AvalancheGo CI: https://grafana-poc.avax-dev.network/. | ||
|
|
||
| To export metrics for a local run, simply set the Taskfile variable `METRICS_ENABLED=true` either via environment variable or passing it at the command line. | ||
| To export metrics for a local run, simply set the Taskfile variable `METRICS_MODE=full`, either via an environment variable or by passing it on the command line. | ||
|
|
||
| You can view granular C-Chain processing metrics with the label attached to this job (job="c-chain-reexecution") [here](https://grafana-poc.avax-dev.network/d/Gl1I20mnk/c-chain?orgId=1&from=now-5m&to=now&timezone=browser&var-datasource=P1809F7CD0C75ACF3&var-filter=job%7C%3D%7Cc-chain-reexecution&var-chain=C&refresh=10s). | ||
|
|
||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -48,6 +48,12 @@ import ( | |
| "github.com/ava-labs/avalanchego/vms/platformvm/warp" | ||
| ) | ||
|
|
||
| const ( | ||
| MetricsDisabled metricsMode = iota | ||
| MetricsServerOnly | ||
| MetricsFull | ||
| ) | ||
|
|
||
| var ( | ||
| mainnetXChainID = ids.FromStringOrPanic("2oYMBNV4eNHyqk2fjjV5nVQLDbtmNJzq5s3qs3Lo6ftnC6FByM") | ||
| mainnetCChainID = ids.FromStringOrPanic("2q9e4r6Mu3U68nU1fYjgbR6JvwrRx36CohpAX5UQxse55x1Q5") | ||
|
|
@@ -62,10 +68,11 @@ var ( | |
| startBlockArg uint64 | ||
| endBlockArg uint64 | ||
| chanSizeArg int | ||
| metricsEnabledArg bool | ||
| executionTimeout time.Duration | ||
| labelsArg string | ||
|
|
||
| metricsModeArg = MetricsDisabled | ||
|
|
||
| networkUUID string = uuid.NewString() | ||
| labels = map[string]string{ | ||
| "job": "c-chain-reexecution", | ||
|
|
@@ -94,6 +101,41 @@ var ( | |
| configBytesArg []byte | ||
| ) | ||
|
|
||
| type metricsMode int | ||
|
|
||
| func (m *metricsMode) Set(s string) error { | ||
| s = strings.ToLower(strings.TrimSpace(s)) | ||
|
|
||
| switch s { | ||
| case "disabled": | ||
| *m = MetricsDisabled | ||
| case "server-only": | ||
| *m = MetricsServerOnly | ||
| case "full": | ||
| *m = MetricsFull | ||
| default: | ||
| return fmt.Errorf("invalid metrics mode: %s (valid options: disabled, server-only, full)", s) | ||
| } | ||
| return nil | ||
| } | ||
|
||
|
|
||
| func (m metricsMode) String() string { | ||
| switch m { | ||
| case MetricsDisabled: | ||
| return "disabled" | ||
| case MetricsServerOnly: | ||
| return "server-only" | ||
| case MetricsFull: | ||
| return "full" | ||
| default: | ||
| return "unknown" | ||
| } | ||
| } | ||
|
|
||
| func (m metricsMode) shouldStartServer() bool { return m >= MetricsServerOnly } | ||
|
|
||
| func (m metricsMode) shouldStartCollector() bool { return m == MetricsFull } | ||
|
|
||
| func TestMain(m *testing.M) { | ||
| evm.RegisterAllLibEVMExtras() | ||
|
|
||
|
|
@@ -104,7 +146,7 @@ func TestMain(m *testing.M) { | |
| flag.IntVar(&chanSizeArg, "chan-size", 100, "Size of the channel to use for block processing.") | ||
| flag.DurationVar(&executionTimeout, "execution-timeout", 0, "Benchmark execution timeout. After this timeout has elapsed, terminate the benchmark without error. If 0, no timeout is applied.") | ||
|
|
||
| flag.BoolVar(&metricsEnabledArg, "metrics-enabled", false, "Enable metrics collection.") | ||
| flag.Var(&metricsModeArg, "metrics-mode", "Metrics mode: disabled (no metrics), server-only (creates Prometheus server), or full (creates Prometheus server and starts Prometheus collector)") | ||
| flag.StringVar(&labelsArg, "labels", "", "Comma separated KV list of metric labels to attach to all exported metrics. Ex. \"owner=tim,runner=snoopy\"") | ||
|
|
||
| predefinedConfigKeys := slices.Collect(maps.Keys(predefinedConfigs)) | ||
|
|
@@ -151,7 +193,7 @@ func BenchmarkReexecuteRange(b *testing.B) { | |
| startBlockArg, | ||
| endBlockArg, | ||
| chanSizeArg, | ||
| metricsEnabledArg, | ||
| metricsModeArg, | ||
| ) | ||
| }) | ||
| } | ||
|
|
@@ -164,7 +206,7 @@ func benchmarkReexecuteRange( | |
| startBlock uint64, | ||
| endBlock uint64, | ||
| chanSize int, | ||
| metricsEnabled bool, | ||
| metricsMode metricsMode, | ||
| ) { | ||
| r := require.New(b) | ||
| ctx := context.Background() | ||
|
|
@@ -185,9 +227,8 @@ func benchmarkReexecuteRange( | |
| r.NoError(prefixGatherer.Register("avalanche_snowman", consensusRegistry)) | ||
|
|
||
| log := tests.NewDefaultLogger("c-chain-reexecution") | ||
|
|
||
| if metricsEnabled { | ||
| collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels) | ||
| if metricsMode.shouldStartServer() { | ||
| collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels, metricsMode.shouldStartCollector()) | ||
|
||
| } | ||
|
|
||
| var ( | ||
|
|
@@ -554,20 +595,40 @@ func newConsensusMetrics(registry prometheus.Registerer) (*consensusMetrics, err | |
| return m, nil | ||
| } | ||
|
|
||
| // collectRegistry starts prometheus and collects metrics from the provided gatherer. | ||
| // Attaches the provided labels + GitHub labels if available to the collected metrics. | ||
| func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer prometheus.Gatherer, labels map[string]string) { | ||
| // collectRegistry starts a Prometheus server for the provided gatherer. If | ||
| // startCollector is true, it also starts a Prometheus collector configured to | ||
| // scrape the Prometheus server and attaches the provided labels + GitHub | ||
| // labels if available to the collected metrics. | ||
| func collectRegistry( | ||
| tb testing.TB, | ||
| log logging.Logger, | ||
| name string, | ||
| gatherer prometheus.Gatherer, | ||
| labels map[string]string, | ||
| startCollector bool, | ||
| ) { | ||
| r := require.New(tb) | ||
|
|
||
| server, err := tests.NewPrometheusServer(gatherer) | ||
aaronbuchwald marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| r.NoError(err) | ||
|
|
||
| if !startCollector { | ||
| log.Info("metrics endpoint available", | ||
| zap.String("url", fmt.Sprintf("http://%s/ext/metrics", server.Address())), | ||
| ) | ||
|
|
||
| tb.Cleanup(func() { | ||
| r.NoError(server.Stop()) | ||
| }) | ||
| return | ||
| } | ||
|
|
||
| startPromCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout) | ||
| defer cancel() | ||
|
|
||
| logger := tests.NewDefaultLogger("prometheus") | ||
| r.NoError(tmpnet.StartPrometheus(startPromCtx, logger)) | ||
|
|
||
| server, err := tests.NewPrometheusServer(gatherer) | ||
| r.NoError(err) | ||
|
|
||
| var sdConfigFilePath string | ||
| tb.Cleanup(func() { | ||
| // Ensure a final metrics scrape. | ||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.