diff --git a/.github/actions/c-chain-reexecution-benchmark/action.yml b/.github/actions/c-chain-reexecution-benchmark/action.yml
index d0f119a02c80..3385ff50a0c1 100644
--- a/.github/actions/c-chain-reexecution-benchmark/action.yml
+++ b/.github/actions/c-chain-reexecution-benchmark/action.yml
@@ -95,7 +95,8 @@ runs:
           LABELS=${{ env.LABELS }} \
           BENCHMARK_OUTPUT_FILE=${{ env.BENCHMARK_OUTPUT_FILE }} \
           RUNNER_NAME=${{ inputs.runner_name }} \
-          METRICS_ENABLED=true
+          METRICS_SERVER_ENABLED=true \
+          METRICS_COLLECTOR_ENABLED=true
         prometheus_url: ${{ inputs.prometheus-url }}
         prometheus_push_url: ${{ inputs.prometheus-push-url }}
         prometheus_username: ${{ inputs.prometheus-username }}
diff --git a/Taskfile.yml b/Taskfile.yml
index 7f722c814bba..4b347993f6a3 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -203,7 +203,8 @@ tasks:
       END_BLOCK: '{{.END_BLOCK}}'
       LABELS: '{{.LABELS | default ""}}'
       BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
-      METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
+      METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED | default "false"}}'
+      METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED | default "false"}}'
     cmd: |
       CURRENT_STATE_DIR={{.CURRENT_STATE_DIR}} \
       BLOCK_DIR={{.BLOCK_DIR}} \
@@ -213,7 +214,8 @@
       END_BLOCK={{.END_BLOCK}} \
       LABELS={{.LABELS}} \
       BENCHMARK_OUTPUT_FILE={{.BENCHMARK_OUTPUT_FILE}} \
-      METRICS_ENABLED={{.METRICS_ENABLED}} \
+      METRICS_SERVER_ENABLED={{.METRICS_SERVER_ENABLED}} \
+      METRICS_COLLECTOR_ENABLED={{.METRICS_COLLECTOR_ENABLED}} \
       bash -x ./scripts/benchmark_cchain_range.sh
 
   reexecute-cchain-range-with-copied-data:
@@ -228,7 +230,8 @@
       END_BLOCK: '{{.END_BLOCK | default "250000"}}'
       LABELS: '{{.LABELS | default ""}}'
       BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
-      METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
+      METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED | default "false"}}'
+      METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED | default "false"}}'
     cmds:
       - task: import-cchain-reexecute-range
         vars:
@@ -245,7 +248,8 @@
           END_BLOCK: '{{.END_BLOCK}}'
           LABELS: '{{.LABELS}}'
           BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE}}'
-          METRICS_ENABLED: '{{.METRICS_ENABLED}}'
+          METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED}}'
+          METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED}}'
 
   test-bootstrap-monitor-e2e:
     desc: Runs bootstrap monitor e2e tests
diff --git a/scripts/benchmark_cchain_range.sh b/scripts/benchmark_cchain_range.sh
index 3072143da87a..092d02bfe7d6 100755
--- a/scripts/benchmark_cchain_range.sh
+++ b/scripts/benchmark_cchain_range.sh
@@ -10,6 +10,8 @@ set -euo pipefail
 # END_BLOCK: The ending block height (inclusive).
 # LABELS (optional): Comma-separated key=value pairs for metric labels.
 # BENCHMARK_OUTPUT_FILE (optional): If set, benchmark output is also written to this file.
+# METRICS_SERVER_ENABLED (optional): If set, enables the metrics server.
+# METRICS_COLLECTOR_ENABLED (optional): If set, enables the metrics collector.
 
: "${BLOCK_DIR:?BLOCK_DIR must be set}" : "${CURRENT_STATE_DIR:?CURRENT_STATE_DIR must be set}" @@ -25,10 +27,11 @@ cmd="go test -timeout=0 -v -benchtime=1x -bench=BenchmarkReexecuteRange -run=^$ --start-block=\"${START_BLOCK}\" \ --end-block=\"${END_BLOCK}\" \ ${LABELS:+--labels=\"${LABELS}\"} \ - ${METRICS_ENABLED:+--metrics-enabled=\"${METRICS_ENABLED}\"}" + ${METRICS_SERVER_ENABLED:+--metrics-server-enabled=\"${METRICS_SERVER_ENABLED}\"} \ + ${METRICS_COLLECTOR_ENABLED:+--metrics-collector-enabled=\"${METRICS_COLLECTOR_ENABLED}\"}" if [ -n "${BENCHMARK_OUTPUT_FILE:-}" ]; then eval "$cmd" | tee "${BENCHMARK_OUTPUT_FILE}" else eval "$cmd" -fi \ No newline at end of file +fi diff --git a/tests/reexecute/c/README.md b/tests/reexecute/c/README.md index f0f8fe6aa670..07b01d4d8117 100644 --- a/tests/reexecute/c/README.md +++ b/tests/reexecute/c/README.md @@ -42,7 +42,14 @@ export AWS_REGION=us-east-2 ### Metrics Collection -If running with metrics collection, enabled in CI and configured locally with `METRICS_ENABLED=true`, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables. +If running locally, metrics collection can be customized via the following parameters: + +- `METRICS_SERVER_ENABLED`: starts a Prometheus server exporting VM metrics. +- `METRICS_COLLECTOR_ENABLED`: starts a Prometheus collector (if enabled, then `METRICS_SERVER_ENABLED` must be enabled as well). + +When utilizing the metrics collector feature, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables. + +Running the re-execution test in CI will always set `METRICS_SERVER_ENABLED=true` and `METRICS_COLLECTOR_ENABLED=true`. ## Quick Start @@ -230,7 +237,7 @@ The `CONFIG` parameter currently only supports pre-defined configs and not passi The C-Chain benchmarks export VM metrics to the same Grafana instance as AvalancheGo CI: https://grafana-poc.avax-dev.network/. -To export metrics for a local run, simply set the Taskfile variable `METRICS_ENABLED=true` either via environment variable or passing it at the command line. +To export metrics for a local run, simply set the Taskfile variables `METRICS_SERVER_ENABLED=true` and `METRICS_COLLECTOR_ENABLED=true` either via environment variable or passing it at the command line. You can view granular C-Chain processing metrics with the label attached to this job (job="c-chain-reexecution") [here](https://grafana-poc.avax-dev.network/d/Gl1I20mnk/c-chain?orgId=1&from=now-5m&to=now&timezone=browser&var-datasource=P1809F7CD0C75ACF3&var-filter=job%7C%3D%7Cc-chain-reexecution&var-chain=C&refresh=10s). 
diff --git a/tests/reexecute/c/vm_reexecute_test.go b/tests/reexecute/c/vm_reexecute_test.go
index e379d36ddf6b..44b77458217c 100644
--- a/tests/reexecute/c/vm_reexecute_test.go
+++ b/tests/reexecute/c/vm_reexecute_test.go
@@ -6,7 +6,6 @@ package vm
 import (
     "context"
     "encoding/binary"
-    "errors"
     "flag"
     "fmt"
     "maps"
@@ -61,10 +60,12 @@ var (
     startBlockArg    uint64
     endBlockArg      uint64
     chanSizeArg      int
-    metricsEnabledArg bool
     executionTimeout time.Duration
     labelsArg        string
 
+    metricsServerEnabledArg    bool
+    metricsCollectorEnabledArg bool
+
     networkUUID string = uuid.NewString()
     labels             = map[string]string{
         "job": "c-chain-reexecution",
@@ -103,7 +104,8 @@ func TestMain(m *testing.M) {
 
     flag.IntVar(&chanSizeArg, "chan-size", 100, "Size of the channel to use for block processing.")
     flag.DurationVar(&executionTimeout, "execution-timeout", 0, "Benchmark execution timeout. After this timeout has elapsed, terminate the benchmark without error. If 0, no timeout is applied.")
-    flag.BoolVar(&metricsEnabledArg, "metrics-enabled", false, "Enable metrics collection.")
+    flag.BoolVar(&metricsServerEnabledArg, "metrics-server-enabled", false, "Whether to enable the metrics server.")
+    flag.BoolVar(&metricsCollectorEnabledArg, "metrics-collector-enabled", false, "Whether to enable the metrics collector (if true, then metrics-server-enabled must be true as well).")
     flag.StringVar(&labelsArg, "labels", "", "Comma separated KV list of metric labels to attach to all exported metrics. Ex. \"owner=tim,runner=snoopy\"")
 
     predefinedConfigKeys := slices.Collect(maps.Keys(predefinedConfigs))
@@ -117,6 +119,11 @@
 
     flag.Parse()
 
+    if metricsCollectorEnabledArg && !metricsServerEnabledArg {
+        fmt.Fprint(os.Stderr, "metrics collector is enabled but metrics server is disabled.\n")
+        os.Exit(1)
+    }
+
     customLabels, err := parseCustomLabels(labelsArg)
     if err != nil {
         fmt.Fprintf(os.Stderr, "failed to parse labels: %v\n", err)
@@ -150,7 +157,8 @@ func BenchmarkReexecuteRange(b *testing.B) {
             startBlockArg,
             endBlockArg,
             chanSizeArg,
-            metricsEnabledArg,
+            metricsServerEnabledArg,
+            metricsCollectorEnabledArg,
         )
     })
 }
@@ -163,7 +171,8 @@ func benchmarkReexecuteRange(
     startBlock uint64,
     endBlock uint64,
     chanSize int,
-    metricsEnabled bool,
+    metricsServerEnabled bool,
+    metricsCollectorEnabled bool,
 ) {
     r := require.New(b)
     ctx := context.Background()
@@ -182,8 +191,12 @@ func benchmarkReexecuteRange(
 
     log := tests.NewDefaultLogger("c-chain-reexecution")
 
-    if metricsEnabled {
-        collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels)
+    if metricsServerEnabled {
+        serverAddr := startServer(b, log, prefixGatherer)
+
+        if metricsCollectorEnabled {
+            startCollector(b, log, "c-chain-reexecution", labels, serverAddr)
+        }
     }
 
     var (
@@ -546,9 +559,33 @@ func newConsensusMetrics(registry prometheus.Registerer) (*consensusMetrics, err
     return m, nil
 }
 
-// collectRegistry starts prometheus and collects metrics from the provided gatherer.
-// Attaches the provided labels + GitHub labels if available to the collected metrics.
-func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer prometheus.Gatherer, labels map[string]string) {
+// startServer starts a Prometheus server for the provided gatherer and returns
+// the server address.
+func startServer(
+    tb testing.TB,
+    log logging.Logger,
+    gatherer prometheus.Gatherer,
+) string {
+    r := require.New(tb)
+
+    server, err := tests.NewPrometheusServer(gatherer)
+    r.NoError(err)
+
+    log.Info("metrics endpoint available",
+        zap.String("url", fmt.Sprintf("http://%s/ext/metrics", server.Address())),
+    )
+
+    tb.Cleanup(func() {
+        r.NoError(server.Stop())
+    })
+
+    return server.Address()
+}
+
+// startCollector starts a Prometheus collector configured to scrape the server
+// listening on serverAddr. startCollector also attaches the provided labels +
+// GitHub labels if available to the collected metrics.
+func startCollector(tb testing.TB, log logging.Logger, name string, labels map[string]string, serverAddr string) {
     r := require.New(tb)
 
     startPromCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
@@ -557,32 +594,23 @@ func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer pr
     logger := tests.NewDefaultLogger("prometheus")
     r.NoError(tmpnet.StartPrometheus(startPromCtx, logger))
 
-    server, err := tests.NewPrometheusServer(gatherer)
-    r.NoError(err)
-
     var sdConfigFilePath string
     tb.Cleanup(func() {
         // Ensure a final metrics scrape.
         // This default delay is set above the default scrape interval used by StartPrometheus.
         time.Sleep(tmpnet.NetworkShutdownDelay)
 
-        r.NoError(errors.Join(
-            server.Stop(),
-            func() error {
-                if sdConfigFilePath != "" {
-                    return os.Remove(sdConfigFilePath)
-                }
-                return nil
-            }(),
-        ))
+        if sdConfigFilePath != "" {
+            r.NoError(os.Remove(sdConfigFilePath))
+        }
 
         checkMetricsCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
         defer cancel()
         r.NoError(tmpnet.CheckMetricsExist(checkMetricsCtx, logger, networkUUID))
     })
 
-    sdConfigFilePath, err = tmpnet.WritePrometheusSDConfig(name, tmpnet.SDConfig{
-        Targets: []string{server.Address()},
+    sdConfigFilePath, err := tmpnet.WritePrometheusSDConfig(name, tmpnet.SDConfig{
+        Targets: []string{serverAddr},
         Labels: labels,
     }, true /* withGitHubLabels */)
     r.NoError(err)
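When only the metrics server is enabled, nothing scrapes the registry automatically; the endpoint logged by startServer can instead be checked by hand. A sketch, with a placeholder address standing in for the host:port printed in the "metrics endpoint available" log line:

```bash
# Placeholder address; substitute the host:port from the benchmark's log output.
ADDR="127.0.0.1:45123"
curl -s "http://${ADDR}/ext/metrics" | head -n 20
```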