3 changes: 2 additions & 1 deletion .github/actions/c-chain-reexecution-benchmark/action.yml
@@ -95,7 +95,8 @@ runs:
LABELS=${{ env.LABELS }} \
BENCHMARK_OUTPUT_FILE=${{ env.BENCHMARK_OUTPUT_FILE }} \
RUNNER_NAME=${{ inputs.runner_name }} \
METRICS_ENABLED=true
METRICS_SERVER_ENABLED=true \
METRICS_COLLECTOR_ENABLED=true
prometheus_url: ${{ inputs.prometheus-url }}
prometheus_push_url: ${{ inputs.prometheus-push-url }}
prometheus_username: ${{ inputs.prometheus-username }}
12 changes: 8 additions & 4 deletions Taskfile.yml
@@ -203,7 +203,8 @@ tasks:
END_BLOCK: '{{.END_BLOCK}}'
LABELS: '{{.LABELS | default ""}}'
BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED | default "false"}}'
METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED | default "false"}}'
cmd: |
CURRENT_STATE_DIR={{.CURRENT_STATE_DIR}} \
BLOCK_DIR={{.BLOCK_DIR}} \
@@ -213,7 +214,8 @@
END_BLOCK={{.END_BLOCK}} \
LABELS={{.LABELS}} \
BENCHMARK_OUTPUT_FILE={{.BENCHMARK_OUTPUT_FILE}} \
METRICS_ENABLED={{.METRICS_ENABLED}} \
METRICS_SERVER_ENABLED={{.METRICS_SERVER_ENABLED}} \
METRICS_COLLECTOR_ENABLED={{.METRICS_COLLECTOR_ENABLED}} \
bash -x ./scripts/benchmark_cchain_range.sh

reexecute-cchain-range-with-copied-data:
@@ -228,7 +230,8 @@
END_BLOCK: '{{.END_BLOCK | default "250000"}}'
LABELS: '{{.LABELS | default ""}}'
BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED | default "false"}}'
METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED | default "false"}}'
cmds:
- task: import-cchain-reexecute-range
vars:
@@ -245,7 +248,8 @@
END_BLOCK: '{{.END_BLOCK}}'
LABELS: '{{.LABELS}}'
BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE}}'
METRICS_ENABLED: '{{.METRICS_ENABLED}}'
METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED}}'
METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED}}'

test-bootstrap-monitor-e2e:
desc: Runs bootstrap monitor e2e tests
7 changes: 5 additions & 2 deletions scripts/benchmark_cchain_range.sh
@@ -10,6 +10,8 @@ set -euo pipefail
# END_BLOCK: The ending block height (inclusive).
# LABELS (optional): Comma-separated key=value pairs for metric labels.
# BENCHMARK_OUTPUT_FILE (optional): If set, benchmark output is also written to this file.
# METRICS_SERVER_ENABLED (optional): If set, enables the metrics server.
# METRICS_COLLECTOR_ENABLED (optional): If set, enables the metrics collector.
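# Illustrative invocation (not part of this change; directory paths and the
# block range below are placeholders):
#   BLOCK_DIR=/path/to/blocks CURRENT_STATE_DIR=/path/to/current-state \
#   START_BLOCK=<start> END_BLOCK=<end> \
#   METRICS_SERVER_ENABLED=true METRICS_COLLECTOR_ENABLED=true \
#   ./scripts/benchmark_cchain_range.sh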

: "${BLOCK_DIR:?BLOCK_DIR must be set}"
: "${CURRENT_STATE_DIR:?CURRENT_STATE_DIR must be set}"
@@ -25,10 +27,11 @@ cmd="go test -timeout=0 -v -benchtime=1x -bench=BenchmarkReexecuteRange -run=^$
--start-block=\"${START_BLOCK}\" \
--end-block=\"${END_BLOCK}\" \
${LABELS:+--labels=\"${LABELS}\"} \
${METRICS_ENABLED:+--metrics-enabled=\"${METRICS_ENABLED}\"}"
${METRICS_SERVER_ENABLED:+--metrics-server-enabled=\"${METRICS_SERVER_ENABLED}\"} \
${METRICS_COLLECTOR_ENABLED:+--metrics-collector-enabled=\"${METRICS_COLLECTOR_ENABLED}\"}"

if [ -n "${BENCHMARK_OUTPUT_FILE:-}" ]; then
eval "$cmd" | tee "${BENCHMARK_OUTPUT_FILE}"
else
eval "$cmd"
fi
11 changes: 9 additions & 2 deletions tests/reexecute/c/README.md
@@ -42,7 +42,14 @@ export AWS_REGION=us-east-2

### Metrics Collection

If running with metrics collection, enabled in CI and configured locally with `METRICS_ENABLED=true`, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables.
If running locally, metrics collection can be customized via the following parameters:

- `METRICS_SERVER_ENABLED`: starts a Prometheus server exporting VM metrics.
- `METRICS_COLLECTOR_ENABLED`: starts a Prometheus collector (if enabled, then `METRICS_SERVER_ENABLED` must be enabled as well).
Contributor:

Why isn't this just implicit?

Contributor Author:

On its own, I'm not opposed to METRICS_COLLECTOR_ENABLED=true implicitly setting METRICS_SERVER_ENABLED=true as well. However, considering that this PR will be followed up by #4418 (which adds the ability to configure a port for the metrics server), I think this becomes confusing (i.e. it isn't clear what happens if METRICS_COLLECTOR_ENABLED=true and METRICS_PORT=X without reading the description of METRICS_COLLECTOR_ENABLED).

This could be fixed by renaming METRICS_COLLECTOR_ENABLED to METRICS_SERVER_AND_COLLECTOR_ENABLED, but that looks similar to a previous iteration of this PR, which received this review comment: #4415 (comment)

Contributor:

I'm not going to block on this, but a rethink is definitely suggested. I don't think the comment you linked to suggesting separate flags precludes improving what appears in this PR.

Contributor Author:

Can you elaborate? The only options regarding the flag design are as follows:

- Enabling the collector implicitly enables the server.
- Enabling the collector does not implicitly enable the server.

With this PR choosing the latter, I'm not sure what else needs to be rethought (if you think enabling the collector needs to implicitly enable the server, I'm happy to follow up on that).

Contributor:

I think the issue is the chosen terminology. ENABLE_METRICS_SERVER starts a PrometheusServer instance, and that name could be confused with an actual Prometheus server (one that collects and aggregates metrics). I recommend changing ENABLE_METRICS_SERVER to ENABLE_METRICS_EXPORT and PrometheusServer to MetricsExporter. That way there is an unambiguous relationship between exporting metrics and collecting those exported metrics. Once that change is made, I think it would be a no-brainer to implicitly enable metrics export when collection is enabled rather than requiring that the user explicitly configure it.

Contributor Author:

Thank you for the feedback. Repeating what I said in #4418: enabling the metrics collector in #4418 will implicitly enable the metrics server with a port of 0 (as of commit 830e8c1).


When utilizing the metrics collector feature, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables.

Running the re-execution test in CI will always set `METRICS_SERVER_ENABLED=true` and `METRICS_COLLECTOR_ENABLED=true`.
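
As the `METRICS_COLLECTOR_ENABLED` bullet above notes, the collector depends on the server; with the validation added in `vm_reexecute_test.go`, a combination such as the following fails up front instead of running without an exporter (illustrative task invocation):

```bash
# Rejected: the test binary prints
# "metrics collector is enabled but metrics server is disabled." and exits 1.
task reexecute-cchain-range-with-copied-data \
  METRICS_SERVER_ENABLED=false \
  METRICS_COLLECTOR_ENABLED=true
```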

## Quick Start

@@ -230,7 +237,7 @@ The `CONFIG` parameter currently only supports pre-defined configs and not passi

The C-Chain benchmarks export VM metrics to the same Grafana instance as AvalancheGo CI: https://grafana-poc.avax-dev.network/.

To export metrics for a local run, simply set the Taskfile variable `METRICS_ENABLED=true` either via environment variable or passing it at the command line.
To export metrics for a local run, simply set the Taskfile variables `METRICS_SERVER_ENABLED=true` and `METRICS_COLLECTOR_ENABLED=true` either via environment variable or passing it at the command line.
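
For example, both variables can be passed on the `task` command line or exported as environment variables (an illustrative sketch; any other variables your run needs must still be provided):

```bash
# Variables passed on the command line:
task reexecute-cchain-range-with-copied-data \
  METRICS_SERVER_ENABLED=true \
  METRICS_COLLECTOR_ENABLED=true

# Equivalent, via environment variables:
export METRICS_SERVER_ENABLED=true
export METRICS_COLLECTOR_ENABLED=true
task reexecute-cchain-range-with-copied-data
```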

You can view granular C-Chain processing metrics with the label attached to this job (job="c-chain-reexecution") [here](https://grafana-poc.avax-dev.network/d/Gl1I20mnk/c-chain?orgId=1&from=now-5m&to=now&timezone=browser&var-datasource=P1809F7CD0C75ACF3&var-filter=job%7C%3D%7Cc-chain-reexecution&var-chain=C&refresh=10s).

Expand Down
80 changes: 56 additions & 24 deletions tests/reexecute/c/vm_reexecute_test.go
@@ -6,7 +6,6 @@ package vm
import (
"context"
"encoding/binary"
"errors"
"flag"
"fmt"
"maps"
@@ -61,10 +60,12 @@
startBlockArg uint64
endBlockArg uint64
chanSizeArg int
metricsEnabledArg bool
executionTimeout time.Duration
labelsArg string

metricsServerEnabledArg bool
metricsCollectorEnabledArg bool

networkUUID string = uuid.NewString()
labels = map[string]string{
"job": "c-chain-reexecution",
@@ -103,7 +104,8 @@ func TestMain(m *testing.M) {
flag.IntVar(&chanSizeArg, "chan-size", 100, "Size of the channel to use for block processing.")
flag.DurationVar(&executionTimeout, "execution-timeout", 0, "Benchmark execution timeout. After this timeout has elapsed, terminate the benchmark without error. If 0, no timeout is applied.")

flag.BoolVar(&metricsEnabledArg, "metrics-enabled", false, "Enable metrics collection.")
flag.BoolVar(&metricsServerEnabledArg, "metrics-server-enabled", false, "Whether to enable the metrics server.")
flag.BoolVar(&metricsCollectorEnabledArg, "metrics-collector-enabled", false, "Whether to enable the metrics collector (if true, then metrics-server-enabled must be true as well).")
flag.StringVar(&labelsArg, "labels", "", "Comma separated KV list of metric labels to attach to all exported metrics. Ex. \"owner=tim,runner=snoopy\"")

predefinedConfigKeys := slices.Collect(maps.Keys(predefinedConfigs))
@@ -117,6 +119,11 @@ func TestMain(m *testing.M) {

flag.Parse()

if metricsCollectorEnabledArg && !metricsServerEnabledArg {
fmt.Fprint(os.Stderr, "metrics collector is enabled but metrics server is disabled.\n")
os.Exit(1)
}

customLabels, err := parseCustomLabels(labelsArg)
if err != nil {
fmt.Fprintf(os.Stderr, "failed to parse labels: %v\n", err)
@@ -150,7 +157,8 @@ func BenchmarkReexecuteRange(b *testing.B) {
startBlockArg,
endBlockArg,
chanSizeArg,
metricsEnabledArg,
metricsServerEnabledArg,
metricsCollectorEnabledArg,
)
})
}
@@ -163,7 +171,8 @@ func benchmarkReexecuteRange(
startBlock uint64,
endBlock uint64,
chanSize int,
metricsEnabled bool,
metricsServerEnabled bool,
metricsCollectorEnabled bool,
) {
r := require.New(b)
ctx := context.Background()
@@ -182,8 +191,12 @@

log := tests.NewDefaultLogger("c-chain-reexecution")

if metricsEnabled {
collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels)
if metricsServerEnabled {
serverAddr := startServer(b, log, prefixGatherer)

if metricsCollectorEnabled {
startCollector(b, log, "c-chain-reexecution", labels, serverAddr)
}
}

var (
@@ -546,9 +559,33 @@ func newConsensusMetrics(registry prometheus.Registerer) (*consensusMetrics, err
return m, nil
}

// collectRegistry starts prometheus and collects metrics from the provided gatherer.
// Attaches the provided labels + GitHub labels if available to the collected metrics.
func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer prometheus.Gatherer, labels map[string]string) {
// startServer starts a Prometheus server for the provided gatherer and returns
// the server address.
func startServer(
tb testing.TB,
log logging.Logger,
gatherer prometheus.Gatherer,
) string {
r := require.New(tb)

server, err := tests.NewPrometheusServer(gatherer)
r.NoError(err)

log.Info("metrics endpoint available",
zap.String("url", fmt.Sprintf("http://%s/ext/metrics", server.Address())),
)

tb.Cleanup(func() {
r.NoError(server.Stop())
})

return server.Address()
}

// startCollector starts a Prometheus collector configured to scrape the server
// listening on serverAddr. startCollector also attaches the provided labels +
// Github labels if available to the collected metrics.
func startCollector(tb testing.TB, log logging.Logger, name string, labels map[string]string, serverAddr string) {
r := require.New(tb)

startPromCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
@@ -557,32 +594,27 @@ func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer pr
logger := tests.NewDefaultLogger("prometheus")
r.NoError(tmpnet.StartPrometheus(startPromCtx, logger))

server, err := tests.NewPrometheusServer(gatherer)
r.NoError(err)

var sdConfigFilePath string
tb.Cleanup(func() {
// Ensure a final metrics scrape.
// This default delay is set above the default scrape interval used by StartPrometheus.
time.Sleep(tmpnet.NetworkShutdownDelay)

r.NoError(errors.Join(
server.Stop(),
func() error {
if sdConfigFilePath != "" {
return os.Remove(sdConfigFilePath)
}
return nil
}(),
))
r.NoError(func() error {
if sdConfigFilePath != "" {
return os.Remove(sdConfigFilePath)
}
return nil
}(),
)

checkMetricsCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
defer cancel()
r.NoError(tmpnet.CheckMetricsExist(checkMetricsCtx, logger, networkUUID))
})

sdConfigFilePath, err = tmpnet.WritePrometheusSDConfig(name, tmpnet.SDConfig{
Targets: []string{server.Address()},
sdConfigFilePath, err := tmpnet.WritePrometheusSDConfig(name, tmpnet.SDConfig{
Targets: []string{serverAddr},
Labels: labels,
}, true /* withGitHubLabels */)
r.NoError(err)