ava-labs · aaronbuchwald · Oct 20, 2025 · Oct 12, 2025 · Oct 13, 2025 · Oct 13, 2025
@@ -95,7 +95,7 @@ runs:
             LABELS=${{ env.LABELS }} \
             BENCHMARK_OUTPUT_FILE=${{ env.BENCHMARK_OUTPUT_FILE }} \
             RUNNER_NAME=${{ inputs.runner_name }} \
-            METRICS_ENABLED=true
+            METRICS_MODE="full"
         prometheus_url: ${{ inputs.prometheus-url }}
         prometheus_push_url: ${{ inputs.prometheus-push-url }}
         prometheus_username: ${{ inputs.prometheus-username }}

@@ -203,7 +203,7 @@ tasks:
       END_BLOCK: '{{.END_BLOCK}}'
       LABELS: '{{.LABELS | default ""}}'
       BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
-      METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
+      METRICS_MODE: '{{.METRICS_MODE | default "disabled"}}'
     cmd: |
       CURRENT_STATE_DIR={{.CURRENT_STATE_DIR}} \
       BLOCK_DIR={{.BLOCK_DIR}} \
@@ -213,7 +213,7 @@ tasks:
       END_BLOCK={{.END_BLOCK}} \
       LABELS={{.LABELS}} \
       BENCHMARK_OUTPUT_FILE={{.BENCHMARK_OUTPUT_FILE}} \
-      METRICS_ENABLED={{.METRICS_ENABLED}} \
+      METRICS_MODE={{.METRICS_MODE}} \
       bash -x ./scripts/benchmark_cchain_range.sh
 
   reexecute-cchain-range-with-copied-data:
@@ -228,7 +228,7 @@ tasks:
       END_BLOCK: '{{.END_BLOCK | default "250000"}}'
       LABELS: '{{.LABELS | default ""}}'
       BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
-      METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
+      METRICS_MODE: '{{.METRICS_MODE | default "disabled"}}'
     cmds:
       - task: import-cchain-reexecute-range
         vars:
@@ -245,7 +245,7 @@ tasks:
           END_BLOCK: '{{.END_BLOCK}}'
           LABELS: '{{.LABELS}}'
           BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE}}'
-          METRICS_ENABLED: '{{.METRICS_ENABLED}}'
+          METRICS_MODE: '{{.METRICS_MODE}}'
 
   test-bootstrap-monitor-e2e:
     desc: Runs bootstrap monitor e2e tests

@@ -25,10 +25,10 @@ cmd="go test -timeout=0 -v -benchtime=1x -bench=BenchmarkReexecuteRange -run=^$
   --start-block=\"${START_BLOCK}\" \
   --end-block=\"${END_BLOCK}\" \
   ${LABELS:+--labels=\"${LABELS}\"} \
-  ${METRICS_ENABLED:+--metrics-enabled=\"${METRICS_ENABLED}\"}"
+  ${METRICS_MODE:+--metrics-mode=\"${METRICS_MODE}\"}"
 
 if [ -n "${BENCHMARK_OUTPUT_FILE:-}" ]; then
   eval "$cmd" | tee "${BENCHMARK_OUTPUT_FILE}"
 else
   eval "$cmd"
-fi
+fi
@@ -42,7 +42,18 @@ export AWS_REGION=us-east-2
 
 ### Metrics Collection
 
-If running with metrics collection, enabled in CI and configured locally with `METRICS_ENABLED=true`, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables.
+If running locally, there are three options for metrics collection:
+
+- `METRICS_MODE=disabled`: no metrics are available.
+- `METRICS_MODE=server-only`: starts a Prometheus server exporting VM metrics. A
+  link to the metrics endpoint is logged during execution.
+- `METRICS_MODE=full`: starts both a Prometheus server exporting VM metrics and
+  a Prometheus collector. A link to the corresponding Grafana dashboard is
+  logged during execution.
+
+When using the `full` option, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables.
+
+CI runs of the re-execution test always set `METRICS_MODE=full`.
 
 ## Quick Start
 
@@ -230,7 +241,7 @@ The `CONFIG` parameter currently only supports pre-defined configs and not passi
 
 The C-Chain benchmarks export VM metrics to the same Grafana instance as AvalancheGo CI: https://grafana-poc.avax-dev.network/.
 
-To export metrics for a local run, simply set the Taskfile variable `METRICS_ENABLED=true` either via environment variable or passing it at the command line.
+To export metrics for a local run, set the Taskfile variable `METRICS_MODE=full`, either via an environment variable or by passing it on the command line.
 
 You can view granular C-Chain processing metrics with the label attached to this job (job="c-chain-reexecution") [here](https://grafana-poc.avax-dev.network/d/Gl1I20mnk/c-chain?orgId=1&from=now-5m&to=now&timezone=browser&var-datasource=P1809F7CD0C75ACF3&var-filter=job%7C%3D%7Cc-chain-reexecution&var-chain=C&refresh=10s).
 

@@ -48,6 +48,12 @@ import (
 	"github.com/ava-labs/avalanchego/vms/platformvm/warp"
 )
 
+const (
+	MetricsDisabled metricsMode = iota
+	MetricsServerOnly
+	MetricsFull
+)
+
 var (
 	mainnetXChainID    = ids.FromStringOrPanic("2oYMBNV4eNHyqk2fjjV5nVQLDbtmNJzq5s3qs3Lo6ftnC6FByM")
 	mainnetCChainID    = ids.FromStringOrPanic("2q9e4r6Mu3U68nU1fYjgbR6JvwrRx36CohpAX5UQxse55x1Q5")
@@ -62,10 +68,11 @@ var (
 	startBlockArg      uint64
 	endBlockArg        uint64
 	chanSizeArg        int
-	metricsEnabledArg  bool
 	executionTimeout   time.Duration
 	labelsArg          string
 
+	metricsModeArg = MetricsDisabled
+
 	networkUUID string = uuid.NewString()
 	labels             = map[string]string{
 		"job":               "c-chain-reexecution",
@@ -94,6 +101,41 @@ var (
 	configBytesArg []byte
 )
 
+type metricsMode int
+
+func (m *metricsMode) Set(s string) error {
+	s = strings.ToLower(strings.TrimSpace(s))
+
+	switch s {
+	case "disabled":
+		*m = MetricsDisabled
+	case "server-only":
+		*m = MetricsServerOnly
+	case "full":
+		*m = MetricsFull
+	default:
+		return fmt.Errorf("invalid metrics mode: %s (valid options: disabled, server-only, full)", s)
+	}
+	return nil
+}
+
+func (m metricsMode) String() string {
+	switch m {
+	case MetricsDisabled:
+		return "disabled"
+	case MetricsServerOnly:
+		return "server-only"
+	case MetricsFull:
+		return "full"
+	default:
+		return "unknown"
+	}
+}
+
+func (m metricsMode) shouldStartServer() bool { return m >= MetricsServerOnly }
+
+func (m metricsMode) shouldStartCollector() bool { return m == MetricsFull }
+
 func TestMain(m *testing.M) {
 	evm.RegisterAllLibEVMExtras()
 
@@ -104,7 +146,7 @@ func TestMain(m *testing.M) {
 	flag.IntVar(&chanSizeArg, "chan-size", 100, "Size of the channel to use for block processing.")
 	flag.DurationVar(&executionTimeout, "execution-timeout", 0, "Benchmark execution timeout. After this timeout has elapsed, terminate the benchmark without error. If 0, no timeout is applied.")
 
-	flag.BoolVar(&metricsEnabledArg, "metrics-enabled", false, "Enable metrics collection.")
+	flag.Var(&metricsModeArg, "metrics-mode", "Metrics mode: disabled (no metrics), server-only (creates Prometheus server), or full (creates Prometheus server and starts Prometheus collector)")
 	flag.StringVar(&labelsArg, "labels", "", "Comma separated KV list of metric labels to attach to all exported metrics. Ex. \"owner=tim,runner=snoopy\"")
 
 	predefinedConfigKeys := slices.Collect(maps.Keys(predefinedConfigs))
@@ -151,7 +193,7 @@ func BenchmarkReexecuteRange(b *testing.B) {
 			startBlockArg,
 			endBlockArg,
 			chanSizeArg,
-			metricsEnabledArg,
+			metricsModeArg,
 		)
 	})
 }
@@ -164,7 +206,7 @@ func benchmarkReexecuteRange(
 	startBlock uint64,
 	endBlock uint64,
 	chanSize int,
-	metricsEnabled bool,
+	metricsMode metricsMode,
 ) {
 	r := require.New(b)
 	ctx := context.Background()
@@ -185,9 +227,8 @@ func benchmarkReexecuteRange(
 	r.NoError(prefixGatherer.Register("avalanche_snowman", consensusRegistry))
 
 	log := tests.NewDefaultLogger("c-chain-reexecution")
-
-	if metricsEnabled {
-		collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels)
+	if metricsMode.shouldStartServer() {
+		collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels, metricsMode.shouldStartCollector())
 	}
 
 	var (
@@ -554,20 +595,40 @@ func newConsensusMetrics(registry prometheus.Registerer) (*consensusMetrics, err
 	return m, nil
 }
 
-// collectRegistry starts prometheus and collects metrics from the provided gatherer.
-// Attaches the provided labels + GitHub labels if available to the collected metrics.
-func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer prometheus.Gatherer, labels map[string]string) {
+// collectRegistry starts a Prometheus server for the provided gatherer. If
+// startCollector is true, it also starts a Prometheus collector configured to
+// scrape the Prometheus server and attaches the provided labels + GitHub
+// labels if available to the collected metrics.
+func collectRegistry(
+	tb testing.TB,
+	log logging.Logger,
+	name string,
+	gatherer prometheus.Gatherer,
+	labels map[string]string,
+	startCollector bool,
+) {
 	r := require.New(tb)
 
+	server, err := tests.NewPrometheusServer(gatherer)
+	r.NoError(err)
+
+	if !startCollector {
+		log.Info("metrics endpoint available",
+			zap.String("url", fmt.Sprintf("http://%s/ext/metrics", server.Address())),
+		)
+
+		tb.Cleanup(func() {
+			r.NoError(server.Stop())
+		})
+		return
+	}
+
 	startPromCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
 	defer cancel()
 
 	logger := tests.NewDefaultLogger("prometheus")
 	r.NoError(tmpnet.StartPrometheus(startPromCtx, logger))
 
-	server, err := tests.NewPrometheusServer(gatherer)
-	r.NoError(err)
-
 	var sdConfigFilePath string
 	tb.Cleanup(func() {
 		// Ensure a final metrics scrape.