Skip to content

Commit 36fa7f3

Browse files
feat(reexecute/c): decouple metrics server and collector (#4415)
1 parent eca3432 commit 36fa7f3

File tree

5 files changed

+80
-33
lines changed

5 files changed

+80
-33
lines changed

.github/actions/c-chain-reexecution-benchmark/action.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,8 @@ runs:
9595
LABELS=${{ env.LABELS }} \
9696
BENCHMARK_OUTPUT_FILE=${{ env.BENCHMARK_OUTPUT_FILE }} \
9797
RUNNER_NAME=${{ inputs.runner_name }} \
98-
METRICS_ENABLED=true
98+
METRICS_SERVER_ENABLED=true \
99+
METRICS_COLLECTOR_ENABLED=true
99100
prometheus_url: ${{ inputs.prometheus-url }}
100101
prometheus_push_url: ${{ inputs.prometheus-push-url }}
101102
prometheus_username: ${{ inputs.prometheus-username }}

Taskfile.yml

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,8 @@ tasks:
203203
END_BLOCK: '{{.END_BLOCK}}'
204204
LABELS: '{{.LABELS | default ""}}'
205205
BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
206-
METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
206+
METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED | default "false"}}'
207+
METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED | default "false"}}'
207208
cmd: |
208209
CURRENT_STATE_DIR={{.CURRENT_STATE_DIR}} \
209210
BLOCK_DIR={{.BLOCK_DIR}} \
@@ -213,7 +214,8 @@ tasks:
213214
END_BLOCK={{.END_BLOCK}} \
214215
LABELS={{.LABELS}} \
215216
BENCHMARK_OUTPUT_FILE={{.BENCHMARK_OUTPUT_FILE}} \
216-
METRICS_ENABLED={{.METRICS_ENABLED}} \
217+
METRICS_SERVER_ENABLED={{.METRICS_SERVER_ENABLED}} \
218+
METRICS_COLLECTOR_ENABLED={{.METRICS_COLLECTOR_ENABLED}} \
217219
bash -x ./scripts/benchmark_cchain_range.sh
218220
219221
reexecute-cchain-range-with-copied-data:
@@ -228,7 +230,8 @@ tasks:
228230
END_BLOCK: '{{.END_BLOCK | default "250000"}}'
229231
LABELS: '{{.LABELS | default ""}}'
230232
BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE | default ""}}'
231-
METRICS_ENABLED: '{{.METRICS_ENABLED | default "false"}}'
233+
METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED | default "false"}}'
234+
METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED | default "false"}}'
232235
cmds:
233236
- task: import-cchain-reexecute-range
234237
vars:
@@ -245,7 +248,8 @@ tasks:
245248
END_BLOCK: '{{.END_BLOCK}}'
246249
LABELS: '{{.LABELS}}'
247250
BENCHMARK_OUTPUT_FILE: '{{.BENCHMARK_OUTPUT_FILE}}'
248-
METRICS_ENABLED: '{{.METRICS_ENABLED}}'
251+
METRICS_SERVER_ENABLED: '{{.METRICS_SERVER_ENABLED}}'
252+
METRICS_COLLECTOR_ENABLED: '{{.METRICS_COLLECTOR_ENABLED}}'
249253

250254
test-bootstrap-monitor-e2e:
251255
desc: Runs bootstrap monitor e2e tests

scripts/benchmark_cchain_range.sh

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,8 @@ set -euo pipefail
1010
# END_BLOCK: The ending block height (inclusive).
1111
# LABELS (optional): Comma-separated key=value pairs for metric labels.
1212
# BENCHMARK_OUTPUT_FILE (optional): If set, benchmark output is also written to this file.
13+
# METRICS_SERVER_ENABLED (optional): If non-empty, passed through as --metrics-server-enabled; set to "true" to enable the metrics server.
14+
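# METRICS_COLLECTOR_ENABLED (optional): If non-empty, passed through as --metrics-collector-enabled; set to "true" to enable the metrics collector (requires METRICS_SERVER_ENABLED=true).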
1315

1416
: "${BLOCK_DIR:?BLOCK_DIR must be set}"
1517
: "${CURRENT_STATE_DIR:?CURRENT_STATE_DIR must be set}"
@@ -25,10 +27,11 @@ cmd="go test -timeout=0 -v -benchtime=1x -bench=BenchmarkReexecuteRange -run=^$
2527
--start-block=\"${START_BLOCK}\" \
2628
--end-block=\"${END_BLOCK}\" \
2729
${LABELS:+--labels=\"${LABELS}\"} \
28-
${METRICS_ENABLED:+--metrics-enabled=\"${METRICS_ENABLED}\"}"
30+
${METRICS_SERVER_ENABLED:+--metrics-server-enabled=\"${METRICS_SERVER_ENABLED}\"} \
31+
${METRICS_COLLECTOR_ENABLED:+--metrics-collector-enabled=\"${METRICS_COLLECTOR_ENABLED}\"}"
2932

3033
if [ -n "${BENCHMARK_OUTPUT_FILE:-}" ]; then
3134
eval "$cmd" | tee "${BENCHMARK_OUTPUT_FILE}"
3235
else
3336
eval "$cmd"
34-
fi
37+
fi

tests/reexecute/c/README.md

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,14 @@ export AWS_REGION=us-east-2
4242

4343
### Metrics Collection
4444

45-
If running with metrics collection, enabled in CI and configured locally with `METRICS_ENABLED=true`, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables.
45+
If running locally, metrics collection can be customized via the following parameters:
46+
47+
- `METRICS_SERVER_ENABLED`: starts a Prometheus server exporting VM metrics.
48+
- `METRICS_COLLECTOR_ENABLED`: starts a Prometheus collector (if enabled, then `METRICS_SERVER_ENABLED` must be enabled as well).
49+
50+
When utilizing the metrics collector feature, follow the instructions in the e2e [README](../../e2e/README.md#monitoring) to set the required Prometheus environment variables.
51+
52+
Running the re-execution test in CI will always set `METRICS_SERVER_ENABLED=true` and `METRICS_COLLECTOR_ENABLED=true`.
4653

4754
## Quick Start
4855

@@ -230,7 +237,7 @@ The `CONFIG` parameter currently only supports pre-defined configs and not passi
230237

231238
The C-Chain benchmarks export VM metrics to the same Grafana instance as AvalancheGo CI: https://grafana-poc.avax-dev.network/.
232239

233-
To export metrics for a local run, simply set the Taskfile variable `METRICS_ENABLED=true` either via environment variable or passing it at the command line.
240+
To export metrics for a local run, set the Taskfile variables `METRICS_SERVER_ENABLED=true` and `METRICS_COLLECTOR_ENABLED=true`, either via environment variables or by passing them at the command line.
234241

235242
You can view granular C-Chain processing metrics with the label attached to this job (job="c-chain-reexecution") [here](https://grafana-poc.avax-dev.network/d/Gl1I20mnk/c-chain?orgId=1&from=now-5m&to=now&timezone=browser&var-datasource=P1809F7CD0C75ACF3&var-filter=job%7C%3D%7Cc-chain-reexecution&var-chain=C&refresh=10s).
236243

tests/reexecute/c/vm_reexecute_test.go

Lines changed: 56 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@ package vm
66
import (
77
"context"
88
"encoding/binary"
9-
"errors"
109
"flag"
1110
"fmt"
1211
"maps"
@@ -61,10 +60,12 @@ var (
6160
startBlockArg uint64
6261
endBlockArg uint64
6362
chanSizeArg int
64-
metricsEnabledArg bool
6563
executionTimeout time.Duration
6664
labelsArg string
6765

66+
metricsServerEnabledArg bool
67+
metricsCollectorEnabledArg bool
68+
6869
networkUUID string = uuid.NewString()
6970
labels = map[string]string{
7071
"job": "c-chain-reexecution",
@@ -103,7 +104,8 @@ func TestMain(m *testing.M) {
103104
flag.IntVar(&chanSizeArg, "chan-size", 100, "Size of the channel to use for block processing.")
104105
flag.DurationVar(&executionTimeout, "execution-timeout", 0, "Benchmark execution timeout. After this timeout has elapsed, terminate the benchmark without error. If 0, no timeout is applied.")
105106

106-
flag.BoolVar(&metricsEnabledArg, "metrics-enabled", false, "Enable metrics collection.")
107+
flag.BoolVar(&metricsServerEnabledArg, "metrics-server-enabled", false, "Whether to enable the metrics server.")
108+
flag.BoolVar(&metricsCollectorEnabledArg, "metrics-collector-enabled", false, "Whether to enable the metrics collector (if true, then metrics-server-enabled must be true as well).")
107109
flag.StringVar(&labelsArg, "labels", "", "Comma separated KV list of metric labels to attach to all exported metrics. Ex. \"owner=tim,runner=snoopy\"")
108110

109111
predefinedConfigKeys := slices.Collect(maps.Keys(predefinedConfigs))
@@ -117,6 +119,11 @@ func TestMain(m *testing.M) {
117119

118120
flag.Parse()
119121

122+
if metricsCollectorEnabledArg && !metricsServerEnabledArg {
123+
fmt.Fprint(os.Stderr, "metrics collector is enabled but metrics server is disabled.\n")
124+
os.Exit(1)
125+
}
126+
120127
customLabels, err := parseCustomLabels(labelsArg)
121128
if err != nil {
122129
fmt.Fprintf(os.Stderr, "failed to parse labels: %v\n", err)
@@ -150,7 +157,8 @@ func BenchmarkReexecuteRange(b *testing.B) {
150157
startBlockArg,
151158
endBlockArg,
152159
chanSizeArg,
153-
metricsEnabledArg,
160+
metricsServerEnabledArg,
161+
metricsCollectorEnabledArg,
154162
)
155163
})
156164
}
@@ -163,7 +171,8 @@ func benchmarkReexecuteRange(
163171
startBlock uint64,
164172
endBlock uint64,
165173
chanSize int,
166-
metricsEnabled bool,
174+
metricsServerEnabled bool,
175+
metricsCollectorEnabled bool,
167176
) {
168177
r := require.New(b)
169178
ctx := context.Background()
@@ -182,8 +191,12 @@ func benchmarkReexecuteRange(
182191

183192
log := tests.NewDefaultLogger("c-chain-reexecution")
184193

185-
if metricsEnabled {
186-
collectRegistry(b, log, "c-chain-reexecution", prefixGatherer, labels)
194+
if metricsServerEnabled {
195+
serverAddr := startServer(b, log, prefixGatherer)
196+
197+
if metricsCollectorEnabled {
198+
startCollector(b, log, "c-chain-reexecution", labels, serverAddr)
199+
}
187200
}
188201

189202
var (
@@ -546,9 +559,33 @@ func newConsensusMetrics(registry prometheus.Registerer) (*consensusMetrics, err
546559
return m, nil
547560
}
548561

549-
// collectRegistry starts prometheus and collects metrics from the provided gatherer.
550-
// Attaches the provided labels + GitHub labels if available to the collected metrics.
551-
func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer prometheus.Gatherer, labels map[string]string) {
562+
// startServer starts a Prometheus server for the provided gatherer and returns
563+
// the server address.
//
// The metrics URL is logged at info level, and a cleanup is registered on tb
// that stops the server once the test or benchmark completes. A failure to
// create or stop the server fails tb through require.
564+
func startServer(
565+
tb testing.TB,
566+
log logging.Logger,
567+
gatherer prometheus.Gatherer,
568+
) string {
569+
r := require.New(tb)
570+
571+
server, err := tests.NewPrometheusServer(gatherer)
572+
r.NoError(err)
573+
574+
log.Info("metrics endpoint available",
575+
zap.String("url", fmt.Sprintf("http://%s/ext/metrics", server.Address())),
576+
)
577+
578+
// tb.Cleanup callbacks run last-added-first, so any cleanup registered after
// this call runs while the server is still up.
tb.Cleanup(func() {
579+
r.NoError(server.Stop())
580+
})
581+
582+
return server.Address()
583+
}
584+
585+
// startCollector starts a Prometheus collector configured to scrape the server
586+
// listening on serverAddr. startCollector also attaches the provided labels +
587+
// GitHub labels if available to the collected metrics.
588+
func startCollector(tb testing.TB, log logging.Logger, name string, labels map[string]string, serverAddr string) {
552589
r := require.New(tb)
553590

554591
startPromCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
@@ -557,32 +594,27 @@ func collectRegistry(tb testing.TB, log logging.Logger, name string, gatherer pr
557594
logger := tests.NewDefaultLogger("prometheus")
558595
r.NoError(tmpnet.StartPrometheus(startPromCtx, logger))
559596

560-
server, err := tests.NewPrometheusServer(gatherer)
561-
r.NoError(err)
562-
563597
var sdConfigFilePath string
564598
tb.Cleanup(func() {
565599
// Ensure a final metrics scrape.
566600
// This default delay is set above the default scrape interval used by StartPrometheus.
567601
time.Sleep(tmpnet.NetworkShutdownDelay)
568602

569-
r.NoError(errors.Join(
570-
server.Stop(),
571-
func() error {
572-
if sdConfigFilePath != "" {
573-
return os.Remove(sdConfigFilePath)
574-
}
575-
return nil
576-
}(),
577-
))
603+
r.NoError(func() error {
604+
if sdConfigFilePath != "" {
605+
return os.Remove(sdConfigFilePath)
606+
}
607+
return nil
608+
}(),
609+
)
578610

579611
checkMetricsCtx, cancel := context.WithTimeout(context.Background(), tests.DefaultTimeout)
580612
defer cancel()
581613
r.NoError(tmpnet.CheckMetricsExist(checkMetricsCtx, logger, networkUUID))
582614
})
583615

584-
sdConfigFilePath, err = tmpnet.WritePrometheusSDConfig(name, tmpnet.SDConfig{
585-
Targets: []string{server.Address()},
616+
sdConfigFilePath, err := tmpnet.WritePrometheusSDConfig(name, tmpnet.SDConfig{
617+
Targets: []string{serverAddr},
586618
Labels: labels,
587619
}, true /* withGitHubLabels */)
588620
r.NoError(err)

0 commit comments

Comments
 (0)