ai-performance-engineering/code/templates/benchmark_run.yaml at main · cfregly/ai-performance-engineering · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
apiVersion: benchmarking.aisp.dev/v1alpha1
kind: BenchmarkRun
metadata:
  name: publication-inference-stack-b200
  labels:
    aisp.dev/benchmark-class: publication_grade
    aisp.dev/owner: performance-engineering
spec:
  intent:
    benchmarkClass: publication_grade
    workloadType: inference
    schedulerPath: slinky-kueue
    cadence: pre_release
  layers:
    - name: micro
      enabled: true
      objective: Isolate subsystem ceilings and regressions before rolling up to user-visible behavior.
      suites:
        - name: nccl-allreduce
          repoCommand: cluster/scripts/run_allreduce_stability.sh --run-id ${RUN_ID} --hosts ${HOSTS} --ssh-user ${SSH_USER}
        - name: fio-storage
          repoCommand: cluster/scripts/run_fio_all_nodes.sh --run-id ${RUN_ID} --hosts ${HOSTS} --ssh-user ${SSH_USER} --runtime 30 --repeats 3
        - name: model-server-kernels
          repoCommand: python -m cli.aisp bench run --targets labs/persistent_decode:persistent_decode --profile deep_dive --single-gpu
    - name: component
      enabled: true
      objective: Measure serving, data pipeline, and control-plane subsystems with stable workload specs.
      suites:
        - name: vllm-concurrency-sweep
          repoCommand: python -m cli.aisp cluster common-eval --preset common-answer-fast
        - name: dataloader
          repoCommand: python -m cli.aisp bench run --targets labs/async_input_pipeline --profile minimal
        - name: job-startup
          repoCommand: python -m cli.aisp cluster common-eval --preset multinode-readiness
    - name: end_to_end
      enabled: true
      objective: Validate realistic workflows after micro and component bottlenecks are understood.
      suites:
        - name: customer-serving-workflow
          repoCommand: python -m cli.aisp cluster common-eval --preset modern-llm
  workload:
    model: openai/gpt-oss-20b
    sequenceLengthMix:
      - inputTokens: 512
        outputTokens: 128
        weight: 0.7
      - inputTokens: 1024
        outputTokens: 256
        weight: 0.3
    precision: bf16
    batchingPolicy: continuous
    concurrencyModel: closed_loop
    datasetRef: eval_datasets/README.md
    fixedAcrossTrials:
      - model
      - sequenceLengthMix
      - precision
      - batchingPolicy
      - concurrencyModel
  comparison:
    variableUnderTest: runtime_version
    baseline:
      artifactRef: cluster/published/current
      description: currently published canonical package
    candidate:
      artifactRef: cluster/runs/${RUN_ID}
      description: run under test
    controls:
      fixed:
        model: openai/gpt-oss-20b
        sequenceLengthMix: "[{512/128@0.7},{1024/256@0.3}]"
        precision: bf16
        batchingPolicy: continuous
        concurrencyModel: closed_loop
      compareOneVariableAtATime: true
  metrics:
    training:
      enabled: false
      primary:
        - time_to_train_hours
        - mfu_pct
        - scaling_efficiency_pct
        - training_reliability_pct
    inference:
      enabled: true
      primary:
        - ttft_ms
        - tokens_per_second
        - p99_latency_ms
        - jitter_ms
        - cost_per_request_usd
        - cost_per_token_usd
      secondary:
        - request_goodput
        - p50_latency_ms
        - gpu_utilization_pct
  trials:
    minReplicates: 5
    confidenceLevel: 0.95
    outlierPolicy:
      method: mad
      action: flag_and_keep
    report:
      distributions: true
      confidenceIntervals: true
      rankLevelOutliers: true
  bottleneckAnalysis:
    taxonomy:
      - compute_bound
      - comm_bound
      - input_bound
      - control_plane_bound
    decomposeOverheads:
      - compute
      - communication
      - storage
      - orchestration
    instrumentation:
      compute_bound:
        - model_server_metrics
        - gpu_counters
        - ncu
      comm_bound:
        - nccl_traces
        - rdma_probes
        - nvlink_exporter
      input_bound:
        - storage_probes
        - dataloader_metrics
        - network_probes
      control_plane_bound:
        - scheduler_timing
        - queue_depth
        - job_startup_trace
  distributed:
    requireRankLevelVisibility: true
    collectives:
      - latency
      - bandwidth
      - outliers
    nodeDiagnosis:
      validate:
        - rdma_pathing
        - gpu_nic_affinity
        - pcie_link_health
        - nvlink_health
        - thermal_throttling
      remediation:
        - isolate_bad_node
        - cordon_problematic_node
      exporters:
        - node-pci-exporter
        - nvlink-exporter
        - gpu-thermal-exporter
  observability:
    correlation:
      stableJoinKeys:
        - run_id
        - benchmark_case_id
        - scheduler_run_id
        - job_uid
        - pod_uid
        - node_name
        - gpu_uuid
        - rank_id
        - request_id
        - trace_id
      publishedNumberLineage:
        rawArtifactManifestDigest: true
        warehouseRowLineage: true
        querySpecCaptured: true
    telemetrySources:
      service:
        - model_server_metrics
        - request_traces
        - queue_time
      infrastructure:
        - dcgm-exporter
        - nvlink-exporter
        - node-pci-exporter
        - ping-exporter
        - node-problem-detector
        - hpc-verification
      scheduler:
        - kubernetes_events
        - kueue
        - slinky
    scenarioPlaybooks:
      - tail_latency_regression
      - low_gpu_utilization
      - distributed_straggler
      - scheduler_backpressure
  provenance:
    capture:
      pinnedWorkloadSpec: true
      imageDigest: true
      driverCudaNcclRuntimeVersions: true
      hardwareTopology: true
      immutableRawArtifacts: true
      auditTrail: true
    signing:
      required: true
      backend: sigstore
      attestationFormat: in_toto
  executionPolicy:
    publicationGrade:
      dedicatedNodes: true
      stableBackgroundLoad: true
      fixedTopology: true
      topologyExclusiveScheduling: true
    realismGrade:
      multiTenantScenarios: false
      captureClusterContext: true
  sinks:
    rawArtifacts:
      store: object_storage
      pathTemplate: s3://benchmarks/raw/${RUN_ID}/
      retentionClass: cold_immutable
      artifacts:
        - logs
        - traces
        - profiler_reports
        - manifests
        - exported_metrics
    hotMetrics:
      store: prometheus
      retentionDays: 30
      cardinalityBudget:
        maxActiveSeries: 2000000
        dropDimensions:
          - request_id
          - trace_id
          - prompt_hash
    curatedWarehouse:
      store: parquet_duckdb
      layout:
        facts:
          - benchmark_run_fact
          - serving_outcome_fact
          - telemetry_slice_fact
        dimensions:
          - software_version_dim
          - hardware_topology_dim
          - cluster_region_dim
          - workload_dim
          - artifact_lineage_dim
      retention:
        hotDays: 30
        warmDays: 180
        coldDays: 730
      lineage:
        publishedNumbersTraceableToRaw: true
        manifestDigestColumn: true
        workloadSpecDigestColumn: true
  automation:
    ci:
      canary: true
      nightly: true
      preRelease: true