-
Notifications
You must be signed in to change notification settings - Fork 185
Expand file tree
/
Copy pathperformance_warehouse_contract.yaml
More file actions
134 lines (127 loc) · 2.85 KB
/
performance_warehouse_contract.yaml
File metadata and controls
134 lines (127 loc) · 2.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# Use this file when defining the warehouse contract that sits behind BenchmarkRun
# and benchmark publication. It is a design template, not a deployment-specific
# manifest.
# Identity keys for the warehouse contract, grouped by scope: keys listed for
# a run, keys that apply only to one workload type, and resource-level keys.
# NOTE(review): nesting was reconstructed from key names after indentation was
# lost; confirm against the consumer's schema.
identity:
  run_keys:
    - run_id
    - benchmark_case_id
    - workload_spec_digest
    - artifact_manifest_digest
    - scheduler_run_id
  # Additional keys that only exist for a given workload type.
  workload_specific_keys:
    training:
      - rank_id
    inference:
      - request_id
      - trace_id
  resource_keys:
    - job_uid
    - pod_uid
    - node_name
    - gpu_uuid
# Raw artifact store settings: backend, URI template, immutability flag,
# retention tier, and the artifact kinds listed as required.
# NOTE(review): nesting was reconstructed from key names after indentation was
# lost; confirm against the consumer's schema.
raw_artifact_store:
  backend: object_storage
  # Quoted so the ${RUN_ID} placeholder is always read as a literal string;
  # NOTE(review): presumably substituted per run by the consumer - confirm.
  uri_template: 's3://benchmarks/raw/${RUN_ID}/'
  immutability: true
  retention_tier: cold
  required_artifacts:
    - logs
    - traces
    - profiler_reports
    - manifests
    - exported_metrics
    - scheduler_events
# Curated warehouse settings: backend choice, fact/dimension table names,
# the attributes listed per stable dimension group, and lineage flags.
# NOTE(review): nesting was reconstructed from key names after indentation was
# lost. In particular, placing stable_dimensions and lineage under
# curated_warehouse (rather than at top level) is an assumption - confirm
# against the consumer's schema.
curated_warehouse:
  # Backend options listed by this template:
  # parquet_duckdb | clickhouse | bigquery | snowflake | iceberg
  backend: parquet_duckdb
  tables:
    facts:
      - benchmark_run_fact
      - serving_outcome_fact
      - training_outcome_fact
      - telemetry_slice_fact
    dimensions:
      - software_version_dim
      - hardware_topology_dim
      - cluster_region_dim
      - workload_dim
      - artifact_lineage_dim
  stable_dimensions:
    software_versions:
      - image_digest
      - driver_version
      - cuda_version
      - nccl_version
      - framework_version
      - runtime_version
    hardware_topology:
      - gpu_model
      - gpu_count
      - pcie_layout
      - nvlink_topology
      - gpu_nic_affinity
      - numa_layout
    cluster_metadata:
      - cluster
      - region
      - availability_zone
      - node_pool
      - scheduler_path
      - resource_flavor
    workload_parameters:
      - model
      - precision
      - protocol
      - deployment_model
      - batching_policy
      - concurrency_point
      - prompt_distribution
      - completion_distribution
  # Boolean flags; the *_column names suggest required digest columns, but
  # their exact meaning is defined by the consumer - TODO confirm.
  lineage:
    published_numbers_traceable_to_raw: true
    manifest_digest_column: true
    workload_spec_digest_column: true
    query_spec_digest_column: true
# Hot metrics settings: backend, retention in days, and a cardinality budget
# with the label names that are allowed and forbidden.
# NOTE(review): nesting was reconstructed from key names after indentation was
# lost. Placing allowed_labels and forbidden_labels under cardinality_budget
# (rather than directly under hot_metrics) is an assumption - confirm against
# the consumer's schema.
hot_metrics:
  backend: prometheus
  retention_days: 30
  cardinality_budget:
    max_active_series: 2000000
    allowed_labels:
      - run_id
      - benchmark_case_id
      - cluster
      - region
      - node_pool
      - gpu_model
      - scheduler_path
      - concurrency_bucket
    # request_id and trace_id are also listed as inference keys under
    # identity.workload_specific_keys; here they are excluded as metric labels.
    forbidden_labels:
      - request_id
      - trace_id
      - prompt_hash
      - arbitrary_user_label
# Telemetry sources grouped by layer: service, infrastructure, and scheduler.
# NOTE(review): nesting was reconstructed from key names after indentation was
# lost; confirm against the consumer's schema.
telemetry_sources:
  service:
    - model_server_metrics
    - request_traces
    - queue_time
  infrastructure:
    - dcgm-exporter
    - nvlink-exporter
    - node-pci-exporter
    - ping-exporter
    - node-problem-detector
    - hpc-verification
  scheduler:
    - kubernetes_events
    - kueue
    - slinky
# Retention length in days for each storage tier (hot, warm, cold).
# hot_days equals hot_metrics.retention_days in this template (both 30).
retention_tiers:
  hot_days: 30
  warm_days: 180
  cold_days: 730
# Names of the scenario playbooks covered by this contract; only the names are
# listed here, not the playbook contents.
scenario_playbooks:
  - tail_latency_regression
  - low_gpu_utilization
  - distributed_straggler
  - scheduler_backpressure