|
7 | 7 | import pytest |
8 | 8 |
|
9 | 9 | from aiperf.common.models.telemetry_models import TelemetryMetrics, TelemetryRecord |
| 10 | +from tests.aiperf_mock_server.dcgm_faker import DCGMFaker |
10 | 11 |
|
11 | 12 |
|
12 | 13 | @pytest.fixture |
13 | 14 | def sample_dcgm_data(): |
14 | | - """Sample DCGM metrics data in Prometheus format (single GPU).""" |
15 | | - |
16 | | - return """# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz) |
17 | | -# TYPE DCGM_FI_DEV_SM_CLOCK gauge |
18 | | -DCGM_FI_DEV_SM_CLOCK{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 210 |
19 | | -# HELP DCGM_FI_DEV_MEM_CLOCK Memory clock frequency (in MHz) |
20 | | -# TYPE DCGM_FI_DEV_MEM_CLOCK gauge |
21 | | -DCGM_FI_DEV_MEM_CLOCK{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 405 |
22 | | -# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W) |
23 | | -# TYPE DCGM_FI_DEV_POWER_USAGE gauge |
24 | | -DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 22.582000 |
25 | | -# HELP DCGM_FI_DEV_POWER_MGMT_LIMIT Power management limit (in W) |
26 | | -# TYPE DCGM_FI_DEV_POWER_MGMT_LIMIT gauge |
27 | | -DCGM_FI_DEV_POWER_MGMT_LIMIT{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 300.0 |
28 | | -# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ) |
29 | | -# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter |
30 | | -DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 955287014 |
31 | | -# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %) |
32 | | -# TYPE DCGM_FI_DEV_GPU_UTIL gauge |
33 | | -DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 1 |
34 | | -# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB) |
35 | | -# TYPE DCGM_FI_DEV_FB_USED gauge |
36 | | -DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 46614 |
37 | | -# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB) |
38 | | -# TYPE DCGM_FI_DEV_FB_FREE gauge |
39 | | -DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 2048 |
40 | | -# HELP DCGM_FI_DEV_FB_TOTAL Total framebuffer memory (in MiB) |
41 | | -# TYPE DCGM_FI_DEV_FB_TOTAL gauge |
42 | | -DCGM_FI_DEV_FB_TOTAL{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 48662 |
43 | | -# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory copy utilization (in %) |
44 | | -# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge |
45 | | -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 15 |
46 | | -# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered |
47 | | -# TYPE DCGM_FI_DEV_XID_ERRORS gauge |
48 | | -DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
49 | | -# HELP DCGM_FI_DEV_POWER_VIOLATION Throttling duration due to power constraints (in us) |
50 | | -# TYPE DCGM_FI_DEV_POWER_VIOLATION counter |
51 | | -DCGM_FI_DEV_POWER_VIOLATION{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 12000 |
52 | | -# HELP DCGM_FI_DEV_THERMAL_VIOLATION Throttling duration due to thermal constraints (in us) |
53 | | -# TYPE DCGM_FI_DEV_THERMAL_VIOLATION counter |
54 | | -DCGM_FI_DEV_THERMAL_VIOLATION{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 5000 |
55 | | -""" |
| 15 | + """Sample DCGM metrics from DCGMFaker (single GPU).""" |
| 16 | + |
| 17 | + faker = DCGMFaker( |
| 18 | + gpu_name="rtx6000", |
| 19 | + num_gpus=1, |
| 20 | + seed=42, |
| 21 | + hostname="ed7e7a5e585f", |
| 22 | + initial_load=0.1, |
| 23 | + ) |
| 24 | + return faker.generate() |
56 | 25 |
|
57 | 26 |
|
58 | 27 | @pytest.fixture |
59 | 28 | def multi_gpu_dcgm_data(): |
60 | | - """Sample DCGM metrics data with multiple GPUs.""" |
61 | | - |
62 | | - return """# HELP DCGM_FI_DEV_POWER_USAGE Power draw (in W) |
63 | | -# TYPE DCGM_FI_DEV_POWER_USAGE gauge |
64 | | -DCGM_FI_DEV_POWER_USAGE{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 79.60 |
65 | | -DCGM_FI_DEV_POWER_USAGE{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 42.09 |
66 | | -DCGM_FI_DEV_POWER_USAGE{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 43.99 |
67 | | -# HELP DCGM_FI_DEV_POWER_MGMT_LIMIT Power management limit (in W) |
68 | | -# TYPE DCGM_FI_DEV_POWER_MGMT_LIMIT gauge |
69 | | -DCGM_FI_DEV_POWER_MGMT_LIMIT{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 300.0 |
70 | | -DCGM_FI_DEV_POWER_MGMT_LIMIT{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 300.0 |
71 | | -DCGM_FI_DEV_POWER_MGMT_LIMIT{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 700.0 |
72 | | -# HELP DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION Total energy consumption since boot (in mJ) |
73 | | -# TYPE DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION counter |
74 | | -DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 280000000 |
75 | | -DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 230000000 |
76 | | -DCGM_FI_DEV_TOTAL_ENERGY_CONSUMPTION{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 250000000 |
77 | | -# HELP DCGM_FI_DEV_GPU_UTIL GPU utilization (in %) |
78 | | -# TYPE DCGM_FI_DEV_GPU_UTIL gauge |
79 | | -DCGM_FI_DEV_GPU_UTIL{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 34 |
80 | | -DCGM_FI_DEV_GPU_UTIL{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
81 | | -DCGM_FI_DEV_GPU_UTIL{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 0 |
82 | | -# HELP DCGM_FI_DEV_FB_USED Framebuffer memory used (in MiB) |
83 | | -# TYPE DCGM_FI_DEV_FB_USED gauge |
84 | | -DCGM_FI_DEV_FB_USED{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 15640 |
85 | | -DCGM_FI_DEV_FB_USED{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
86 | | -DCGM_FI_DEV_FB_USED{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 0 |
87 | | -# HELP DCGM_FI_DEV_FB_FREE Framebuffer memory free (in MiB) |
88 | | -# TYPE DCGM_FI_DEV_FB_FREE gauge |
89 | | -DCGM_FI_DEV_FB_FREE{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 33022 |
90 | | -DCGM_FI_DEV_FB_FREE{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 48662 |
91 | | -DCGM_FI_DEV_FB_FREE{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 81920 |
92 | | -# HELP DCGM_FI_DEV_FB_TOTAL Total framebuffer memory (in MiB) |
93 | | -# TYPE DCGM_FI_DEV_FB_TOTAL gauge |
94 | | -DCGM_FI_DEV_FB_TOTAL{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 48662 |
95 | | -DCGM_FI_DEV_FB_TOTAL{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 48662 |
96 | | -DCGM_FI_DEV_FB_TOTAL{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 81920 |
97 | | -# HELP DCGM_FI_DEV_MEM_COPY_UTIL Memory copy utilization (in %) |
98 | | -# TYPE DCGM_FI_DEV_MEM_COPY_UTIL gauge |
99 | | -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 20 |
100 | | -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
101 | | -DCGM_FI_DEV_MEM_COPY_UTIL{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 10 |
102 | | -# HELP DCGM_FI_DEV_XID_ERRORS Value of the last XID error encountered |
103 | | -# TYPE DCGM_FI_DEV_XID_ERRORS gauge |
104 | | -DCGM_FI_DEV_XID_ERRORS{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
105 | | -DCGM_FI_DEV_XID_ERRORS{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
106 | | -DCGM_FI_DEV_XID_ERRORS{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 0 |
107 | | -# HELP DCGM_FI_DEV_POWER_VIOLATION Throttling duration due to power constraints (in us) |
108 | | -# TYPE DCGM_FI_DEV_POWER_VIOLATION counter |
109 | | -DCGM_FI_DEV_POWER_VIOLATION{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 15000 |
110 | | -DCGM_FI_DEV_POWER_VIOLATION{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
111 | | -DCGM_FI_DEV_POWER_VIOLATION{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 8000 |
112 | | -# HELP DCGM_FI_DEV_THERMAL_VIOLATION Throttling duration due to thermal constraints (in us) |
113 | | -# TYPE DCGM_FI_DEV_THERMAL_VIOLATION counter |
114 | | -DCGM_FI_DEV_THERMAL_VIOLATION{gpu="0",UUID="GPU-ef6ef310-f8e2-cef9-036e-8f12d59b5ffc",pci_bus_id="00000000:02:00.0",device="nvidia0",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 3000 |
115 | | -DCGM_FI_DEV_THERMAL_VIOLATION{gpu="1",UUID="GPU-12345678-1234-1234-1234-123456789abc",pci_bus_id="00000000:03:00.0",device="nvidia1",modelName="NVIDIA RTX 6000 Ada Generation",Hostname="ed7e7a5e585f"} 0 |
116 | | -DCGM_FI_DEV_THERMAL_VIOLATION{gpu="2",UUID="GPU-87654321-4321-4321-4321-cba987654321",pci_bus_id="00000000:04:00.0",device="nvidia2",modelName="NVIDIA H100 PCIe",Hostname="ed7e7a5e585f"} 5000 |
117 | | -""" |
| 29 | + """Multi-GPU DCGM metrics from DCGMFaker (3 GPUs, all created with the rtx6000 profile).""" |
| 30 | + |
| 31 | + faker = DCGMFaker( |
| 32 | + gpu_name="rtx6000", |
| 33 | + num_gpus=3, |
| 34 | + seed=42, |
| 35 | + hostname="ed7e7a5e585f", |
| 36 | + initial_load=0.3, |
| 37 | + ) |
| 38 | + return faker.generate() |
118 | 39 |
|
119 | 40 |
|
120 | 41 | @pytest.fixture |
|
0 commit comments