From 9da7f6ec67267a00946f4867e4b1f094f82acdf1 Mon Sep 17 00:00:00 2001 From: Chengdong Li Date: Thu, 24 Apr 2025 16:51:14 +0800 Subject: [PATCH 1/2] GPU: fix missing libnvidia-ml.so error. --- README.md | 243 +++++++++++++++++-------------- crates/psh-system/src/error.rs | 2 +- crates/psh-system/src/gpu/raw.rs | 38 ++++- src/otlp/gauges/gpu.rs | 9 +- 4 files changed, 176 insertions(+), 116 deletions(-) diff --git a/README.md b/README.md index 539759e..24553f1 100644 --- a/README.md +++ b/README.md @@ -1,107 +1,136 @@ -# Performance Savior Home (PSH) - -[![image](https://img.shields.io/github/v/release/OptimatistOpenSource/psh?include_prereleases&color=blue)](https://github.com/OptimatistOpenSource/psh/releases) -[![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](http://www.gnu.org/licenses/lgpl-3.0) -[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](http://www.gnu.org/licenses/gpl-3.0) -[![image](https://img.shields.io/github/stars/OptimatistOpenSource/psh)](https://github.com/OptimatistOpenSource/psh/stargazers) -[![image](https://img.shields.io/github/issues/OptimatistOpenSource/psh)](https://github.com/OptimatistOpenSource/psh/issues) - -Performance Savior Home (PSH) collects software and hardware performance data when the cloud service is running. - -PSH's layout has WASM sitting at the top tier, while the foundation is made up -of operators responsible for scooping up performance stats, utilizing tech like -eBPF and the perf_event_open interface. This setup brings both a secure -environment and user-friendliness to the table, making it a breeze to work with -while keeping things locked down tight. - -It protects both the performance acquisition and computation algorithms of performance engineers and the sensitive data of companies applying PSH. - -## Overview - -Performance Savior Home (PSH) is a cutting-edge performance monitoring and analytics solution designed for cloud services. 
-It securely harvests software and hardware performance metrics while your cloud applications are in operation, safeguarding both the intricate performance tuning algorithms of engineers and the sensitive corporate data of its adopters. - -PSH achieves this through a dual-layered architecture leveraging WebAssembly (WASM) at the top and an array of robust operators at its foundation. - -PSH encapsulates low-level performance monitoring capabilities within WASM, -streamlining the development of performance collection tools with simplicity and -grace. Built with Rust, PSH inherently boasts memory safety, further enhancing -its robustness and reliability in high-stakes environments. - -PSH's vision is to reduce the duplication of construction within the enterprise -and to collect performance data in a reliable, low-overhead, and elegant way. - -## Key Features - -- **Secure Sandboxing**: Leverages WASM to create a secure sandbox for - performance data acquisition and processing algorithms, ensuring isolation and - preventing unauthorized access. Permission control ensures that sensitive data - is not collected, while WASM's performance data processing algorithms are - easier to protect. -- **Low-Level Insights**: PSH harnesses eBPF and perf_event_open to gather - detailed, real-time performance metrics from both software and hardware - levels, encompassing a wide spectrum of metrics across various system layers. - The result is a 360-degree view of your application's performance footprint. -- **Cross-Platform Compatibility**: PSH is designed from the ground up with - performance data acquisition and analysis for the ARM platform in mind, and is - compatible with both x86_64 and RISC-V architectures. -- **Highly Scalable Architecture**: PSH is designed for effortless scalability, - allowing users to easily extend both the algorithms executed within the WASM - environment and the range of performance events captured by operators. 
This - flexibility ensures that as technology stacks evolve or new monitoring - requirements arise, PSH can be adapted swiftly to meet those needs, - future-proofing your performance monitoring strategy. -- **Minimal Performance Overhead**: Preliminary testing indicates that PSH's - data collection incurs a negligible operational overhead, with current - measurements suggesting an impact of merely around 3%. This ensures that while - comprehensive monitoring is in place, the system's primary functions remain - unaffected, preserving optimal performance and responsiveness. - -## Config - -The default config is located in `/etc/psh/config.toml`. - -See [config template](./doc/config.toml) - -## Contribution Guide - -We welcome contributions! Please refer to the following guide for details on how -to get involved. - -Before submitting a pull request (PR) to PSH, it's crucial to perform a -self-check to ensure the quality and adherence to coding standards. Follow these -steps for an effective self-check: - -- Run Clippy: Execute `cargo clippy`, a lint tool for Rust designed to catch - common mistakes and enhance the overall quality of your Rust code. - -- Format Code: Utilize `cargo fmt` to format your Rust code, ensuring - consistency in code formatting across the project. - -- Security Audit: Employ `cargo audit` to enhance the security of your Rust - code. This command reviews your dependencies for any security vulnerabilities - reported to the RustSec Advisory Database. If you haven't installed - `cargo-audit` yet, you can do so by running `cargo install cargo-audit`. - -Failing to adhere to these self-check steps might result in your PR not being -reviewed promptly. Without completing these checks, the chances of finding a -reviewer willing to assess your PR may be reduced. Therefore, it is essential to -diligently follow the outlined steps to increase the likelihood of a successful -and timely review for your pull request. 
- -## Acknowledgments - -The development of the Performance Savior Home (PSH) project can be attributed -to the collaborative efforts and shared vision of Optimatsit Technology Co., Ltd -and Zhejiang University's -[SPAIL – System Performance Analytics Intelligence Lab](https://github.com/ZJU-SPAIL). - -

- - -

- -## License - -Performance Savior Home is distributed under the terms of the LGPL3.0/GPL3.0 -License. +# Performance Savior Home (PSH) + +[![image](https://img.shields.io/github/v/release/OptimatistOpenSource/psh?include_prereleases&color=blue)](https://github.com/OptimatistOpenSource/psh/releases) +[![License: LGPL v3](https://img.shields.io/badge/License-LGPL%20v3-blue.svg)](http://www.gnu.org/licenses/lgpl-3.0) +[![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](http://www.gnu.org/licenses/gpl-3.0) +[![image](https://img.shields.io/github/stars/OptimatistOpenSource/psh)](https://github.com/OptimatistOpenSource/psh/stargazers) +[![image](https://img.shields.io/github/issues/OptimatistOpenSource/psh)](https://github.com/OptimatistOpenSource/psh/issues) + +Performance Savior Home (PSH) collects software and hardware performance data when the cloud service is running. + +PSH's layout has WASM sitting at the top tier, while the foundation is made up +of operators responsible for scooping up performance stats, utilizing tech like +eBPF and the perf_event_open interface. This setup brings both a secure +environment and user-friendliness to the table, making it a breeze to work with +while keeping things locked down tight. + +It protects both the performance acquisition and computation algorithms of performance engineers and the sensitive data of companies applying PSH. + +## Overview + +Performance Savior Home (PSH) is a cutting-edge performance monitoring and analytics solution designed for cloud services. +It securely harvests software and hardware performance metrics while your cloud applications are in operation, safeguarding both the intricate performance tuning algorithms of engineers and the sensitive corporate data of its adopters. + +PSH achieves this through a dual-layered architecture leveraging WebAssembly (WASM) at the top and an array of robust operators at its foundation. 
+ +PSH encapsulates low-level performance monitoring capabilities within WASM, +streamlining the development of performance collection tools with simplicity and +grace. Built with Rust, PSH inherently boasts memory safety, further enhancing +its robustness and reliability in high-stakes environments. + +PSH's vision is to reduce the duplication of construction within the enterprise +and to collect performance data in a reliable, low-overhead, and elegant way. + +## Key Features + +- **Secure Sandboxing**: Leverages WASM to create a secure sandbox for + performance data acquisition and processing algorithms, ensuring isolation and + preventing unauthorized access. Permission control ensures that sensitive data + is not collected, while WASM's performance data processing algorithms are + easier to protect. +- **Low-Level Insights**: PSH harnesses eBPF and perf_event_open to gather + detailed, real-time performance metrics from both software and hardware + levels, encompassing a wide spectrum of metrics across various system layers. + The result is a 360-degree view of your application's performance footprint. +- **Cross-Platform Compatibility**: PSH is designed from the ground up with + performance data acquisition and analysis for the ARM platform in mind, and is + compatible with both x86_64 and RISC-V architectures. +- **Highly Scalable Architecture**: PSH is designed for effortless scalability, + allowing users to easily extend both the algorithms executed within the WASM + environment and the range of performance events captured by operators. This + flexibility ensures that as technology stacks evolve or new monitoring + requirements arise, PSH can be adapted swiftly to meet those needs, + future-proofing your performance monitoring strategy. +- **Minimal Performance Overhead**: Preliminary testing indicates that PSH's + data collection incurs a negligible operational overhead, with current + measurements suggesting an impact of merely around 3%. 
This ensures that while + comprehensive monitoring is in place, the system's primary functions remain + unaffected, preserving optimal performance and responsiveness. + +## Config + +The default config is located in `/etc/psh/config.toml`. + +See [config template](./doc/config.toml) + +## Contribution Guide + +We welcome contributions! Please refer to the following guide for details on how +to get involved. + +Before submitting a pull request (PR) to PSH, it's crucial to perform a +self-check to ensure the quality and adherence to coding standards. Follow these +steps for an effective self-check: + +- Run Clippy: Execute `cargo clippy`, a lint tool for Rust designed to catch + common mistakes and enhance the overall quality of your Rust code. + +- Format Code: Utilize `cargo fmt` to format your Rust code, ensuring + consistency in code formatting across the project. + +- Security Audit: Employ `cargo audit` to enhance the security of your Rust + code. This command reviews your dependencies for any security vulnerabilities + reported to the RustSec Advisory Database. If you haven't installed + `cargo-audit` yet, you can do so by running `cargo install cargo-audit`. + +Failing to adhere to these self-check steps might result in your PR not being +reviewed promptly. Without completing these checks, the chances of finding a +reviewer willing to assess your PR may be reduced. Therefore, it is essential to +diligently follow the outlined steps to increase the likelihood of a successful +and timely review for your pull request. + +## Known issues: +### Warning: Failed to initialize NVML with all available methods +PSH requires NVIDIA Management Library (NVML) to collect GPU statistics. This warning appears when PSH cannot find or initialize the NVML library. PSH attempts to initialize NVML in the following order: + +1. Default system path +2. 
Architecture-specific paths: + - For x86_64: + - `/usr/lib/x86_64-linux-gnu/libnvidia-ml.so` + - `/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1` + - For ARM64: + - `/usr/lib/aarch64-linux-gnu/libnvidia-ml.so` + - `/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1` + +To resolve this issue, you can: + +1. Install the NVIDIA driver if not already installed +2. If the library is installed in a non-standard location, add it to `LD_LIBRARY_PATH`: + ```bash + export LD_LIBRARY_PATH=/path/to/nvidia/lib:$LD_LIBRARY_PATH + ``` +3. Create a symbolic link to the library in the standard path for your architecture: + ```bash + # For x86_64 + sudo ln -s /path/to/libnvidia-ml.so /usr/lib/x86_64-linux-gnu/libnvidia-ml.so + + # For ARM64 + sudo ln -s /path/to/libnvidia-ml.so /usr/lib/aarch64-linux-gnu/libnvidia-ml.so + ``` + +## Acknowledgments + +The development of the Performance Savior Home (PSH) project can be attributed +to the collaborative efforts and shared vision of Optimatist Technology Co., Ltd +and Zhejiang University's +[SPAIL – System Performance Analytics Intelligence Lab](https://github.com/ZJU-SPAIL). +

+ + +

+ +## License + +Performance Savior Home is distributed under the terms of the LGPL3.0/GPL3.0 +License. diff --git a/crates/psh-system/src/error.rs b/crates/psh-system/src/error.rs index 7c0d57d..308b0b0 100644 --- a/crates/psh-system/src/error.rs +++ b/crates/psh-system/src/error.rs @@ -33,7 +33,7 @@ pub enum Error { InvalidCpuMask(String), #[error("Value is empty")] EmptyValue, - #[error("Failed to init nvml: {0}.")] + #[error(transparent)] Nvml(#[from] NvmlError), } diff --git a/crates/psh-system/src/gpu/raw.rs b/crates/psh-system/src/gpu/raw.rs index 38a82da..ef2e9a6 100644 --- a/crates/psh-system/src/gpu/raw.rs +++ b/crates/psh-system/src/gpu/raw.rs @@ -1,4 +1,4 @@ -use std::sync::LazyLock; +use std::{ffi::OsStr, path::Path, sync::LazyLock}; use nvml_wrapper::{Nvml, enum_wrappers::device::TemperatureSensor, error::NvmlError}; @@ -6,12 +6,38 @@ use crate::error::Result; use super::{GpuInfo, GpuStats}; -static NVML: LazyLock> = LazyLock::new(|| match Nvml::init() { - Ok(n) => Some(n), - Err(e) => { - tracing::warn!("{e}"); - None +#[cfg(target_arch = "x86_64")] +const LIB_PATHS: &[&str] = &[ + "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so", + "/usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1", +]; + +#[cfg(target_arch = "aarch64")] +const LIB_PATHS: &[&str] = &[ + "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so", + "/usr/lib/aarch64-linux-gnu/libnvidia-ml.so.1", +]; + +#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))] +const LIB_PATHS: &[&str] = &[]; + +static NVML: LazyLock> = LazyLock::new(|| { + // First try default initialization + if let Ok(nvml) = Nvml::init() { + return Some(nvml); } + + // If default fails, try with specific library paths + for path in LIB_PATHS { + if Path::new(path).exists() { + if let Ok(nvml) = Nvml::builder().lib_path(OsStr::new(path)).init() { + return Some(nvml); + } + } + } + + tracing::warn!("Failed to initialize NVML with all available methods"); + None }); pub fn gpu_info() -> Result { diff --git a/src/otlp/gauges/gpu.rs 
b/src/otlp/gauges/gpu.rs index 5a9945b..f846043 100644 --- a/src/otlp/gauges/gpu.rs +++ b/src/otlp/gauges/gpu.rs @@ -14,6 +14,7 @@ use opentelemetry::{KeyValue, metrics::ObservableGauge}; use psh_system::gpu::NvidiaHandle; +use tracing::error; impl super::super::Otlp { pub fn gpu_gauges(&self) -> ObservableGauge { @@ -26,8 +27,12 @@ impl super::super::Otlp { .u64_observable_gauge("NvGpuStat") .with_description("System profile nvgpu statistics.") .with_callback(move |gauge| { - let Ok(gpustats) = nvgpu.stat(Some(interval)) else { - return; + let gpustats = match nvgpu.stat(Some(interval)) { + Ok(stats) => stats, + Err(e) => { + error!("Failed to collect GPU stats: {}", e); + return; + } }; for stat in gpustats { From c38953af8e21af4b8892f42307e86b43bc4e2cc7 Mon Sep 17 00:00:00 2001 From: Chengdong Li Date: Thu, 24 Apr 2025 17:04:06 +0800 Subject: [PATCH 2/2] GPU: add more metrics for nvgpu device. --- crates/psh-system/src/gpu/mod.rs | 47 +++++++++- crates/psh-system/src/gpu/raw.rs | 149 ++++++++++++++++++++++++++++--- src/otlp/gauges/gpu.rs | 95 ++++++++++++++++++-- 3 files changed, 267 insertions(+), 24 deletions(-) diff --git a/crates/psh-system/src/gpu/mod.rs b/crates/psh-system/src/gpu/mod.rs index 76b11aa..c99b28d 100644 --- a/crates/psh-system/src/gpu/mod.rs +++ b/crates/psh-system/src/gpu/mod.rs @@ -1,6 +1,7 @@ mod handle; mod raw; +use nvml_wrapper::enum_wrappers::device::MemoryLocation; use nvml_wrapper::struct_wrappers::device::{MemoryInfo, PciInfo, Utilization}; pub use handle::NvidiaHandle; @@ -11,18 +12,56 @@ pub struct GpuInfo { pub cuda_driver_version: i32, } +#[derive(Clone, Debug)] +pub struct EccErrorInfo { + pub location: MemoryLocation, + pub corrected_volatile: u64, + pub corrected_aggregate: u64, + pub uncorrected_volatile: u64, + pub uncorrected_aggregate: u64, +} + #[derive(Clone, Debug)] pub struct GpuStats { + // Static fields (rarely change) pub uuid: String, - /// the vec index is fan index - pub fan_speeds: Vec, - pub vbios_version: 
String, - pub temperature: u32, pub name: String, + pub vbios_version: String, pub pci_info: PciInfo, pub irq_num: u32, pub max_pcie_link_gen: u32, + pub max_pcie_link_width: u32, + + // Dynamic fields (change frequently) + // Temperature and cooling + pub temperature: u32, + pub fan_speeds: Vec, + + // PCIe status pub current_pcie_link_gen: u32, + pub current_pcie_link_width: u32, + + // Performance and utilization pub utilization_rates: Utilization, + pub performance_state: u32, + pub compute_mode: u32, + + // Memory pub memory_info: MemoryInfo, + pub ecc_errors: Vec, + + // Power + pub power_usage: u32, + pub power_limit: u32, + pub enforced_power_limit: u32, + + // Clocks + pub memory_clock: u32, + pub graphics_clock: u32, + pub sm_clock: u32, + pub video_clock: u32, + + // Processes + pub graphics_processes_count: u32, + pub compute_processes_count: u32, } diff --git a/crates/psh-system/src/gpu/raw.rs b/crates/psh-system/src/gpu/raw.rs index ef2e9a6..4bd5a1d 100644 --- a/crates/psh-system/src/gpu/raw.rs +++ b/crates/psh-system/src/gpu/raw.rs @@ -1,10 +1,14 @@ use std::{ffi::OsStr, path::Path, sync::LazyLock}; -use nvml_wrapper::{Nvml, enum_wrappers::device::TemperatureSensor, error::NvmlError}; +use nvml_wrapper::{ + Nvml, + enum_wrappers::device::{Clock, EccCounter, MemoryError, MemoryLocation, TemperatureSensor}, + error::NvmlError, +}; use crate::error::Result; -use super::{GpuInfo, GpuStats}; +use super::{EccErrorInfo, GpuInfo, GpuStats}; #[cfg(target_arch = "x86_64")] const LIB_PATHS: &[&str] = &[ @@ -56,24 +60,145 @@ pub fn gpu_stats() -> Result> { let mut var = Vec::with_capacity(count as usize); for i in 0..count { let device = nvml.device_by_index(i)?; + + // Static fields + let uuid = device.uuid()?; + let name = device.name()?; + let vbios_version = device.vbios_version()?; + let pci_info = device.pci_info()?; + let irq_num = device.irq_num()?; + let max_pcie_link_gen = device.max_pcie_link_gen()?; + let max_pcie_link_width = 
device.max_pcie_link_width()?; + + // Temperature and cooling + let temperature = device.temperature(TemperatureSensor::Gpu)?; let num_fans = device.num_fans()?; let mut fan_speeds = Vec::with_capacity(num_fans as usize); for i in 0..num_fans { fan_speeds.push(device.fan_speed(i)?); } + // PCIe status + let current_pcie_link_gen = device.current_pcie_link_gen()?; + let current_pcie_link_width = device.current_pcie_link_width()?; + + // Performance and utilization + let utilization_rates = device.utilization_rates()?; + let performance_state = device.performance_state()? as u32; + let compute_mode = device.compute_mode()? as u32; + + // Memory + let memory_info = device.memory_info()?; + let mut ecc_errors = Vec::new(); + match device.is_ecc_enabled() { + Ok(_) => { + let locations = [ + MemoryLocation::L1Cache, + MemoryLocation::L2Cache, + MemoryLocation::Device, + MemoryLocation::RegisterFile, + MemoryLocation::Texture, + MemoryLocation::Shared, + MemoryLocation::Cbu, + MemoryLocation::SRAM, + ]; + + for location in locations { + let corrected_volatile = device.memory_error_counter( + MemoryError::Corrected, + EccCounter::Volatile, + location.clone(), + )?; + let corrected_aggregate = device.memory_error_counter( + MemoryError::Corrected, + EccCounter::Aggregate, + location.clone(), + )?; + let uncorrected_volatile = device.memory_error_counter( + MemoryError::Uncorrected, + EccCounter::Volatile, + location.clone(), + )?; + let uncorrected_aggregate = device.memory_error_counter( + MemoryError::Uncorrected, + EccCounter::Aggregate, + location.clone(), + )?; + + ecc_errors.push(EccErrorInfo { + location: location.clone(), + corrected_volatile, + corrected_aggregate, + uncorrected_volatile, + uncorrected_aggregate, + }); + } + } + Err(NvmlError::NotSupported) => { + // device does not support ECC + } + Err(e) => { + tracing::warn!("Failed to collect ecc errors: {}", e); + return Err(e.into()); + } + } + + // Power + let power_usage = device.power_usage()?; + let 
power_limit = device.power_management_limit()?; + let enforced_power_limit = device.enforced_power_limit()?; + + // Clocks + let memory_clock = device.clock_info(Clock::Memory)?; + let graphics_clock = device.clock_info(Clock::Graphics)?; + let sm_clock = device.clock_info(Clock::SM)?; + let video_clock = device.clock_info(Clock::Video)?; + + // Processes + let graphics_processes_count = device.running_graphics_processes_count()?; + let compute_processes_count = device.running_compute_processes_count()?; + var.push(GpuStats { - uuid: device.uuid()?, + // Static fields + uuid, + name, + vbios_version, + pci_info, + irq_num, + max_pcie_link_gen, + max_pcie_link_width, + + // Temperature and cooling + temperature, fan_speeds, - vbios_version: device.vbios_version()?, - temperature: device.temperature(TemperatureSensor::Gpu)?, - name: device.name()?, - pci_info: device.pci_info()?, - irq_num: device.irq_num()?, - max_pcie_link_gen: device.max_pcie_link_gen()?, - current_pcie_link_gen: device.current_pcie_link_gen()?, - utilization_rates: device.utilization_rates()?, - memory_info: device.memory_info()?, + + // PCIe status + current_pcie_link_gen, + current_pcie_link_width, + + // Performance and utilization + utilization_rates, + performance_state, + compute_mode, + + // Memory + memory_info, + ecc_errors, + + // Power + power_usage, + power_limit, + enforced_power_limit, + + // Clocks + memory_clock, + graphics_clock, + sm_clock, + video_clock, + + // Processes + graphics_processes_count, + compute_processes_count, }); } diff --git a/src/otlp/gauges/gpu.rs b/src/otlp/gauges/gpu.rs index f846043..0dc8995 100644 --- a/src/otlp/gauges/gpu.rs +++ b/src/otlp/gauges/gpu.rs @@ -36,31 +36,96 @@ impl super::super::Otlp { }; for stat in gpustats { - let vals = [ + // Report per device metrics + let device_vals = [ + // Static fields (stat.irq_num.into(), KeyValue::new("stat", "irq_num")), + ( + stat.max_pcie_link_gen.into(), + KeyValue::new("stat", "max_pcie_link_gen"), + ), + 
( + stat.max_pcie_link_width.into(), + KeyValue::new("stat", "max_pcie_link_width"), + ), + // Temperature and cooling ( stat.temperature.into(), KeyValue::new("stat", "temperature"), ), + // PCIe status ( - stat.max_pcie_link_gen.into(), - KeyValue::new("stat", "max_pcie_link_gen"), + stat.current_pcie_link_gen.into(), + KeyValue::new("stat", "current_pcie_link_gen"), + ), + ( + stat.current_pcie_link_width.into(), + KeyValue::new("stat", "current_pcie_link_width"), + ), + // Performance and utilization + ( + stat.utilization_rates.gpu.into(), + KeyValue::new("stat", "utilization_rates_gpu"), + ), + ( + stat.utilization_rates.memory.into(), + KeyValue::new("stat", "utilization_rates_memory"), ), + ( + stat.performance_state.into(), + KeyValue::new("stat", "performance_state"), + ), + ( + stat.compute_mode.into(), + KeyValue::new("stat", "compute_mode"), + ), + // Memory ( stat.memory_info.total, KeyValue::new("stat", "memory_total"), ), (stat.memory_info.used, KeyValue::new("stat", "memory_used")), + (stat.memory_info.free, KeyValue::new("stat", "memory_free")), + // Power ( - stat.utilization_rates.memory.into(), - KeyValue::new("stat", "utilization_rates_memory"), + stat.power_usage.into(), + KeyValue::new("stat", "power_usage"), ), ( - stat.utilization_rates.gpu.into(), - KeyValue::new("stat", "utilization_rates_gpu"), + stat.power_limit.into(), + KeyValue::new("stat", "power_limit"), + ), + ( + stat.enforced_power_limit.into(), + KeyValue::new("stat", "enforced_power_limit"), + ), + // Clocks + ( + stat.memory_clock.into(), + KeyValue::new("stat", "memory_clock"), + ), + ( + stat.graphics_clock.into(), + KeyValue::new("stat", "graphics_clock"), + ), + (stat.sm_clock.into(), KeyValue::new("stat", "sm_clock")), + ( + stat.video_clock.into(), + KeyValue::new("stat", "video_clock"), + ), + // Processes + ( + stat.graphics_processes_count.into(), + KeyValue::new("stat", "graphics_processes_count"), + ), + ( + stat.compute_processes_count.into(), + 
KeyValue::new("stat", "compute_processes_count"), ), ]; - for val in vals.into_iter() { + + // Report all static metrics + for val in device_vals.into_iter() { gauge.observe( val.0, &[ @@ -71,6 +136,20 @@ impl super::super::Otlp { ], ); } + + // Report fan speeds with fan index + for (idx, speed) in stat.fan_speeds.iter().enumerate() { + gauge.observe( + (*speed).into(), + &[ + KeyValue::new("host", host.clone()), + KeyValue::new("uuid", stat.uuid.clone()), + KeyValue::new("name", stat.name.clone()), + KeyValue::new("stat", "fan_speed"), + KeyValue::new("fan_index", idx.to_string()), + ], + ); + } } }) .build();