diff --git a/Cargo.lock b/Cargo.lock index 5031e1f..8babd2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -744,6 +744,41 @@ dependencies = [ "libc", ] +[[package]] +name = "darling" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc7f46116c46ff9ab3eb1597a45688b6715c6e628b5c133e288e709a29bcb4ee" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d00b9596d185e565c2207a0b01f8bd1a135483d02d9b7b0a54b11da8d53412e" +dependencies = [ + "fnv", + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.100", +] + +[[package]] +name = "darling_macro" +version = "0.20.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc34b93ccb385b40dc71c6fceac4b2ad23662c7eeb248cf10d529b7e055b6ead" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.100", +] + [[package]] name = "debugid" version = "0.8.0" @@ -1450,6 +1485,12 @@ version = "2.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25a2bc672d1148e28034f176e01fffebb08b35768468cc954630da77a1449005" +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + [[package]] name = "idna" version = "1.0.3" @@ -1876,6 +1917,29 @@ dependencies = [ "libc", ] +[[package]] +name = "nvml-wrapper" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c9bff0aa1d48904a1385ea2a8b97576fbdcbc9a3cfccd0d31fe978e1c4038c5" +dependencies = [ + "bitflags", + "libloading", + "nvml-wrapper-sys", + "static_assertions", + "thiserror 1.0.69", + "wrapcenum-derive", +] + +[[package]] +name = "nvml-wrapper-sys" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "698d45156f28781a4e79652b6ebe2eaa0589057d588d3aec1333f6466f13fcb5" +dependencies = [ + "libloading", +] + [[package]] name = "object" version = "0.36.7" @@ -2226,6 +2290,7 @@ dependencies = [ "local-ip-address", "mimalloc", "nix", + "nvml-wrapper", "opentelemetry", "opentelemetry-otlp", "opentelemetry_sdk", @@ -2258,8 +2323,10 @@ version = "0.0.0" dependencies = [ "anyhow", "num_cpus", + "nvml-wrapper", "procfs", "thiserror 2.0.12", + "tracing", "uname", "which 7.0.2", ] @@ -2813,6 +2880,12 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3" +[[package]] +name = "static_assertions" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f" + [[package]] name = "strsim" version = "0.11.1" @@ -4181,6 +4254,18 @@ dependencies = [ "wast 35.0.2", ] +[[package]] +name = "wrapcenum-derive" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a76ff259533532054cfbaefb115c613203c73707017459206380f03b3b3f266e" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.100", +] + [[package]] name = "write16" version = "1.0.0" diff --git a/Cargo.toml b/Cargo.toml index 620e02e..b0c2684 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ crossbeam = { workspace = true } influxdb-line-protocol = { workspace = true } psh-proto = { workspace = true } mimalloc = { workspace = true } +nvml-wrapper = { workspace = true } [lints] workspace = true @@ -88,6 +89,7 @@ crossbeam = "0.8" influxdb-line-protocol = "2" psh-proto = { git = "https://github.com/OptimatistOpenSource/psh-proto.git", rev = "ca2919053029cb584b478611f8bf8496bf3cf7f7" } mimalloc = "0.1" +nvml-wrapper = "0.10.0" [workspace.lints.rust] diff --git a/crates/psh-system/Cargo.toml b/crates/psh-system/Cargo.toml index a199db2..c0f917d 100644 --- a/crates/psh-system/Cargo.toml +++ b/crates/psh-system/Cargo.toml @@ -9,6 +9,8 @@ procfs = { workspace = true } thiserror = { workspace = true } uname = { workspace = true } which = { workspace = true } +nvml-wrapper = { workspace = true } +tracing = { workspace = true } [dev-dependencies] num_cpus = { workspace = true } diff --git a/crates/psh-system/src/error.rs b/crates/psh-system/src/error.rs index 7aa2ef7..7c0d57d 100644 --- a/crates/psh-system/src/error.rs +++ b/crates/psh-system/src/error.rs @@ -14,6 +14,7 @@ use std::{io, str::Utf8Error}; +use nvml_wrapper::error::NvmlError; use thiserror::Error; #[derive(Debug, Error)] @@ -32,6 +33,8 @@ pub enum Error { InvalidCpuMask(String), #[error("Value is empty")] EmptyValue, + #[error("Failed to init nvml: {0}.")] + Nvml(#[from] NvmlError), } pub type Result = std::result::Result; diff --git a/crates/psh-system/src/gpu/handle.rs b/crates/psh-system/src/gpu/handle.rs new file mode 100644 index 0000000..eba0ba2 --- /dev/null +++ b/crates/psh-system/src/gpu/handle.rs @@ -0,0 +1,43 @@ +use std::{sync::LazyLock, time::Duration}; + +use crate::{error::Result, utils::Handle}; + +use super::{ + GpuInfo, GpuStats, + raw::{gpu_info, gpu_stats}, +}; + +static GPU_INFO_HANDLE: LazyLock> = LazyLock::new(|| Handle::new(gpu_info)); +static GPU_STATS_HANDLE: LazyLock>> = LazyLock::new(|| Handle::new(gpu_stats)); + +#[derive(Clone, Debug)] +pub struct NvidiaHandle { + info: Handle, + stat: Handle>, +} + +impl NvidiaHandle { + pub fn new() -> Self { + Self { + info: GPU_INFO_HANDLE.clone(), + stat: GPU_STATS_HANDLE.clone(), + } + } + + pub fn info(&self) -> Result { + self.info.get(None) + } + + pub fn stat(&self, interval: D) -> Result> + where + D: Into>, + { + self.stat.get(interval.into()) + } +} + +impl Default for NvidiaHandle { + fn default() -> Self { + Self::new() + } +} diff --git a/crates/psh-system/src/gpu/mod.rs b/crates/psh-system/src/gpu/mod.rs new file mode 100644 index 0000000..76b11aa --- /dev/null +++ b/crates/psh-system/src/gpu/mod.rs @@ -0,0 +1,28 @@ +mod handle; +mod raw; + +use nvml_wrapper::struct_wrappers::device::{MemoryInfo, PciInfo, Utilization}; + +pub use handle::NvidiaHandle; + +#[derive(Clone, Debug, Default, PartialEq, Eq, PartialOrd, Ord)] +pub struct GpuInfo { + pub driver_version: String, + pub cuda_driver_version: i32, +} + +#[derive(Clone, Debug)] +pub struct GpuStats { + pub uuid: String, + /// the vec index is fan index + pub fan_speeds: Vec, + pub vbios_version: String, + pub temperature: u32, + pub name: String, + pub pci_info: PciInfo, + pub irq_num: u32, + pub max_pcie_link_gen: u32, + pub current_pcie_link_gen: u32, + pub utilization_rates: Utilization, + pub memory_info: MemoryInfo, +} diff --git a/crates/psh-system/src/gpu/raw.rs b/crates/psh-system/src/gpu/raw.rs new file mode 100644 index 0000000..38a82da --- /dev/null +++ b/crates/psh-system/src/gpu/raw.rs @@ -0,0 +1,55 @@ +use std::sync::LazyLock; + +use nvml_wrapper::{Nvml, enum_wrappers::device::TemperatureSensor, error::NvmlError}; + +use crate::error::Result; + +use super::{GpuInfo, GpuStats}; + +static NVML: LazyLock> = LazyLock::new(|| match Nvml::init() { + Ok(n) => Some(n), + Err(e) => { + tracing::warn!("{e}"); + None + } +}); + +pub fn gpu_info() -> Result { + let nvml = NVML.as_ref().ok_or(NvmlError::Uninitialized)?; + Ok(GpuInfo { + driver_version: nvml.sys_driver_version()?, + cuda_driver_version: nvml.sys_cuda_driver_version()?, + }) +} + +pub fn gpu_stats() -> Result> { + let nvml = NVML.as_ref().ok_or(NvmlError::Uninitialized)?; + + let count = nvml.device_count()?; + + let mut var = Vec::with_capacity(count as usize); + for i in 0..count { + let device = nvml.device_by_index(i)?; + let num_fans = device.num_fans()?; + let mut fan_speeds = Vec::with_capacity(num_fans as usize); + for i in 0..num_fans { + fan_speeds.push(device.fan_speed(i)?); + } + + var.push(GpuStats { + uuid: device.uuid()?, + fan_speeds, + vbios_version: device.vbios_version()?, + temperature: device.temperature(TemperatureSensor::Gpu)?, + name: device.name()?, + pci_info: device.pci_info()?, + irq_num: device.irq_num()?, + max_pcie_link_gen: device.max_pcie_link_gen()?, + current_pcie_link_gen: device.current_pcie_link_gen()?, + utilization_rates: device.utilization_rates()?, + memory_info: device.memory_info()?, + }); + } + + Ok(var) +} diff --git a/crates/psh-system/src/lib.rs b/crates/psh-system/src/lib.rs index 1dd270c..ec2b15a 100644 --- a/crates/psh-system/src/lib.rs +++ b/crates/psh-system/src/lib.rs @@ -15,6 +15,7 @@ pub mod cpu; pub mod disk; pub mod error; +pub mod gpu; pub mod interrupt; pub mod memory; pub mod network;