Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions src/api/server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,7 @@ pub async fn run_api_mode(args: &ApiArgs) {
let gpu_readers = get_gpu_readers();
let cpu_readers = get_cpu_readers();
let memory_readers = get_memory_readers();
let mut disks = Disks::new_with_refreshed_list();
loop {
let all_gpu_info = gpu_readers
.iter()
Expand All @@ -140,8 +141,9 @@ pub async fn run_api_mode(args: &ApiArgs) {
Vec::new()
};

// Collect disk/storage info (cached in state to avoid per-request collection)
let storage_info = collect_storage_info();
// Refresh disk info in-place instead of creating a new Disks instance
disks.refresh(true);
let storage_info = collect_storage_info_from(&disks);

let mut state = state_clone.write().await;
state.gpu_info = all_gpu_info;
Expand Down Expand Up @@ -409,14 +411,13 @@ fn cleanup_socket(path: &std::path::Path) {
}
}

/// Collect storage/disk information
/// This is called in the background task and cached in AppState
fn collect_storage_info() -> Vec<StorageInfo> {
/// Collect storage/disk information from a pre-existing Disks instance.
/// The caller is responsible for refreshing the instance (e.g. `disks.refresh(true)`) before calling this function.
fn collect_storage_info_from(disks: &Disks) -> Vec<StorageInfo> {
let mut storage_info = Vec::new();
let disks = Disks::new_with_refreshed_list();
let hostname = get_hostname();

let mut filtered_disks = filter_docker_aware_disks(&disks);
let mut filtered_disks = filter_docker_aware_disks(disks);
filtered_disks.sort_by(|a, b| {
a.mount_point()
.to_string_lossy()
Expand Down
77 changes: 55 additions & 22 deletions src/device/readers/nvidia.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,10 @@ pub struct NvidiaGpuReader {
cuda_version: OnceLock<String>,
/// Cached static device information per device index
device_static_info: OnceLock<HashMap<u32, DeviceStaticInfo>>,
/// Cached NVML handle (initialized once, reused across calls)
nvml: Mutex<Option<Nvml>>,
/// Cached System instance for process info (reused across calls)
system: Mutex<System>,
}

impl Default for NvidiaGpuReader {
Expand All @@ -51,6 +55,8 @@ impl NvidiaGpuReader {
driver_version: OnceLock::new(),
cuda_version: OnceLock::new(),
device_static_info: OnceLock::new(),
nvml: Mutex::new(Nvml::init().ok()),
system: Mutex::new(System::new()),
}
}

Expand Down Expand Up @@ -78,6 +84,35 @@ impl NvidiaGpuReader {
.clone()
}

/// Execute a closure with a reference to the cached NVML handle.
/// Reinitializes the handle if it was previously unavailable or became invalid.
///
/// Fast path: if a handle is cached and still answers `device_count()`,
/// the closure runs against it while the mutex is held. Slow path: the
/// handle is (re)initialized via `Nvml::init()`; on init failure the
/// cache is cleared so the next call retries, and the error is returned.
///
/// NOTE(review): the `device_count()` liveness probe is issued on every
/// call — cheap, but not free; confirm it is an acceptable per-poll cost.
fn with_nvml<F, T>(&self, f: F) -> Result<T, NvmlError>
where
    F: FnOnce(&Nvml) -> T,
{
    // A poisoned mutex has no dedicated NvmlError variant; surface it as Unknown.
    let mut guard = self.nvml.lock().map_err(|_| NvmlError::Unknown)?;
    // Try to use existing handle first
    if let Some(ref nvml) = *guard {
        // Validate the handle is still usable by querying device count
        if nvml.device_count().is_ok() {
            return Ok(f(nvml));
        }
        // Handle is stale, drop and reinitialize below
    }
    // Initialize or reinitialize
    match Nvml::init() {
        Ok(nvml) => {
            // Run the closure before caching: `nvml` is moved into the
            // guard afterwards, so the borrow for `f` must end first.
            let result = f(&nvml);
            *guard = Some(nvml);
            Ok(result)
        }
        Err(e) => {
            // Clear the stale handle so a later call attempts a fresh init.
            *guard = None;
            Err(e)
        }
    }
}

/// Get cached static device info for all devices, initializing if needed
fn get_device_static_info(&self, nvml: &Nvml) -> &HashMap<u32, DeviceStaticInfo> {
self.device_static_info.get_or_init(|| {
Expand All @@ -103,6 +138,17 @@ impl NvidiaGpuReader {
})
}

/// Fetch GPU process info through the cached NVML handle; if NVML is
/// unavailable, record the error status for notification and fall back
/// to parsing `nvidia-smi` output instead.
fn get_gpu_processes_cached(&self) -> (Vec<ProcessInfo>, HashSet<u32>) {
    self.with_nvml(get_gpu_processes_nvml)
        .unwrap_or_else(|err| {
            set_nvml_status(err);
            get_gpu_processes_nvidia_smi()
        })
}

/// Get GPU info using NVML with cached static values
fn get_gpu_info_nvml(&self, nvml: &Nvml) -> Vec<GpuInfo> {
let mut gpu_info = Vec::new();
Expand Down Expand Up @@ -165,14 +211,14 @@ impl NvidiaGpuReader {

impl GpuReader for NvidiaGpuReader {
fn get_gpu_info(&self) -> Vec<GpuInfo> {
// Try NVML first
match Nvml::init() {
Ok(nvml) => {
// Try cached NVML handle first
match self.with_nvml(|nvml| self.get_gpu_info_nvml(nvml)) {
Ok(info) => {
// Clear any previous error status on success
if let Ok(mut status) = NVML_STATUS.lock() {
*status = None;
}
self.get_gpu_info_nvml(&nvml)
info
}
Err(e) => {
// Store the error status for notification
Expand All @@ -183,19 +229,19 @@ impl GpuReader for NvidiaGpuReader {
}

fn get_process_info(&self) -> Vec<ProcessInfo> {
// Create a lightweight system instance and only refresh what we need
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
let mut system = System::new();
// Refresh processes with user information

// Reuse the cached System instance
let mut system = self.system.lock().unwrap_or_else(|e| e.into_inner());
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get GPU processes and PIDs
let (gpu_processes, gpu_pids) = get_gpu_processes();
// Get GPU processes and PIDs using cached NVML handle
let (gpu_processes, gpu_pids) = self.get_gpu_processes_cached();

// Get all system processes
let mut all_processes = get_all_processes(&system, &gpu_pids);
Expand Down Expand Up @@ -231,19 +277,6 @@ pub fn get_nvml_status_message() -> Option<String> {
}
}

// Helper function to get GPU processes
fn get_gpu_processes() -> (Vec<ProcessInfo>, HashSet<u32>) {
// Try NVML first
match Nvml::init() {
Ok(nvml) => get_gpu_processes_nvml(&nvml),
Err(e) => {
// Store the error status for notification
set_nvml_status(e);
get_gpu_processes_nvidia_smi()
}
}
}

// Get GPU processes using NVML
fn get_gpu_processes_nvml(nvml: &Nvml) -> (Vec<ProcessInfo>, HashSet<u32>) {
let mut gpu_processes = Vec::new();
Expand Down