diff --git a/src/api/server.rs b/src/api/server.rs index 5301002..adfb9d3 100644 --- a/src/api/server.rs +++ b/src/api/server.rs @@ -115,6 +115,7 @@ pub async fn run_api_mode(args: &ApiArgs) { let gpu_readers = get_gpu_readers(); let cpu_readers = get_cpu_readers(); let memory_readers = get_memory_readers(); + let mut disks = Disks::new_with_refreshed_list(); loop { let all_gpu_info = gpu_readers .iter() @@ -140,8 +141,9 @@ pub async fn run_api_mode(args: &ApiArgs) { Vec::new() }; - // Collect disk/storage info (cached in state to avoid per-request collection) - let storage_info = collect_storage_info(); + // Refresh disk info in-place instead of creating a new Disks instance + disks.refresh(true); + let storage_info = collect_storage_info_from(&disks); let mut state = state_clone.write().await; state.gpu_info = all_gpu_info; @@ -409,14 +411,13 @@ fn cleanup_socket(path: &std::path::Path) { } } -/// Collect storage/disk information -/// This is called in the background task and cached in AppState -fn collect_storage_info() -> Vec { +/// Collect storage/disk information from a pre-existing Disks instance. +/// The caller is responsible for calling `refresh_list()` before this function. 
+fn collect_storage_info_from(disks: &Disks) -> Vec { let mut storage_info = Vec::new(); - let disks = Disks::new_with_refreshed_list(); let hostname = get_hostname(); - let mut filtered_disks = filter_docker_aware_disks(&disks); + let mut filtered_disks = filter_docker_aware_disks(disks); filtered_disks.sort_by(|a, b| { a.mount_point() .to_string_lossy() diff --git a/src/device/readers/nvidia.rs b/src/device/readers/nvidia.rs index a405eb2..86e5c0b 100644 --- a/src/device/readers/nvidia.rs +++ b/src/device/readers/nvidia.rs @@ -37,6 +37,10 @@ pub struct NvidiaGpuReader { cuda_version: OnceLock, /// Cached static device information per device index device_static_info: OnceLock>, + /// Cached NVML handle (initialized once, reused across calls) + nvml: Mutex<Option<Nvml>>, + /// Cached System instance for process info (reused across calls) + system: Mutex<System>, } impl Default for NvidiaGpuReader { @@ -51,6 +55,8 @@ impl NvidiaGpuReader { driver_version: OnceLock::new(), cuda_version: OnceLock::new(), device_static_info: OnceLock::new(), + nvml: Mutex::new(Nvml::init().ok()), + system: Mutex::new(System::new()), } } @@ -78,6 +84,35 @@ impl NvidiaGpuReader { .clone() } + /// Execute a closure with a reference to the cached NVML handle. + /// Reinitializes the handle if it was previously unavailable or became invalid.
+ fn with_nvml<F, T>(&self, f: F) -> Result<T, NvmlError> + where + F: FnOnce(&Nvml) -> T, + { + let mut guard = self.nvml.lock().map_err(|_| NvmlError::Unknown)?; + // Try to use existing handle first + if let Some(ref nvml) = *guard { + // Validate the handle is still usable by querying device count + if nvml.device_count().is_ok() { + return Ok(f(nvml)); + } + // Handle is stale, drop and reinitialize below + } + // Initialize or reinitialize + match Nvml::init() { + Ok(nvml) => { + let result = f(&nvml); + *guard = Some(nvml); + Ok(result) + } + Err(e) => { + *guard = None; + Err(e) + } + } + } + /// Get cached static device info for all devices, initializing if needed fn get_device_static_info(&self, nvml: &Nvml) -> &HashMap { self.device_static_info.get_or_init(|| { @@ -103,6 +138,17 @@ }) } + /// Get GPU processes using cached NVML handle, falling back to nvidia-smi + fn get_gpu_processes_cached(&self) -> (Vec, HashSet) { + match self.with_nvml(get_gpu_processes_nvml) { + Ok(result) => result, + Err(e) => { + set_nvml_status(e); + get_gpu_processes_nvidia_smi() + } + } + } + /// Get GPU info using NVML with cached static values fn get_gpu_info_nvml(&self, nvml: &Nvml) -> Vec { let mut gpu_info = Vec::new(); @@ -165,14 +211,14 @@ impl GpuReader for NvidiaGpuReader { fn get_gpu_info(&self) -> Vec { - // Try NVML first - match Nvml::init() { - Ok(nvml) => { + // Try cached NVML handle first + match self.with_nvml(|nvml| self.get_gpu_info_nvml(nvml)) { + Ok(info) => { // Clear any previous error status on success if let Ok(mut status) = NVML_STATUS.lock() { *status = None; } - self.get_gpu_info_nvml(&nvml) + info } Err(e) => { // Store the error status for notification @@ -183,10 +229,10 @@ impl GpuReader for NvidiaGpuReader { } fn get_process_info(&self) -> Vec { - // Create a lightweight system instance and only refresh what we need use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind}; - let mut system = System::new(); -
// Refresh processes with user information + + // Reuse the cached System instance + let mut system = self.system.lock().unwrap_or_else(|e| e.into_inner()); system.refresh_processes_specifics( ProcessesToUpdate::All, true, @@ -194,8 +240,8 @@ impl GpuReader for NvidiaGpuReader { ); system.refresh_memory(); - // Get GPU processes and PIDs - let (gpu_processes, gpu_pids) = get_gpu_processes(); + // Get GPU processes and PIDs using cached NVML handle + let (gpu_processes, gpu_pids) = self.get_gpu_processes_cached(); // Get all system processes let mut all_processes = get_all_processes(&system, &gpu_pids); @@ -231,19 +277,6 @@ pub fn get_nvml_status_message() -> Option { } } -// Helper function to get GPU processes -fn get_gpu_processes() -> (Vec, HashSet) { - // Try NVML first - match Nvml::init() { - Ok(nvml) => get_gpu_processes_nvml(&nvml), - Err(e) => { - // Store the error status for notification - set_nvml_status(e); - get_gpu_processes_nvidia_smi() - } - } -} - // Get GPU processes using NVML fn get_gpu_processes_nvml(nvml: &Nvml) -> (Vec, HashSet) { let mut gpu_processes = Vec::new();