From c5b7d8da6e44b64eb292cbd33da92fb65bb0662d Mon Sep 17 00:00:00 2001
From: Jeongkyu Shin
Date: Fri, 6 Feb 2026 13:22:45 +0900
Subject: [PATCH] fix: prevent file descriptor leak in API mode by reusing
 resource handles

Cache Nvml, System, and Disks instances instead of recreating them on
every metrics collection iteration, preventing FD exhaustion in
long-running API server processes.

- Store Nvml handle in NvidiaGpuReader with graceful reinit on failure
- Reuse sysinfo::System across get_process_info() calls
- Create Disks once before the API loop and refresh in-place each cycle
- Remove now-unused standalone get_gpu_processes() function

Closes #118
---
 src/api/server.rs            | 15 +++----
 src/device/readers/nvidia.rs | 77 +++++++++++++++++++++++++-----------
 2 files changed, 63 insertions(+), 29 deletions(-)

diff --git a/src/api/server.rs b/src/api/server.rs
index 53010024..adfb9d3f 100644
--- a/src/api/server.rs
+++ b/src/api/server.rs
@@ -115,6 +115,7 @@ pub async fn run_api_mode(args: &ApiArgs) {
     let gpu_readers = get_gpu_readers();
     let cpu_readers = get_cpu_readers();
     let memory_readers = get_memory_readers();
+    let mut disks = Disks::new_with_refreshed_list();
     loop {
         let all_gpu_info = gpu_readers
             .iter()
@@ -140,8 +141,9 @@
             Vec::new()
         };

-        // Collect disk/storage info (cached in state to avoid per-request collection)
-        let storage_info = collect_storage_info();
+        // Refresh disk info in-place instead of creating a new Disks instance
+        disks.refresh(true);
+        let storage_info = collect_storage_info_from(&disks);

         let mut state = state_clone.write().await;
         state.gpu_info = all_gpu_info;
@@ -409,14 +411,13 @@ fn cleanup_socket(path: &std::path::Path) {
     }
 }

-/// Collect storage/disk information
-/// This is called in the background task and cached in AppState
-fn collect_storage_info() -> Vec {
+/// Collect storage/disk information from a pre-existing Disks instance.
+/// The caller is responsible for calling `refresh_list()` before this function.
+fn collect_storage_info_from(disks: &Disks) -> Vec {
     let mut storage_info = Vec::new();
-    let disks = Disks::new_with_refreshed_list();
     let hostname = get_hostname();

-    let mut filtered_disks = filter_docker_aware_disks(&disks);
+    let mut filtered_disks = filter_docker_aware_disks(disks);
     filtered_disks.sort_by(|a, b| {
         a.mount_point()
             .to_string_lossy()
diff --git a/src/device/readers/nvidia.rs b/src/device/readers/nvidia.rs
index a405eb2a..86e5c0b9 100644
--- a/src/device/readers/nvidia.rs
+++ b/src/device/readers/nvidia.rs
@@ -37,6 +37,10 @@ pub struct NvidiaGpuReader {
     cuda_version: OnceLock,
     /// Cached static device information per device index
     device_static_info: OnceLock>,
+    /// Cached NVML handle (initialized once, reused across calls)
+    nvml: Mutex<Option<Nvml>>,
+    /// Cached System instance for process info (reused across calls)
+    system: Mutex<System>,
 }

 impl Default for NvidiaGpuReader {
@@ -51,6 +55,8 @@ impl NvidiaGpuReader {
             driver_version: OnceLock::new(),
             cuda_version: OnceLock::new(),
             device_static_info: OnceLock::new(),
+            nvml: Mutex::new(Nvml::init().ok()),
+            system: Mutex::new(System::new()),
         }
     }

@@ -78,6 +84,35 @@
             .clone()
     }

+    /// Execute a closure with a reference to the cached NVML handle.
+    /// Reinitializes the handle if it was previously unavailable or became invalid.
+    fn with_nvml<T, F>(&self, f: F) -> Result<T, NvmlError>
+    where
+        F: FnOnce(&Nvml) -> T,
+    {
+        let mut guard = self.nvml.lock().map_err(|_| NvmlError::Unknown)?;
+        // Try to use existing handle first
+        if let Some(ref nvml) = *guard {
+            // Validate the handle is still usable by querying device count
+            if nvml.device_count().is_ok() {
+                return Ok(f(nvml));
+            }
+            // Handle is stale, drop and reinitialize below
+        }
+        // Initialize or reinitialize
+        match Nvml::init() {
+            Ok(nvml) => {
+                let result = f(&nvml);
+                *guard = Some(nvml);
+                Ok(result)
+            }
+            Err(e) => {
+                *guard = None;
+                Err(e)
+            }
+        }
+    }
+
     /// Get cached static device info for all devices, initializing if needed
     fn get_device_static_info(&self, nvml: &Nvml) -> &HashMap {
         self.device_static_info.get_or_init(|| {
@@ -103,6 +138,17 @@
         })
     }

+    /// Get GPU processes using cached NVML handle, falling back to nvidia-smi
+    fn get_gpu_processes_cached(&self) -> (Vec, HashSet) {
+        match self.with_nvml(get_gpu_processes_nvml) {
+            Ok(result) => result,
+            Err(e) => {
+                set_nvml_status(e);
+                get_gpu_processes_nvidia_smi()
+            }
+        }
+    }
+
     /// Get GPU info using NVML with cached static values
     fn get_gpu_info_nvml(&self, nvml: &Nvml) -> Vec {
         let mut gpu_info = Vec::new();
@@ -165,14 +211,14 @@ impl GpuReader for NvidiaGpuReader {
     fn get_gpu_info(&self) -> Vec {
-        // Try NVML first
-        match Nvml::init() {
-            Ok(nvml) => {
+        // Try cached NVML handle first
+        match self.with_nvml(|nvml| self.get_gpu_info_nvml(nvml)) {
+            Ok(info) => {
                 // Clear any previous error status on success
                 if let Ok(mut status) = NVML_STATUS.lock() {
                     *status = None;
                 }
-                self.get_gpu_info_nvml(&nvml)
+                info
             }
             Err(e) => {
                 // Store the error status for notification
@@ -183,10 +229,10 @@
     }

     fn get_process_info(&self) -> Vec {
-        // Create a lightweight system instance and only refresh what we need
         use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
-        let mut system = System::new();
-
         // Refresh processes with user information
+
+        // Reuse the cached System instance
+        let mut system = self.system.lock().unwrap_or_else(|e| e.into_inner());
         system.refresh_processes_specifics(
             ProcessesToUpdate::All,
             true,
@@ -194,8 +240,8 @@
         );
         system.refresh_memory();

-        // Get GPU processes and PIDs
-        let (gpu_processes, gpu_pids) = get_gpu_processes();
+        // Get GPU processes and PIDs using cached NVML handle
+        let (gpu_processes, gpu_pids) = self.get_gpu_processes_cached();

         // Get all system processes
         let mut all_processes = get_all_processes(&system, &gpu_pids);
@@ -231,19 +277,6 @@ pub fn get_nvml_status_message() -> Option<String> {
     }
 }

-// Helper function to get GPU processes
-fn get_gpu_processes() -> (Vec, HashSet) {
-    // Try NVML first
-    match Nvml::init() {
-        Ok(nvml) => get_gpu_processes_nvml(&nvml),
-        Err(e) => {
-            // Store the error status for notification
-            set_nvml_status(e);
-            get_gpu_processes_nvidia_smi()
-        }
-    }
-}
-
 // Get GPU processes using NVML
 fn get_gpu_processes_nvml(nvml: &Nvml) -> (Vec, HashSet) {
     let mut gpu_processes = Vec::new();