Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 13 additions & 16 deletions src/device/readers/nvidia.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,14 +18,13 @@ use crate::device::process_list::{get_all_processes, merge_gpu_processes};
use crate::device::readers::common_cache::{DetailBuilder, DeviceStaticInfo, MAX_DEVICES};
use crate::device::types::{GpuInfo, ProcessInfo};
use crate::device::GpuReader;
use crate::utils::get_hostname;
use crate::utils::{get_hostname, with_global_system};
use chrono::Local;
use nvml_wrapper::enums::device::UsedGpuMemory;
use nvml_wrapper::error::NvmlError;
use nvml_wrapper::{cuda_driver_version_major, cuda_driver_version_minor, Nvml};
use std::collections::{HashMap, HashSet};
use std::sync::{Mutex, OnceLock};
use sysinfo::System;

// Global status for NVML error messages
static NVML_STATUS: Mutex<Option<String>> = Mutex::new(None);
Expand All @@ -39,8 +38,6 @@ pub struct NvidiaGpuReader {
device_static_info: OnceLock<HashMap<u32, DeviceStaticInfo>>,
/// Cached NVML handle (initialized once, reused across calls)
nvml: Mutex<Option<Nvml>>,
/// Cached System instance for process info (reused across calls)
system: Mutex<System>,
}

impl Default for NvidiaGpuReader {
Expand All @@ -56,7 +53,6 @@ impl NvidiaGpuReader {
cuda_version: OnceLock::new(),
device_static_info: OnceLock::new(),
nvml: Mutex::new(Nvml::init().ok()),
system: Mutex::new(System::new()),
}
}

Expand Down Expand Up @@ -231,20 +227,21 @@ impl GpuReader for NvidiaGpuReader {
fn get_process_info(&self) -> Vec<ProcessInfo> {
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};

// Reuse the cached System instance
let mut system = self.system.lock().unwrap_or_else(|e| e.into_inner());
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get GPU processes and PIDs using cached NVML handle
let (gpu_processes, gpu_pids) = self.get_gpu_processes_cached();

// Get all system processes
let mut all_processes = get_all_processes(&system, &gpu_pids);
// Use global system instance to avoid file descriptor leak
let mut all_processes = with_global_system(|system| {
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get all system processes
get_all_processes(system, &gpu_pids)
});

// Merge GPU information into the process list
merge_gpu_processes(&mut all_processes, gpu_processes);
Expand Down
93 changes: 47 additions & 46 deletions src/device/readers/nvidia_jetson.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,11 @@ use crate::device::process_list::{get_all_processes, merge_gpu_processes};
use crate::device::readers::common_cache::{DetailBuilder, DeviceStaticInfo};
use crate::device::types::{GpuInfo, ProcessInfo};
use crate::device::GpuReader;
use crate::utils::{get_hostname, hz_to_mhz, millicelsius_to_celsius};
use crate::utils::{get_hostname, hz_to_mhz, millicelsius_to_celsius, with_global_system};
use chrono::Local;
use std::collections::HashSet;
use std::fs;
use std::sync::OnceLock;
use sysinfo::System;

pub struct NvidiaJetsonGpuReader {
/// Cached static device information (fetched only once)
Expand Down Expand Up @@ -171,22 +170,23 @@ impl GpuReader for NvidiaJetsonGpuReader {
}

fn get_process_info(&self) -> Vec<ProcessInfo> {
// Create a lightweight system instance and only refresh what we need
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
let mut system = System::new();
// Refresh processes with user information
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get GPU processes and PIDs
let (gpu_processes, gpu_pids) = get_gpu_processes();

// Get all system processes
let mut all_processes = get_all_processes(&system, &gpu_pids);
// Use global system instance to avoid file descriptor leak
let mut all_processes = with_global_system(|system| {
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get all system processes
get_all_processes(system, &gpu_pids)
});

// Merge GPU information into the process list
merge_gpu_processes(&mut all_processes, gpu_processes);
Expand Down Expand Up @@ -258,41 +258,42 @@ fn get_gpu_processes() -> (Vec<ProcessInfo>, HashSet<u32>) {
"cuda",
];

let mut system = System::new();
system.refresh_memory();
for (pid, process) in system.processes() {
let process_name = process.name().to_string_lossy().to_lowercase();
for gpu_name in &gpu_process_names {
if process_name.contains(gpu_name) {
let pid_u32 = pid.as_u32();
gpu_pids.insert(pid_u32);

gpu_processes.push(ProcessInfo {
device_id: 0,
device_uuid: "JetsonGPU".to_string(),
pid: pid_u32,
process_name: String::new(), // Will be filled by sysinfo
used_memory: 0, // Can't determine GPU memory usage without nvidia-smi
cpu_percent: 0.0, // Will be filled by sysinfo
memory_percent: 0.0, // Will be filled by sysinfo
memory_rss: 0, // Will be filled by sysinfo
memory_vms: 0, // Will be filled by sysinfo
user: String::new(), // Will be filled by sysinfo
state: String::new(), // Will be filled by sysinfo
start_time: String::new(), // Will be filled by sysinfo
cpu_time: 0, // Will be filled by sysinfo
command: String::new(), // Will be filled by sysinfo
ppid: 0, // Will be filled by sysinfo
threads: 0, // Will be filled by sysinfo
uses_gpu: true,
priority: 0, // Will be filled by sysinfo
nice_value: 0, // Will be filled by sysinfo
gpu_utilization: 0.0, // Can't determine per-process GPU utilization
});
break;
with_global_system(|system| {
system.refresh_memory();
for (pid, process) in system.processes() {
let process_name = process.name().to_string_lossy().to_lowercase();
for gpu_name in &gpu_process_names {
if process_name.contains(gpu_name) {
let pid_u32 = pid.as_u32();
gpu_pids.insert(pid_u32);

gpu_processes.push(ProcessInfo {
device_id: 0,
device_uuid: "JetsonGPU".to_string(),
pid: pid_u32,
process_name: String::new(), // Will be filled by sysinfo
used_memory: 0, // Can't determine GPU memory usage without nvidia-smi
cpu_percent: 0.0, // Will be filled by sysinfo
memory_percent: 0.0, // Will be filled by sysinfo
memory_rss: 0, // Will be filled by sysinfo
memory_vms: 0, // Will be filled by sysinfo
user: String::new(), // Will be filled by sysinfo
state: String::new(), // Will be filled by sysinfo
start_time: String::new(), // Will be filled by sysinfo
cpu_time: 0, // Will be filled by sysinfo
command: String::new(), // Will be filled by sysinfo
ppid: 0, // Will be filled by sysinfo
threads: 0, // Will be filled by sysinfo
uses_gpu: true,
priority: 0, // Will be filled by sysinfo
nice_value: 0, // Will be filled by sysinfo
gpu_utilization: 0.0, // Can't determine per-process GPU utilization
});
break;
}
}
}
}
});
}

(gpu_processes, gpu_pids)
Expand Down
25 changes: 13 additions & 12 deletions src/device/readers/tenstorrent.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use crate::device::process_list::{get_all_processes, merge_gpu_processes};
use crate::device::readers::common_cache::{DetailBuilder, DeviceStaticInfo};
use crate::device::types::{GpuInfo, ProcessInfo};
use crate::device::GpuReader;
use crate::utils::get_hostname;
use crate::utils::{get_hostname, with_global_system};
use all_smi_luwen_core;
use all_smi_luwen_if::chip::{Chip, ChipImpl, Telemetry};
use all_smi_luwen_if::ChipDetectOptions;
Expand All @@ -25,7 +25,6 @@ use chrono::Local;
use once_cell::sync::Lazy;
use std::collections::{HashMap, HashSet};
use std::sync::Mutex;
use sysinfo::System;

/// Collection method for Tenstorrent NPU metrics
#[derive(Debug, Clone, Copy)]
Expand Down Expand Up @@ -196,21 +195,23 @@ impl GpuReader for TenstorrentReader {
}

fn get_process_info(&self) -> Vec<ProcessInfo> {
// Create system instance and refresh processes
use sysinfo::{ProcessRefreshKind, ProcessesToUpdate, UpdateKind};
let mut system = System::new();
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get NPU processes (currently empty for Tenstorrent)
let (npu_processes, npu_pids) = self.get_npu_processes();

// Get all system processes
let mut all_processes = get_all_processes(&system, &npu_pids);
// Use global system instance to avoid file descriptor leak
let mut all_processes = with_global_system(|system| {
system.refresh_processes_specifics(
ProcessesToUpdate::All,
true,
ProcessRefreshKind::everything().with_user(UpdateKind::Always),
);
system.refresh_memory();

// Get all system processes
get_all_processes(system, &npu_pids)
});

// Merge NPU information
merge_gpu_processes(&mut all_processes, npu_processes);
Expand Down