@@ -71,14 +71,28 @@ jobs:
7171 # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
7272 # Stop it before running the container tests.
7373 echo "Stopping DCGM services..."
74+
75+ # Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners)
76+ echo "Stopping dcgm-exporter Docker container..."
77+ docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
78+ docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
79+
80+ # Stop systemd services
81+ sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
7482 sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
7583 sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
76- # Also try nv-hostengine which DCGM uses
7784 sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
85+
7886 # Kill any remaining dcgm processes
7987 sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
8088 sudo pkill -9 dcgm || echo "No dcgm processes found"
81- echo "DCGM services stopped."
89+
90+ # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
91+ echo "Relaxing profiling permissions..."
92+ sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
93+ sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
94+
95+ echo "DCGM services stopped and profiling permissions relaxed."
8296
8397 - name : Debug GPU and NCU configuration
8498 run : |
0 commit comments