Skip to content

Commit edcd809

Browse files
committed
CI: Properly kill DCGM and fix permissions on NV GHA runners.
1 parent c8668c1 commit edcd809

File tree

1 file changed

+16
-2
lines changed

1 file changed

+16
-2
lines changed

.github/workflows/test-brev-tutorial-docker-images.yml

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -71,14 +71,28 @@ jobs:
7171
# DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
7272
# Stop it before running the container tests.
7373
echo "Stopping DCGM services..."
74+
75+
# Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners)
76+
echo "Stopping dcgm-exporter Docker container..."
77+
if docker stop dcgm-exporter 2>/dev/null; then echo "Stopped dcgm-exporter container"; else echo "dcgm-exporter container not running"; fi
78+
docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
79+
80+
# Stop systemd services
81+
sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
7482
sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
7583
sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
76-
# Also try nv-hostengine which DCGM uses
7784
sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
85+
7886
# Kill any remaining dcgm processes
7987
sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
8088
sudo pkill -9 dcgm || echo "No dcgm processes found"
81-
echo "DCGM services stopped."
89+
90+
# Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
91+
echo "Relaxing profiling permissions..."
92+
sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
93+
sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
94+
95+
echo "DCGM services stopped and profiling permissions relaxed."
8296
8397
- name: Debug GPU and NCU configuration
8498
run: |

0 commit comments

Comments
 (0)