7272 # Stop it before running the container tests.
7373 echo "Stopping DCGM services..."
7474
75- # Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners)
75+ # Stop the dcgm-exporter Docker container
7676 echo "Stopping dcgm-exporter Docker container..."
7777 docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
7878 docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
@@ -116,10 +116,6 @@ jobs:
116116 systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
117117 echo ""
118118
119- echo "=== NCU on Host ==="
120- which ncu 2>/dev/null && ncu --version 2>/dev/null || echo "NCU not found on host"
121- echo ""
122-
123119 echo "=== Profiling Permissions ==="
124120 cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
125121 cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
@@ -133,77 +129,6 @@ jobs:
133129 ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
134130 echo ""
135131
136- echo "=== Docker GPU Access Test ==="
137- docker run --rm --gpus all nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi 2>&1 || echo "Docker GPU test failed"
138- echo ""
139-
140- - name : Test NCU inside container
141- run : |
142- echo "=== Testing NCU profiling inside container ==="
143- # Pull the tutorial image and test NCU directly
144- COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml"
145-
146- # Extract the image name from the compose file
147- # The image is defined with a YAML anchor like: image: &image ghcr.io/...
148- # We need to extract the actual URL, not the anchor reference
149- IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
150- if [ -z "$IMAGE" ]; then
151- # Fallback: try to find any ghcr.io image reference
152- IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
153- fi
154- echo "Testing with image: $IMAGE"
155-
156- # Create a simple test script
157- cat > /tmp/test_ncu.py << 'EOF'
158- from numba import cuda
159- import numpy as np
160-
161- @cuda.jit
162- def simple_add(a, b, c):
163- i = cuda.grid(1)
164- if i < a.size:
165- c[i] = a[i] + b[i]
166-
167- n = 1024
168- a = np.ones(n, dtype=np.float32)
169- b = np.ones(n, dtype=np.float32)
170- c = np.zeros(n, dtype=np.float32)
171-
172- d_a = cuda.to_device(a)
173- d_b = cuda.to_device(b)
174- d_c = cuda.to_device(c)
175-
176- simple_add[4, 256](d_a, d_b, d_c)
177- cuda.synchronize()
178- print("Kernel executed successfully")
179- EOF
180-
181- # Test NCU inside the container
182- echo "--- Running NCU profiling test ---"
183- docker run --rm --gpus all \
184- -v /tmp/test_ncu.py:/test_ncu.py:ro \
185- "$IMAGE" \
186- bash -c '
187- echo "NCU version:"
188- ncu --version || echo "NCU not found"
189- echo ""
190- echo "Running simple kernel without profiling:"
191- python /test_ncu.py
192- echo ""
193- echo "Running NCU profiling:"
194- ncu --set full -o /tmp/test_profile python /test_ncu.py 2>&1
195- NCU_EXIT=$?
196- echo "NCU exit code: $NCU_EXIT"
197- echo ""
198- echo "Checking if profile was created:"
199- ls -la /tmp/test_profile.ncu-rep 2>&1 || echo "Profile file not created"
200- echo ""
201- if [ -f /tmp/test_profile.ncu-rep ]; then
202- echo "Profile file exists, testing CSV export:"
203- ncu --import /tmp/test_profile.ncu-rep --csv 2>&1 | head -20
204- fi
205- ' 2>&1 || echo "Container NCU test failed"
206-
207132 - name : Test Docker Compose
208133 id : test
209134 run : |
0 commit comments