Skip to content

Commit c91bcf2

Browse files
committed
CI: Disable NCU sanity test that is improperly implemented.
1 parent cd21d3d commit c91bcf2

File tree

1 file changed

+1
-76
lines changed

1 file changed

+1
-76
lines changed

.github/workflows/test-brev-tutorial-docker-images.yml

Lines changed: 1 addition & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -72,7 +72,7 @@ jobs:
72 72
# Stop it before running the container tests.
73 73
echo "Stopping DCGM services..."
74 74
75-
# Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners)
75+
# Stop the dcgm-exporter Docker container
76 76
echo "Stopping dcgm-exporter Docker container..."
77 77
docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
78 78
docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
@@ -116,10 +116,6 @@ jobs:
116 116
systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
117 117
echo ""
118 118
119-
echo "=== NCU on Host ==="
120-
which ncu 2>/dev/null && ncu --version 2>/dev/null || echo "NCU not found on host"
121-
echo ""
122-
123 119
echo "=== Profiling Permissions ==="
124 120
cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
125 121
cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
@@ -133,77 +129,6 @@ jobs:
133 129
ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
134 130
echo ""
135 131
136-
echo "=== Docker GPU Access Test ==="
137-
docker run --rm --gpus all nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi 2>&1 || echo "Docker GPU test failed"
138-
echo ""
139-
140-
- name: Test NCU inside container
141-
run: |
142-
echo "=== Testing NCU profiling inside container ==="
143-
# Pull the tutorial image and test NCU directly
144-
COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml"
145-
146-
# Extract the image name from the compose file
147-
# The image is defined with a YAML anchor like: image: &image ghcr.io/...
148-
# We need to extract the actual URL, not the anchor reference
149-
IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
150-
if [ -z "$IMAGE" ]; then
151-
# Fallback: try to find any ghcr.io image reference
152-
IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
153-
fi
154-
echo "Testing with image: $IMAGE"
155-
156-
# Create a simple test script
157-
cat > /tmp/test_ncu.py << 'EOF'
158-
from numba import cuda
159-
import numpy as np
160-
161-
@cuda.jit
162-
def simple_add(a, b, c):
163-
i = cuda.grid(1)
164-
if i < a.size:
165-
c[i] = a[i] + b[i]
166-
167-
n = 1024
168-
a = np.ones(n, dtype=np.float32)
169-
b = np.ones(n, dtype=np.float32)
170-
c = np.zeros(n, dtype=np.float32)
171-
172-
d_a = cuda.to_device(a)
173-
d_b = cuda.to_device(b)
174-
d_c = cuda.to_device(c)
175-
176-
simple_add[4, 256](d_a, d_b, d_c)
177-
cuda.synchronize()
178-
print("Kernel executed successfully")
179-
EOF
180-
181-
# Test NCU inside the container
182-
echo "--- Running NCU profiling test ---"
183-
docker run --rm --gpus all \
184-
-v /tmp/test_ncu.py:/test_ncu.py:ro \
185-
"$IMAGE" \
186-
bash -c '
187-
echo "NCU version:"
188-
ncu --version || echo "NCU not found"
189-
echo ""
190-
echo "Running simple kernel without profiling:"
191-
python /test_ncu.py
192-
echo ""
193-
echo "Running NCU profiling:"
194-
ncu --set full -o /tmp/test_profile python /test_ncu.py 2>&1
195-
NCU_EXIT=$?
196-
echo "NCU exit code: $NCU_EXIT"
197-
echo ""
198-
echo "Checking if profile was created:"
199-
ls -la /tmp/test_profile.ncu-rep 2>&1 || echo "Profile file not created"
200-
echo ""
201-
if [ -f /tmp/test_profile.ncu-rep ]; then
202-
echo "Profile file exists, testing CSV export:"
203-
ncu --import /tmp/test_profile.ncu-rep --csv 2>&1 | head -20
204-
fi
205-
' 2>&1 || echo "Container NCU test failed"
206-
207 132
- name: Test Docker Compose
208 133
id: test
209 134
run: |

0 commit comments

Comments
 (0)