# Test Brev Tutorial Docker Images #435

# Manually dispatched workflow that tests one tutorial's Docker Compose setup
# on a GPU runner and reports the outcome as a commit status on `git_sha`.
#
# Inputs:
#   tutorial        - tutorial name; selects the artifact and compose file path.
#   git_sha         - commit SHA the success/failure status is posted to.
#   workflow_run_id - run ID the Docker Compose artifact is downloaded from.
name: Test Brev Tutorial Docker Images

on:
  workflow_dispatch:
    inputs:
      tutorial:
        description: 'Tutorial name to test'
        required: true
        type: string
      git_sha:
        description: 'Git commit SHA to update status for'
        required: true
        type: string
      workflow_run_id:
        description: 'Workflow run ID to download artifacts from'
        required: true
        type: string

jobs:
  test-tutorial:
    name: test-tutorial (${{ inputs.tutorial }})
    runs-on: linux-amd64-gpu-t4-latest-1
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    permissions:
      # Required by the final steps, which POST a commit status.
      statuses: write
    env:
      BUILDKIT_PROGRESS: plain
      # Quoted so the value is explicitly the string "false".
      DOCKER_CLI_HINTS: "false"
    steps:
      - name: Show runner info
        run: |
          echo "Runner name: ${{ runner.name }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner arch: ${{ runner.arch }}"
          echo "Runner uname: $(uname -a)"

      - name: Checkout repository
        uses: actions/checkout@v4

      # Exports GIT_BRANCH_NAME, DOCKER_TAG_BRANCH, GIT_SHA and GIT_SHORT_SHA
      # to later steps via $GITHUB_ENV.
      - name: Set Git branch variables
        env:
          # User-supplied input is passed through the environment instead of
          # being expanded into the script text, to avoid shell injection.
          DISPATCH_GIT_SHA: ${{ inputs.git_sha }}
        run: |
          GIT_BRANCH_NAME=${GITHUB_REF#refs/heads/}
          # Sanitize branch name for Docker tags (replace invalid characters with hyphens and convert to lowercase)
          DOCKER_TAG_BRANCH=$(echo "${GIT_BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | tr '[:upper:]' '[:lower:]')
          GIT_SHA="${DISPATCH_GIT_SHA}"
          GIT_SHORT_SHA=${GIT_SHA:0:7}
          echo "GIT_BRANCH_NAME=${GIT_BRANCH_NAME}" >> "$GITHUB_ENV"
          echo "DOCKER_TAG_BRANCH=${DOCKER_TAG_BRANCH}" >> "$GITHUB_ENV"
          echo "GIT_SHA=${GIT_SHA}" >> "$GITHUB_ENV"
          echo "GIT_SHORT_SHA=${GIT_SHORT_SHA}" >> "$GITHUB_ENV"

      - name: Download commit-specific Docker Compose artifact
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: build-brev-tutorial-docker-images.yml
          run_id: ${{ inputs.workflow_run_id }}
          name: docker-compose-${{ inputs.tutorial }}-${{ env.DOCKER_TAG_BRANCH }}-git-${{ env.GIT_SHORT_SHA }}
          path: artifacts/commit-specific/${{ inputs.tutorial }}/

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Every command below is best-effort: a failure is reported with an echo
      # and does not fail the step.
      - name: Stop DCGM to allow NCU profiling
        run: |
          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
          # Stop it before running the container tests.
          echo "Stopping DCGM services..."
          # Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners)
          echo "Stopping dcgm-exporter Docker container..."
          docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
          docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
          # Stop systemd services
          sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
          # Kill any remaining dcgm processes
          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
          sudo pkill -9 dcgm || echo "No dcgm processes found"
          # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
          echo "Relaxing profiling permissions..."
          sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
          sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
          echo "DCGM services stopped and profiling permissions relaxed."

      # Diagnostics only: each probe falls back to an echo, so this step does
      # not fail when a tool or file is missing.
      - name: Debug GPU and NCU configuration
        run: |
          echo "=== GPU Information ==="
          nvidia-smi || echo "nvidia-smi failed"
          echo ""
          echo "=== NVIDIA Driver Version ==="
          cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
          echo ""
          echo "=== GPU Processes ==="
          nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
          echo ""
          echo "=== DCGM/NCU Blocking Processes ==="
          ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
          echo ""
          echo "=== Systemd Services (nvidia/dcgm related) ==="
          systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
          echo ""
          echo "=== NCU on Host ==="
          which ncu 2>/dev/null && ncu --version 2>/dev/null || echo "NCU not found on host"
          echo ""
          echo "=== Profiling Permissions ==="
          cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
          cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
          echo ""
          echo "=== NVIDIA Kernel Modules ==="
          lsmod | grep nvidia || echo "No nvidia modules loaded"
          echo ""
          echo "=== /dev/nvidia* devices ==="
          ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
          echo ""
          echo "=== Docker GPU Access Test ==="
          docker run --rm --gpus all nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi 2>&1 || echo "Docker GPU test failed"
          echo ""

      # Runs a small Numba CUDA kernel under `ncu` inside the tutorial image.
      # A failing container run is reported with an echo and does not fail
      # the step.
      - name: Test NCU inside container
        env:
          # Passed via the environment rather than expanded into the script.
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          echo "=== Testing NCU profiling inside container ==="
          # Pull the tutorial image and test NCU directly
          COMPOSE_FILE="artifacts/commit-specific/${TUTORIAL_NAME}/brev/docker-compose.yml"
          # Extract the image name from the compose file
          # The image is defined with a YAML anchor like: image: &image ghcr.io/...
          # We need to extract the actual URL, not the anchor reference
          IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
          if [ -z "$IMAGE" ]; then
            # Fallback: try to find any ghcr.io image reference
            IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
          fi
          echo "Testing with image: $IMAGE"
          # Create a simple test script
          cat > /tmp/test_ncu.py << 'EOF'
          from numba import cuda
          import numpy as np

          @cuda.jit
          def simple_add(a, b, c):
              i = cuda.grid(1)
              if i < a.size:
                  c[i] = a[i] + b[i]

          n = 1024
          a = np.ones(n, dtype=np.float32)
          b = np.ones(n, dtype=np.float32)
          c = np.zeros(n, dtype=np.float32)
          d_a = cuda.to_device(a)
          d_b = cuda.to_device(b)
          d_c = cuda.to_device(c)
          simple_add[4, 256](d_a, d_b, d_c)
          cuda.synchronize()
          print("Kernel executed successfully")
          EOF
          # Test NCU inside the container
          echo "--- Running NCU profiling test ---"
          docker run --rm --gpus all \
            -v /tmp/test_ncu.py:/test_ncu.py:ro \
            "$IMAGE" \
            bash -c '
              echo "NCU version:"
              ncu --version || echo "NCU not found"
              echo ""
              echo "Running simple kernel without profiling:"
              python /test_ncu.py
              echo ""
              echo "Running NCU profiling:"
              ncu --set full -o /tmp/test_profile python /test_ncu.py 2>&1
              NCU_EXIT=$?
              echo "NCU exit code: $NCU_EXIT"
              echo ""
              echo "Checking if profile was created:"
              ls -la /tmp/test_profile.ncu-rep 2>&1 || echo "Profile file not created"
              echo ""
              if [ -f /tmp/test_profile.ncu-rep ]; then
                echo "Profile file exists, testing CSV export:"
                ncu --import /tmp/test_profile.ncu-rep --csv 2>&1 | head -20
              fi
            ' 2>&1 || echo "Container NCU test failed"

      # The result of this step decides which of the two status steps runs.
      - name: Test Docker Compose
        id: test
        env:
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          ./brev/test-docker-compose.bash "artifacts/commit-specific/${TUTORIAL_NAME}/brev/docker-compose.yml"

      # GIT_SHA comes from the "Set Git branch variables" step via $GITHUB_ENV.
      - name: Update commit status to success
        if: success()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${GIT_SHA}" \
            -f state='success' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests passed' \
            -f context="test / ${TUTORIAL_NAME}"

      - name: Update commit status to failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${GIT_SHA}" \
            -f state='failure' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests failed' \
            -f context="test / ${TUTORIAL_NAME}"