# Test Brev Tutorial Docker Images #435
# Workflow file for this run
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Workflow display name.
name: Test Brev Tutorial Docker Images

# Manual / API trigger only. All three inputs are required strings.
on:
  workflow_dispatch:
    inputs:
      # Selects which tutorial's Docker Compose artifact is downloaded and tested.
      tutorial:
        description: 'Tutorial name to test'
        required: true
        type: string
      # Commit that receives the success/failure commit status at the end of the job.
      git_sha:
        description: 'Git commit SHA to update status for'
        required: true
        type: string
      # Run whose artifacts are downloaded by the test job.
      workflow_run_id:
        description: 'Workflow run ID to download artifacts from'
        required: true
        type: string
jobs:
  # Downloads the Docker Compose artifact for one tutorial, runs GPU/NCU
  # diagnostics, executes the tutorial's Docker Compose test, and reports the
  # outcome as a commit status on the dispatched SHA.
  test-tutorial:
    name: test-tutorial (${{ inputs.tutorial }})
    runs-on: linux-amd64-gpu-t4-latest-1
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    # Declaring `permissions` sets every scope not listed here to `none`.
    # NOTE(review): confirm that checkout, the artifact download and the GHCR
    # pull do not need `contents: read`, `actions: read` or `packages: read`
    # in this repository.
    permissions:
      statuses: write
    env:
      BUILDKIT_PROGRESS: plain
      # Quoted so the value is an explicit string rather than a YAML boolean.
      DOCKER_CLI_HINTS: "false"
    steps:
      # Prints basic facts about the runner for later debugging.
      - name: Show runner info
        run: |
          echo "Runner name: ${{ runner.name }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner arch: ${{ runner.arch }}"
          echo "Runner uname: $(uname -a)"

      - name: Checkout repository
        uses: actions/checkout@v4

      # Derives branch/SHA variables and exports them to later steps through
      # GITHUB_ENV (GIT_BRANCH_NAME, DOCKER_TAG_BRANCH, GIT_SHA, GIT_SHORT_SHA).
      - name: Set Git branch variables
        env:
          # The dispatch input is handed over as an environment variable so its
          # text is never spliced into the shell script (script injection).
          DISPATCH_GIT_SHA: ${{ inputs.git_sha }}
        run: |
          GIT_BRANCH_NAME=${GITHUB_REF#refs/heads/}
          # Sanitize branch name for Docker tags (replace invalid characters with hyphens and convert to lowercase)
          DOCKER_TAG_BRANCH=$(echo "${GIT_BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | tr '[:upper:]' '[:lower:]')
          GIT_SHA="${DISPATCH_GIT_SHA}"
          GIT_SHORT_SHA="${GIT_SHA:0:7}"
          {
            echo "GIT_BRANCH_NAME=${GIT_BRANCH_NAME}"
            echo "DOCKER_TAG_BRANCH=${DOCKER_TAG_BRANCH}"
            echo "GIT_SHA=${GIT_SHA}"
            echo "GIT_SHORT_SHA=${GIT_SHORT_SHA}"
          } >> "$GITHUB_ENV"

      # Fetches the compose artifact produced by the given build run.
      # NOTE(review): third-party action referenced by tag; consider pinning
      # it to a full commit SHA.
      - name: Download commit-specific Docker Compose artifact
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: build-brev-tutorial-docker-images.yml
          run_id: ${{ inputs.workflow_run_id }}
          name: docker-compose-${{ inputs.tutorial }}-${{ env.DOCKER_TAG_BRANCH }}-git-${{ env.GIT_SHORT_SHA }}
          path: artifacts/commit-specific/${{ inputs.tutorial }}/

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Best-effort: every command falls back to an informational echo, so a
      # missing service or process never fails the step.
      - name: Stop DCGM to allow NCU profiling
        run: |
          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
          # Stop it before running the container tests.
          echo "Stopping DCGM services..."
          # Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners)
          echo "Stopping dcgm-exporter Docker container..."
          docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
          docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
          # Stop systemd services
          sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
          # Kill any remaining dcgm processes
          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
          sudo pkill -9 dcgm || echo "No dcgm processes found"
          # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
          echo "Relaxing profiling permissions..."
          sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
          sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
          echo "DCGM services stopped and profiling permissions relaxed."

      # Diagnostics only: each probe has an `|| echo` fallback, so this step
      # reports state without failing the job.
      - name: Debug GPU and NCU configuration
        run: |
          echo "=== GPU Information ==="
          nvidia-smi || echo "nvidia-smi failed"
          echo ""
          echo "=== NVIDIA Driver Version ==="
          cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
          echo ""
          echo "=== GPU Processes ==="
          nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
          echo ""
          echo "=== DCGM/NCU Blocking Processes ==="
          ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
          echo ""
          echo "=== Systemd Services (nvidia/dcgm related) ==="
          systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
          echo ""
          echo "=== NCU on Host ==="
          which ncu 2>/dev/null && ncu --version 2>/dev/null || echo "NCU not found on host"
          echo ""
          echo "=== Profiling Permissions ==="
          cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
          cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
          echo ""
          echo "=== NVIDIA Kernel Modules ==="
          lsmod | grep nvidia || echo "No nvidia modules loaded"
          echo ""
          echo "=== /dev/nvidia* devices ==="
          ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
          echo ""
          echo "=== Docker GPU Access Test ==="
          docker run --rm --gpus all nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi 2>&1 || echo "Docker GPU test failed"
          echo ""

      # Diagnostic NCU smoke test inside the tutorial image. A failing
      # container run is reported with an echo and does not fail the step.
      - name: Test NCU inside container
        env:
          # Read from the environment rather than interpolated into the script.
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          echo "=== Testing NCU profiling inside container ==="
          # Pull the tutorial image and test NCU directly
          COMPOSE_FILE="artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"
          # Extract the image name from the compose file
          # The image is defined with a YAML anchor like: image: &image ghcr.io/...
          # We need to extract the actual URL, not the anchor reference
          IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
          if [ -z "$IMAGE" ]; then
            # Fallback: try to find any ghcr.io image reference
            IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
          fi
          if [ -z "$IMAGE" ]; then
            # Without an image name `docker run` can only fail with an
            # unrelated-looking error; skip this diagnostic step instead.
            echo "Could not determine image from $COMPOSE_FILE; skipping NCU test"
            exit 0
          fi
          echo "Testing with image: $IMAGE"
          # Create a simple test script
          cat > /tmp/test_ncu.py << 'EOF'
          from numba import cuda
          import numpy as np

          @cuda.jit
          def simple_add(a, b, c):
              i = cuda.grid(1)
              if i < a.size:
                  c[i] = a[i] + b[i]

          n = 1024
          a = np.ones(n, dtype=np.float32)
          b = np.ones(n, dtype=np.float32)
          c = np.zeros(n, dtype=np.float32)
          d_a = cuda.to_device(a)
          d_b = cuda.to_device(b)
          d_c = cuda.to_device(c)
          simple_add[4, 256](d_a, d_b, d_c)
          cuda.synchronize()
          print("Kernel executed successfully")
          EOF
          # Test NCU inside the container
          echo "--- Running NCU profiling test ---"
          docker run --rm --gpus all \
            -v /tmp/test_ncu.py:/test_ncu.py:ro \
            "$IMAGE" \
            bash -c '
              echo "NCU version:"
              ncu --version || echo "NCU not found"
              echo ""
              echo "Running simple kernel without profiling:"
              python /test_ncu.py
              echo ""
              echo "Running NCU profiling:"
              ncu --set full -o /tmp/test_profile python /test_ncu.py 2>&1
              NCU_EXIT=$?
              echo "NCU exit code: $NCU_EXIT"
              echo ""
              echo "Checking if profile was created:"
              ls -la /tmp/test_profile.ncu-rep 2>&1 || echo "Profile file not created"
              echo ""
              if [ -f /tmp/test_profile.ncu-rep ]; then
                echo "Profile file exists, testing CSV export:"
                ncu --import /tmp/test_profile.ncu-rep --csv 2>&1 | head -20
              fi
            ' 2>&1 || echo "Container NCU test failed"

      # The actual test: its exit status decides which status step runs below.
      - name: Test Docker Compose
        id: test
        env:
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          ./brev/test-docker-compose.bash "artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"

      # Posts a `success` commit status when every previous step succeeded.
      - name: Update commit status to success
        if: success()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          # Taken straight from the dispatch inputs so this step does not
          # depend on variables exported by an earlier step.
          STATUS_SHA: ${{ inputs.git_sha }}
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${STATUS_SHA}" \
            -f state='success' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests passed' \
            -f context="test / ${TUTORIAL}"

      # Posts a `failure` commit status when any previous step failed. The SHA
      # comes from the dispatch input, so the status is still posted when an
      # early step (e.g. checkout) is the one that failed.
      - name: Update commit status to failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          STATUS_SHA: ${{ inputs.git_sha }}
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${STATUS_SHA}" \
            -f state='failure' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests failed' \
            -f context="test / ${TUTORIAL}"