# Test Brev Tutorial Docker Images #500
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Runs the Docker Compose test script for one tutorial on the
# `linux-amd64-gpu-l4-latest-1` runner and posts the outcome to the given
# commit as a commit status. Triggered through `workflow_dispatch` with the
# tutorial name, the commit SHA to update, and the ID of the workflow run
# whose Docker Compose artifact should be tested.
name: Test Brev Tutorial Docker Images

on:
  workflow_dispatch:
    inputs:
      tutorial:
        description: 'Tutorial name to test'
        required: true
        type: string
      git_sha:
        description: 'Git commit SHA to update status for'
        required: true
        type: string
      workflow_run_id:
        description: 'Workflow run ID to download artifacts from'
        required: true
        type: string

jobs:
  test-tutorial:
    name: test-tutorial (${{ inputs.tutorial }})
    runs-on: linux-amd64-gpu-l4-latest-1
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    # Declaring any permission sets every unlisted scope to `none`, so the
    # read scopes the steps below rely on are listed explicitly.
    permissions:
      actions: read  # download the artifact produced by another workflow run
      contents: read  # actions/checkout
      packages: read  # pull images from ghcr.io with GITHUB_TOKEN
      statuses: write  # post the commit status
    env:
      BUILDKIT_PROGRESS: plain
      # Environment values are strings; quote so YAML does not type it as bool.
      DOCKER_CLI_HINTS: 'false'
    steps:
      - name: Show runner info
        run: |
          echo "Runner name: ${RUNNER_NAME}"
          echo "Runner OS: ${RUNNER_OS}"
          echo "Runner arch: ${RUNNER_ARCH}"
          echo "Runner uname: $(uname -a)"

      # NOTE(review): this checks out the ref the workflow was dispatched on,
      # not `inputs.git_sha` — confirm that is intended.
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set Git branch variables
        env:
          # Inputs are passed through the environment instead of being
          # expanded into the script text, which would allow shell injection.
          INPUT_GIT_SHA: ${{ inputs.git_sha }}
        run: |
          GIT_BRANCH_NAME="${GITHUB_REF#refs/heads/}"
          # Sanitize branch name for Docker tags (replace invalid characters with hyphens and convert to lowercase)
          DOCKER_TAG_BRANCH=$(echo "${GIT_BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | tr '[:upper:]' '[:lower:]')
          GIT_SHA="${INPUT_GIT_SHA}"
          # Reject anything that is not a hex SHA before it is written to
          # GITHUB_ENV; a multi-line value could otherwise define extra variables.
          if ! [[ "${GIT_SHA}" =~ ^[0-9a-fA-F]{7,64}$ ]]; then
            echo "::error::git_sha input is not a hexadecimal commit SHA"
            exit 1
          fi
          GIT_SHORT_SHA="${GIT_SHA:0:7}"
          {
            echo "GIT_BRANCH_NAME=${GIT_BRANCH_NAME}"
            echo "DOCKER_TAG_BRANCH=${DOCKER_TAG_BRANCH}"
            echo "GIT_SHA=${GIT_SHA}"
            echo "GIT_SHORT_SHA=${GIT_SHORT_SHA}"
          } >> "$GITHUB_ENV"

      - name: Download commit-specific Docker Compose artifact
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: build-brev-tutorial-docker-images.yml
          run_id: ${{ inputs.workflow_run_id }}
          name: docker-compose-${{ inputs.tutorial }}-${{ env.DOCKER_TAG_BRANCH }}-git-${{ env.GIT_SHORT_SHA }}
          path: artifacts/commit-specific/${{ inputs.tutorial }}/

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Every command here is best-effort: each failure is reported with an
      # `echo` fallback so a missing service or container does not fail the job.
      - name: Stop DCGM to allow NCU profiling
        run: |
          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
          # Stop it before running the container tests.
          echo "Stopping DCGM services..."
          # Stop the dcgm-exporter Docker container
          echo "Stopping dcgm-exporter Docker container..."
          docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
          docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
          # Stop systemd services
          sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
          # Kill any remaining dcgm processes
          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
          sudo pkill -9 dcgm || echo "No dcgm processes found"
          # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
          echo "Relaxing profiling permissions..."
          sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
          sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
          echo "DCGM services stopped and profiling permissions relaxed."

      # Diagnostics only: every probe has an `echo` fallback, so this step
      # prints what it can and never fails the job.
      - name: Debug GPU and NCU configuration
        run: |
          echo "=== GPU Information ==="
          nvidia-smi || echo "nvidia-smi failed"
          echo ""
          echo "=== NVIDIA Driver Version ==="
          cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
          echo ""
          echo "=== GPU Processes ==="
          nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
          echo ""
          echo "=== DCGM/NCU Blocking Processes ==="
          ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
          echo ""
          echo "=== Systemd Services (nvidia/dcgm related) ==="
          systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
          echo ""
          echo "=== Profiling Permissions ==="
          cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
          cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
          echo ""
          echo "=== NVIDIA Kernel Modules ==="
          lsmod | grep nvidia || echo "No nvidia modules loaded"
          echo ""
          echo "=== /dev/nvidia* devices ==="
          ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
          echo ""

      - name: Pre-pull Docker images
        env:
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          COMPOSE_FILE="artifacts/commit-specific/${TUTORIAL_NAME}/brev/docker-compose.yml"
          # Fail with a clear message instead of a confusing grep/docker error.
          if [ ! -f "$COMPOSE_FILE" ]; then
            echo "::error::Docker Compose file not found: $COMPOSE_FILE"
            exit 1
          fi
          # Extract all unique images from the compose file
          # The main image is defined with a YAML anchor like: image: &image ghcr.io/...
          MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //' | head -n 1)
          if [ -z "$MAIN_IMAGE" ]; then
            # Fallback: try to find any ghcr.io image reference
            MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -n 1)
          fi
          # Without this guard an empty match would reach `docker pull ""`.
          if [ -z "$MAIN_IMAGE" ]; then
            echo "::error::No main image reference found in $COMPOSE_FILE"
            exit 1
          fi
          # Extract the nsight image (nvcr.io)
          NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -n 1)
          echo "Pre-pulling main image: $MAIN_IMAGE"
          docker pull "$MAIN_IMAGE"
          if [ -n "$NSIGHT_IMAGE" ]; then
            echo "Pre-pulling nsight image: $NSIGHT_IMAGE"
            docker pull "$NSIGHT_IMAGE"
          fi
          echo "All images pulled successfully"

      - name: Test Docker Compose
        id: test
        env:
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          ./brev/test-docker-compose.bash "artifacts/commit-specific/${TUTORIAL_NAME}/brev/docker-compose.yml"

      # The status steps take GIT_SHA straight from the workflow input rather
      # than from GITHUB_ENV, so a status can still be posted when the job
      # failed before the "Set Git branch variables" step exported it.
      - name: Update commit status to success
        if: success()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GIT_SHA: ${{ inputs.git_sha }}
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${GITHUB_REPOSITORY}/statuses/${GIT_SHA}" \
            -f state='success' \
            -f target_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" \
            -f description='Tests passed' \
            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL_NAME})"

      - name: Update commit status to failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          GIT_SHA: ${{ inputs.git_sha }}
          TUTORIAL_NAME: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${GITHUB_REPOSITORY}/statuses/${GIT_SHA}" \
            -f state='failure' \
            -f target_url="${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}" \
            -f description='Tests failed' \
            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL_NAME})"