# NOTE: the text below appears to be page-navigation residue that preceded the
# workflow definition; it is kept as comments so the file parses as YAML.
#   Skip to content
#   Test Brev Tutorial Docker Images #490

name: Test Brev Tutorial Docker Images

# This workflow has a single trigger: a manual/API dispatch. All three inputs
# are required strings.
on:
  workflow_dispatch:
    inputs:
      # Used in the job name, the artifact name/path and the status context.
      tutorial:
        description: 'Tutorial name to test'
        required: true
        type: string
      # Commit that receives the success/failure commit status.
      git_sha:
        description: 'Git commit SHA to update status for'
        required: true
        type: string
      # Run of build-brev-tutorial-docker-images.yml whose artifact is tested.
      workflow_run_id:
        description: 'Workflow run ID to download artifacts from'
        required: true
        type: string
jobs:
  # Downloads the docker-compose artifact produced by a given build run,
  # pre-pulls the images it references, runs the repository's compose test
  # script against it, and reports the result as a commit status on the
  # requested SHA.
  test-tutorial:
    name: test-tutorial (${{ inputs.tutorial }})
    runs-on: linux-amd64-gpu-l4-latest-1
    defaults:
      run:
        working-directory: ${{ github.workspace }}
    # Declaring `permissions` sets every scope not listed here to `none`.
    # NOTE(review): confirm that checkout, the artifact download and the
    # ghcr.io pulls work for this repository without `contents: read`,
    # `actions: read` and `packages: read`.
    permissions:
      statuses: write
    env:
      BUILDKIT_PROGRESS: plain
      # Quoted so the value is an explicit string rather than a YAML boolean.
      DOCKER_CLI_HINTS: "false"
    steps:
      - name: Show runner info
        run: |
          echo "Runner name: ${{ runner.name }}"
          echo "Runner OS: ${{ runner.os }}"
          echo "Runner arch: ${{ runner.arch }}"
          echo "Runner uname: $(uname -a)"

      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set Git branch variables
        env:
          # Workflow inputs are passed through the environment instead of being
          # interpolated into the script, so their content is never parsed as
          # shell code.
          INPUT_GIT_SHA: ${{ inputs.git_sha }}
        run: |
          GIT_BRANCH_NAME=${GITHUB_REF#refs/heads/}
          # Sanitize branch name for Docker tags (replace invalid characters with hyphens and convert to lowercase)
          DOCKER_TAG_BRANCH=$(echo "${GIT_BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | tr '[:upper:]' '[:lower:]')
          GIT_SHA=${INPUT_GIT_SHA}
          GIT_SHORT_SHA=${GIT_SHA:0:7}
          # Export the values to all later steps of this job.
          {
            echo "GIT_BRANCH_NAME=${GIT_BRANCH_NAME}"
            echo "DOCKER_TAG_BRANCH=${DOCKER_TAG_BRANCH}"
            echo "GIT_SHA=${GIT_SHA}"
            echo "GIT_SHORT_SHA=${GIT_SHORT_SHA}"
          } >> "$GITHUB_ENV"

      # NOTE(review): third-party action referenced by a mutable tag; consider
      # pinning it to a full commit SHA.
      - name: Download commit-specific Docker Compose artifact
        uses: dawidd6/action-download-artifact@v6
        with:
          workflow: build-brev-tutorial-docker-images.yml
          run_id: ${{ inputs.workflow_run_id }}
          name: docker-compose-${{ inputs.tutorial }}-${{ env.DOCKER_TAG_BRANCH }}-git-${{ env.GIT_SHORT_SHA }}
          path: artifacts/commit-specific/${{ inputs.tutorial }}/

      - name: Log in to GitHub Container Registry
        uses: docker/login-action@v3
        with:
          registry: ghcr.io
          username: ${{ github.actor }}
          password: ${{ secrets.GITHUB_TOKEN }}

      # Every command below is best-effort: a missing container, service or
      # process only prints a message and never fails the step.
      - name: Stop DCGM to allow NCU profiling
        run: |
          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
          # Stop it before running the container tests.
          echo "Stopping DCGM services..."
          # Stop the dcgm-exporter Docker container
          echo "Stopping dcgm-exporter Docker container..."
          docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
          docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"
          # Stop systemd services
          sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
          # Kill any remaining dcgm processes
          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
          sudo pkill -9 dcgm || echo "No dcgm processes found"
          # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
          echo "Relaxing profiling permissions..."
          sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
          sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"
          echo "DCGM services stopped and profiling permissions relaxed."

      # Diagnostics only: every probe falls back to an echo, so this step
      # cannot fail the job.
      - name: Debug GPU and NCU configuration
        run: |
          echo "=== GPU Information ==="
          nvidia-smi || echo "nvidia-smi failed"
          echo ""
          echo "=== NVIDIA Driver Version ==="
          cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
          echo ""
          echo "=== GPU Processes ==="
          nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
          echo ""
          echo "=== DCGM/NCU Blocking Processes ==="
          ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
          echo ""
          echo "=== Systemd Services (nvidia/dcgm related) ==="
          systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
          echo ""
          echo "=== Profiling Permissions ==="
          cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
          cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
          echo ""
          echo "=== NVIDIA Kernel Modules ==="
          lsmod | grep nvidia || echo "No nvidia modules loaded"
          echo ""
          echo "=== /dev/nvidia* devices ==="
          ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
          echo ""

      - name: Pre-pull Docker images
        env:
          # Input passed via the environment to avoid shell injection.
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          COMPOSE_FILE="artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"
          # Fail early with a clear message instead of a confusing docker error.
          if [ ! -f "$COMPOSE_FILE" ]; then
            echo "::error::Compose file not found: $COMPOSE_FILE"
            exit 1
          fi
          # Extract all unique images from the compose file
          # The main image is defined with a YAML anchor like: image: &image ghcr.io/...
          MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
          if [ -z "$MAIN_IMAGE" ]; then
            # Fallback: try to find any ghcr.io image reference
            MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
          fi
          if [ -z "$MAIN_IMAGE" ]; then
            echo "::error::No main image reference found in $COMPOSE_FILE"
            exit 1
          fi
          # Extract the nsight image (nvcr.io); it is optional.
          NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
          echo "Pre-pulling main image: $MAIN_IMAGE"
          docker pull "$MAIN_IMAGE"
          if [ -n "$NSIGHT_IMAGE" ]; then
            echo "Pre-pulling nsight image: $NSIGHT_IMAGE"
            docker pull "$NSIGHT_IMAGE"
          fi
          echo "All images pulled successfully"

      - name: Test Docker Compose
        id: test
        env:
          # Input passed via the environment to avoid shell injection.
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          ./brev/test-docker-compose.bash "artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"

      # The two status steps take the SHA and tutorial straight from the
      # workflow inputs (via env), so the status can still be posted when the
      # job failed before "Set Git branch variables" exported GIT_SHA.
      - name: Update commit status to success
        if: success()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          STATUS_SHA: ${{ inputs.git_sha }}
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${STATUS_SHA}" \
            -f state='success' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests passed' \
            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL})"

      - name: Update commit status to failure
        if: failure()
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          STATUS_SHA: ${{ inputs.git_sha }}
          TUTORIAL: ${{ inputs.tutorial }}
        run: |
          gh api \
            --method POST \
            -H "Accept: application/vnd.github+json" \
            "/repos/${{ github.repository }}/statuses/${STATUS_SHA}" \
            -f state='failure' \
            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
            -f description='Tests failed' \
            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL})"