Merged

Changes from all commits (15 commits)
16 changes: 13 additions & 3 deletions .github/workflows/build-brev-tutorial-docker-images.yml
@@ -162,6 +162,8 @@ jobs:

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
buildkitd-config: /etc/buildkit/buildkitd.toml

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
@@ -174,22 +176,30 @@ jobs:
uses: nick-fields/retry@v3
env:
DOCKER_BUILDKIT: 1
BUILDKIT_PROGRESS: plain
with:
timeout_minutes: 60
max_attempts: 3
retry_wait_seconds: 30
command: |
cd ${{ matrix.tutorial }}/brev

# Show buildx version and check for cache images
docker buildx version
echo "Checking for existing cache images..."
docker manifest inspect "${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" > /dev/null 2>&1 && echo "✓ Branch cache exists" || echo "✗ Branch cache not found"
docker manifest inspect "${IMAGE_NAME}:buildcache-main" > /dev/null 2>&1 && echo "✓ Main cache exists" || echo "✗ Main cache not found"

docker buildx bake \
--progress=plain \
--allow=fs.read=/home/runner \
--set "base.output=type=registry" \
--set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-latest" \
--set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-git-${GIT_SHORT_SHA}" \
$([ "${GIT_BRANCH_NAME}" = "main" ] && echo "--set base.tags=${IMAGE_NAME}:latest") \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},oci-mediatypes=true" \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main,oci-mediatypes=true" \
--set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,oci-mediatypes=true,compression=zstd,compression-level=3" \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main" \
--set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,image-manifest=true,compression=zstd,compression-level=3" \
--set "base.platform=linux/amd64" \
-f docker-compose.yml \
base
8 changes: 5 additions & 3 deletions .github/workflows/mirror-base-images.yml
@@ -59,9 +59,11 @@ jobs:

- name: Pull source image from Docker Hub
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
env:
BUILDKIT_PROGRESS: quiet
run: |
echo "Pulling ${{ matrix.image.source }} from Docker Hub..."
docker pull ${{ matrix.image.source }}
docker pull --quiet ${{ matrix.image.source }}

- name: Tag image for GHCR
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
@@ -72,7 +74,7 @@ jobs:
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
run: |
echo "Pushing ${TARGET_IMAGE} to GHCR..."
docker push ${TARGET_IMAGE}
docker push --quiet ${TARGET_IMAGE}
echo "✓ Successfully mirrored ${{ matrix.image.source }}"

- name: Skipped (image exists)
@@ -84,6 +86,6 @@ jobs:
- name: Verify mirrored image
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
run: |
docker pull ${TARGET_IMAGE}
docker pull --quiet ${TARGET_IMAGE}
docker images ${TARGET_IMAGE}
echo "✓ Image verified successfully"
93 changes: 92 additions & 1 deletion .github/workflows/test-brev-tutorial-docker-images.yml
@@ -19,12 +19,15 @@ on:
jobs:
test-tutorial:
name: test-tutorial (${{ inputs.tutorial }})
runs-on: linux-amd64-gpu-t4-latest-1
runs-on: linux-amd64-gpu-l4-latest-1
defaults:
run:
working-directory: ${{ github.workspace }}
permissions:
statuses: write
env:
BUILDKIT_PROGRESS: plain
DOCKER_CLI_HINTS: false
steps:
- name: Show runner info
run: |
@@ -63,6 +66,94 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Stop DCGM to allow NCU profiling
run: |
# DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
# Stop it before running the container tests.
echo "Stopping DCGM services..."

# Stop the dcgm-exporter Docker container
echo "Stopping dcgm-exporter Docker container..."
docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"

# Stop systemd services
sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"

# Kill any remaining dcgm processes
sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
sudo pkill -9 dcgm || echo "No dcgm processes found"

# Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
echo "Relaxing profiling permissions..."
sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"

echo "DCGM services stopped and profiling permissions relaxed."

- name: Debug GPU and NCU configuration
run: |
echo "=== GPU Information ==="
nvidia-smi || echo "nvidia-smi failed"
echo ""

echo "=== NVIDIA Driver Version ==="
cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
echo ""

echo "=== GPU Processes ==="
nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
echo ""

echo "=== DCGM/NCU Blocking Processes ==="
ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
echo ""

echo "=== Systemd Services (nvidia/dcgm related) ==="
systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
echo ""

echo "=== Profiling Permissions ==="
cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
echo ""

echo "=== NVIDIA Kernel Modules ==="
lsmod | grep nvidia || echo "No nvidia modules loaded"
echo ""

echo "=== /dev/nvidia* devices ==="
ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
echo ""

- name: Pre-pull Docker images
run: |
COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml"

# Extract all unique images from the compose file
# The main image is defined with a YAML anchor like: image: &image ghcr.io/...
MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
if [ -z "$MAIN_IMAGE" ]; then
# Fallback: try to find any ghcr.io image reference
MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
fi

# Extract the nsight image (nvcr.io)
NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1)

echo "Pre-pulling main image: $MAIN_IMAGE"
docker pull "$MAIN_IMAGE"

if [ -n "$NSIGHT_IMAGE" ]; then
echo "Pre-pulling nsight image: $NSIGHT_IMAGE"
docker pull "$NSIGHT_IMAGE"
fi

echo "All images pulled successfully"

- name: Test Docker Compose
id: test
run: |
2 changes: 1 addition & 1 deletion brev/test-docker-compose.bash
@@ -115,7 +115,7 @@ export ACH_RUN_TESTS=1
# Start container
echo "📦 Starting containers..."
echo ""
if docker compose -f "${COMPOSE_FILE}" up -d; then
if docker compose -f "${COMPOSE_FILE}" up -d --quiet-pull; then
echo ""
echo -e "${GREEN}✅ Containers started successfully${NC}"
echo ""
2 changes: 1 addition & 1 deletion tutorials/accelerated-python/brev/requirements.txt
@@ -23,7 +23,7 @@ nvidia-nvshmem-cu12 == 3.3.20
nvidia-cuda-nvcc-cu12 == 12.8.*
nvidia-cuda-nvrtc-cu12 == 12.8.*

# NVIDIA devtools
# NVIDIA developer tools
nvtx
nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git

2 changes: 1 addition & 1 deletion tutorials/accelerated-python/test/pytest.ini
@@ -1,2 +1,2 @@
[pytest]
addopts = -v --durations=0 --durations-min=0.0
addopts = -v -s --durations=0 --durations-min=0.0
95 changes: 88 additions & 7 deletions tutorials/accelerated-python/test/test_notebooks.py
@@ -4,27 +4,79 @@

import pytest
from pathlib import Path
import time
import nbformat
from nbclient import NotebookClient
from nbclient.exceptions import CellExecutionError

# Define the path to the notebooks directory
NOTEBOOKS_DIR = Path(__file__).resolve().parent.parent / 'notebooks'

# Discover all solution notebooks
solution_notebooks = sorted(NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb'))
# Discover all solution notebooks (excluding checkpoint files)
solution_notebooks = sorted([
nb for nb in NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb')
if '.ipynb_checkpoints' not in str(nb)
])

# Create test IDs from notebook paths for better test output
notebook_ids = [nb.relative_to(NOTEBOOKS_DIR).as_posix() for nb in solution_notebooks]


def extract_cell_outputs(nb, cell_times=None):
"""Extract stdout/stderr from all executed cells for debugging."""
outputs = []
for i, cell in enumerate(nb.cells):
if cell.cell_type != 'code':
continue
cell_outputs = []
for output in cell.get('outputs', []):
if output.get('output_type') == 'stream':
stream_name = output.get('name', 'stdout')
text = output.get('text', '')
cell_outputs.append(f"[{stream_name}] {text}")
elif output.get('output_type') == 'error':
ename = output.get('ename', 'Error')
evalue = output.get('evalue', '')
cell_outputs.append(f"[error] {ename}: {evalue}")
if cell_outputs:
source_preview = cell.source[:100].replace('\n', ' ')
time_str = f" ({cell_times.get(i, 0):.2f}s)" if cell_times else ""
outputs.append(f"--- Cell {i}{time_str}: {source_preview}... ---\n" + ''.join(cell_outputs))
return '\n'.join(outputs)


def check_gpu_state():
"""Print GPU state for debugging slow execution."""
import subprocess
try:
result = subprocess.run(
['nvidia-smi', '--query-gpu=name,compute_mode,clocks.current.sm,clocks.current.memory,power.draw,temperature.gpu,utilization.gpu', '--format=csv,noheader'],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0:
print(f" GPU State: {result.stdout.strip()}")
# Also check for any processes using the GPU
result2 = subprocess.run(
['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader'],
capture_output=True, text=True, timeout=5
)
if result2.returncode == 0 and result2.stdout.strip():
print(f" GPU Processes: {result2.stdout.strip()}")
except Exception as e:
print(f" GPU State check failed: {e}")


@pytest.mark.parametrize('notebook_path', solution_notebooks, ids=notebook_ids)
def test_solution_notebook_executes(notebook_path):
"""
Test that a solution notebook executes without errors.

Uses nbclient to execute all cells in the notebook.
"""
print(f"\n=== Starting notebook: {notebook_path.name} ===")
check_gpu_state()
notebook_start = time.time()

# Read the notebook
with open(notebook_path, 'r', encoding='utf-8') as f:
nb = nbformat.read(f, as_version=4)
@@ -44,13 +96,42 @@ def test_solution_notebook_executes(notebook_path):
resources={'metadata': {'path': str(notebook_path.parent)}}
)

# Execute the notebook
# Execute the notebook cell by cell to get timing
cell_times = {}
try:
client.execute()
with client.setup_kernel():
for i, cell in enumerate(nb.cells):
if cell.cell_type != 'code':
continue
cell_start = time.time()
source_preview = cell.source[:60].replace('\n', ' ')
print(f" Cell {i}: {source_preview}...", end='', flush=True)

# Check kernel is alive before executing
if not client.kc.is_alive():
print(" [KERNEL DEAD!]")
raise RuntimeError(f"Kernel died before cell {i}")

client.execute_cell(cell, i)
cell_time = time.time() - cell_start
cell_times[i] = cell_time
print(f" [{cell_time:.2f}s]")

# Flush any pending output
import sys
sys.stdout.flush()

except CellExecutionError as e:
# Provide detailed error information
# CellExecutionError stores the error message in str(e)
pytest.fail(f"Notebook execution failed:\n{str(e)}")
# Include output from ALL cells, not just the failing one
all_outputs = extract_cell_outputs(nb, cell_times)
total_time = time.time() - notebook_start
pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s):\n{str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}")
except Exception as e:
# Catch any other execution errors
pytest.fail(f"Notebook execution failed: {str(e)}")
all_outputs = extract_cell_outputs(nb, cell_times)
total_time = time.time() - notebook_start
pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s): {str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}")

total_time = time.time() - notebook_start
print(f"=== Completed {notebook_path.name} in {total_time:.2f}s ===")