diff --git a/.github/workflows/build-brev-tutorial-docker-images.yml b/.github/workflows/build-brev-tutorial-docker-images.yml index 9686b734..274cc059 100644 --- a/.github/workflows/build-brev-tutorial-docker-images.yml +++ b/.github/workflows/build-brev-tutorial-docker-images.yml @@ -162,6 +162,8 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + buildkitd-config: /etc/buildkit/buildkitd.toml - name: Log in to GitHub Container Registry uses: docker/login-action@v3 @@ -174,6 +176,7 @@ jobs: uses: nick-fields/retry@v3 env: DOCKER_BUILDKIT: 1 + BUILDKIT_PROGRESS: plain with: timeout_minutes: 60 max_attempts: 3 @@ -181,15 +184,22 @@ jobs: command: | cd ${{ matrix.tutorial }}/brev + # Show buildx version and check for cache images + docker buildx version + echo "Checking for existing cache images..." + docker manifest inspect "${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" > /dev/null 2>&1 && echo "✓ Branch cache exists" || echo "✗ Branch cache not found" + docker manifest inspect "${IMAGE_NAME}:buildcache-main" > /dev/null 2>&1 && echo "✓ Main cache exists" || echo "✗ Main cache not found" + docker buildx bake \ + --progress=plain \ --allow=fs.read=/home/runner \ --set "base.output=type=registry" \ --set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-latest" \ --set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-git-${GIT_SHORT_SHA}" \ $([ "${GIT_BRANCH_NAME}" = "main" ] && echo "--set base.tags=${IMAGE_NAME}:latest") \ - --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},oci-mediatypes=true" \ - --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main,oci-mediatypes=true" \ - --set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,oci-mediatypes=true,compression=zstd,compression-level=3" \ + --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" \ + --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main" \ + --set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,image-manifest=true,compression=zstd,compression-level=3" \ --set "base.platform=linux/amd64" \ -f docker-compose.yml \ base diff --git a/.github/workflows/mirror-base-images.yml b/.github/workflows/mirror-base-images.yml index 5bde54cf..ec3ce8d7 100644 --- a/.github/workflows/mirror-base-images.yml +++ b/.github/workflows/mirror-base-images.yml @@ -59,9 +59,11 @@ jobs: - name: Pull source image from Docker Hub if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' + env: + BUILDKIT_PROGRESS: quiet run: | echo "Pulling ${{ matrix.image.source }} from Docker Hub..." - docker pull ${{ matrix.image.source }} + docker pull --quiet ${{ matrix.image.source }} - name: Tag image for GHCR if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' @@ -72,7 +74,7 @@ jobs: if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' run: | echo "Pushing ${TARGET_IMAGE} to GHCR..." 
- docker push ${TARGET_IMAGE} + docker push --quiet ${TARGET_IMAGE} echo "✓ Successfully mirrored ${{ matrix.image.source }}" - name: Skipped (image exists) @@ -84,6 +86,6 @@ jobs: - name: Verify mirrored image if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' run: | - docker pull ${TARGET_IMAGE} + docker pull --quiet ${TARGET_IMAGE} docker images ${TARGET_IMAGE} echo "✓ Image verified successfully" diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index a4f0c86a..72237b97 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -19,12 +19,15 @@ on: jobs: test-tutorial: name: test-tutorial (${{ inputs.tutorial }}) - runs-on: linux-amd64-gpu-t4-latest-1 + runs-on: linux-amd64-gpu-l4-latest-1 defaults: run: working-directory: ${{ github.workspace }} permissions: statuses: write + env: + BUILDKIT_PROGRESS: plain + DOCKER_CLI_HINTS: false steps: - name: Show runner info run: | @@ -63,6 +66,94 @@ jobs: username: ${{ github.actor }} password: ${{ secrets.GITHUB_TOKEN }} + - name: Stop DCGM to allow NCU profiling + run: | + # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling. + # Stop it before running the container tests. + echo "Stopping DCGM services..." + + # Stop the dcgm-exporter Docker container + echo "Stopping dcgm-exporter Docker container..." + docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running" + docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed" + + # Stop systemd services + sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped" + sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped" + sudo systemctl stop dcgm || echo "dcgm service not found or already stopped" + sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped" + + # Kill any remaining dcgm processes + sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found" + sudo pkill -9 dcgm || echo "No dcgm processes found" + + # Relax profiling permissions (perf_event_paranoid=4 is very restrictive) + echo "Relaxing profiling permissions..." + sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid" + sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict" + + echo "DCGM services stopped and profiling permissions relaxed." 
+ + - name: Debug GPU and NCU configuration + run: | + echo "=== GPU Information ===" + nvidia-smi || echo "nvidia-smi failed" + echo "" + + echo "=== NVIDIA Driver Version ===" + cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version" + echo "" + + echo "=== GPU Processes ===" + nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed" + echo "" + + echo "=== DCGM/NCU Blocking Processes ===" + ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found" + echo "" + + echo "=== Systemd Services (nvidia/dcgm related) ===" + systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services" + echo "" + + echo "=== Profiling Permissions ===" + cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid" + cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict" + echo "" + + echo "=== NVIDIA Kernel Modules ===" + lsmod | grep nvidia || echo "No nvidia modules loaded" + echo "" + + echo "=== /dev/nvidia* devices ===" + ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found" + echo "" + + - name: Pre-pull Docker images + run: | + COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml" + + # Extract all unique images from the compose file + # The main image is defined with a YAML anchor like: image: &image ghcr.io/... + MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //') + if [ -z "$MAIN_IMAGE" ]; then + # Fallback: try to find any ghcr.io image reference + MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1) + fi + + # Extract the nsight image (nvcr.io) + NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1) + + echo "Pre-pulling main image: $MAIN_IMAGE" + docker pull "$MAIN_IMAGE" + + if [ -n "$NSIGHT_IMAGE" ]; then + echo "Pre-pulling nsight image: $NSIGHT_IMAGE" + docker pull "$NSIGHT_IMAGE" + fi + + echo "All images pulled successfully" + - name: Test Docker Compose id: test run: | diff --git a/brev/test-docker-compose.bash b/brev/test-docker-compose.bash index 16417f1a..3a7fd34d 100755 --- a/brev/test-docker-compose.bash +++ b/brev/test-docker-compose.bash @@ -115,7 +115,7 @@ export ACH_RUN_TESTS=1 # Start container echo "📦 Starting containers..." 
echo "" -if docker compose -f "${COMPOSE_FILE}" up -d; then +if docker compose -f "${COMPOSE_FILE}" up -d --quiet-pull; then echo "" echo -e "${GREEN}✅ Containers started successfully${NC}" echo "" diff --git a/tutorials/accelerated-python/brev/requirements.txt b/tutorials/accelerated-python/brev/requirements.txt index 894481dc..e5d12e35 100644 --- a/tutorials/accelerated-python/brev/requirements.txt +++ b/tutorials/accelerated-python/brev/requirements.txt @@ -23,7 +23,7 @@ nvidia-nvshmem-cu12 == 3.3.20 nvidia-cuda-nvcc-cu12 == 12.8.* nvidia-cuda-nvrtc-cu12 == 12.8.* -# NVIDIA devtools +# NVIDIA developer tools nvtx nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git diff --git a/tutorials/accelerated-python/test/pytest.ini b/tutorials/accelerated-python/test/pytest.ini index aa23ff1f..1dd0dacc 100644 --- a/tutorials/accelerated-python/test/pytest.ini +++ b/tutorials/accelerated-python/test/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = -v --durations=0 --durations-min=0.0 +addopts = -v -s --durations=0 --durations-min=0.0 diff --git a/tutorials/accelerated-python/test/test_notebooks.py b/tutorials/accelerated-python/test/test_notebooks.py index 9ef7b510..1e60c52f 100644 --- a/tutorials/accelerated-python/test/test_notebooks.py +++ b/tutorials/accelerated-python/test/test_notebooks.py @@ -4,6 +4,7 @@ import pytest from pathlib import Path +import time import nbformat from nbclient import NotebookClient from nbclient.exceptions import CellExecutionError @@ -11,13 +12,60 @@ # Define the path to the notebooks directory NOTEBOOKS_DIR = Path(__file__).resolve().parent.parent / 'notebooks' -# Discover all solution notebooks -solution_notebooks = sorted(NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb')) +# Discover all solution notebooks (excluding checkpoint files) +solution_notebooks = sorted([ + nb for nb in NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb') + if '.ipynb_checkpoints' not in str(nb) +]) # Create test IDs from notebook paths for better test output notebook_ids = [nb.relative_to(NOTEBOOKS_DIR).as_posix() for nb in solution_notebooks] +def extract_cell_outputs(nb, cell_times=None): + """Extract stdout/stderr from all executed cells for debugging.""" + outputs = [] + for i, cell in enumerate(nb.cells): + if cell.cell_type != 'code': + continue + cell_outputs = [] + for output in cell.get('outputs', []): + if output.get('output_type') == 'stream': + stream_name = output.get('name', 'stdout') + text = output.get('text', '') + cell_outputs.append(f"[{stream_name}] {text}") + elif output.get('output_type') == 'error': + ename = output.get('ename', 'Error') + evalue = output.get('evalue', '') + cell_outputs.append(f"[error] {ename}: {evalue}") + if cell_outputs: + source_preview = cell.source[:100].replace('\n', ' ') + time_str = f" ({cell_times.get(i, 0):.2f}s)" if cell_times else "" + outputs.append(f"--- Cell {i}{time_str}: {source_preview}... 
---\n" + ''.join(cell_outputs)) + return '\n'.join(outputs) + + +def check_gpu_state(): + """Print GPU state for debugging slow execution.""" + import subprocess + try: + result = subprocess.run( + ['nvidia-smi', '--query-gpu=name,compute_mode,clocks.current.sm,clocks.current.memory,power.draw,temperature.gpu,utilization.gpu', '--format=csv,noheader'], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + print(f" GPU State: {result.stdout.strip()}") + # Also check for any processes using the GPU + result2 = subprocess.run( + ['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader'], + capture_output=True, text=True, timeout=5 + ) + if result2.returncode == 0 and result2.stdout.strip(): + print(f" GPU Processes: {result2.stdout.strip()}") + except Exception as e: + print(f" GPU State check failed: {e}") + + @pytest.mark.parametrize('notebook_path', solution_notebooks, ids=notebook_ids) def test_solution_notebook_executes(notebook_path): """ @@ -25,6 +73,10 @@ def test_solution_notebook_executes(notebook_path): Uses nbclient to execute all cells in the notebook. """ + print(f"\n=== Starting notebook: {notebook_path.name} ===") + check_gpu_state() + notebook_start = time.time() + # Read the notebook with open(notebook_path, 'r', encoding='utf-8') as f: nb = nbformat.read(f, as_version=4) @@ -44,13 +96,42 @@ def test_solution_notebook_executes(notebook_path): resources={'metadata': {'path': str(notebook_path.parent)}} ) - # Execute the notebook + # Execute the notebook cell by cell to get timing + cell_times = {} try: - client.execute() + with client.setup_kernel(): + for i, cell in enumerate(nb.cells): + if cell.cell_type != 'code': + continue + cell_start = time.time() + source_preview = cell.source[:60].replace('\n', ' ') + print(f" Cell {i}: {source_preview}...", end='', flush=True) + + # Check kernel is alive before executing + if not client.kc.is_alive(): + print(" [KERNEL DEAD!]") + raise RuntimeError(f"Kernel died before cell {i}") + + client.execute_cell(cell, i) + cell_time = time.time() - cell_start + cell_times[i] = cell_time + print(f" [{cell_time:.2f}s]") + + # Flush any pending output + import sys + sys.stdout.flush() + except CellExecutionError as e: # Provide detailed error information - # CellExecutionError stores the error message in str(e) - pytest.fail(f"Notebook execution failed:\n{str(e)}") + # Include output from ALL cells, not just the failing one + all_outputs = extract_cell_outputs(nb, cell_times) + total_time = time.time() - notebook_start + pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s):\n{str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") except Exception as e: # Catch any other execution errors - pytest.fail(f"Notebook execution failed: {str(e)}") + all_outputs = extract_cell_outputs(nb, cell_times) + total_time = time.time() - notebook_start + pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s): {str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") + + total_time = time.time() - notebook_start + print(f"=== Completed {notebook_path.name} in {total_time:.2f}s ===")
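
For reference, below is a minimal standalone sketch of the cell-by-cell execution pattern the updated test_notebooks.py uses, handy for reproducing a slow or failing notebook outside pytest. It only uses the nbformat/nbclient calls already present in the diff (NotebookClient, setup_kernel, execute_cell, CellExecutionError); the notebook path is hypothetical.

    # Sketch: execute one notebook cell by cell with per-cell timing.
    # Assumes nbformat and nbclient are installed; adjust the path as needed.
    import time
    from pathlib import Path

    import nbformat
    from nbclient import NotebookClient
    from nbclient.exceptions import CellExecutionError

    notebook_path = Path('notebooks/example_SOLUTION.ipynb')  # hypothetical path

    # Read the notebook and run it in its own directory, mirroring the test setup.
    nb = nbformat.read(notebook_path, as_version=4)
    client = NotebookClient(
        nb,
        timeout=600,
        kernel_name='python3',
        resources={'metadata': {'path': str(notebook_path.parent)}},
    )

    # Execute code cells one at a time so each cell's wall-clock time is visible,
    # instead of client.execute(), which runs the whole notebook without feedback.
    with client.setup_kernel():
        for i, cell in enumerate(nb.cells):
            if cell.cell_type != 'code':
                continue
            start = time.time()
            try:
                client.execute_cell(cell, i)
            except CellExecutionError as e:
                print(f"Cell {i} failed after {time.time() - start:.2f}s:\n{e}")
                raise
            print(f"Cell {i} finished in {time.time() - start:.2f}s")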