Merged

Changes from all commits (15 commits)
16 changes: 13 additions & 3 deletions .github/workflows/build-brev-tutorial-docker-images.yml
@@ -162,6 +162,8 @@ jobs:

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
with:
buildkitd-config: /etc/buildkit/buildkitd.toml

- name: Log in to GitHub Container Registry
uses: docker/login-action@v3
@@ -174,22 +176,30 @@ jobs:
uses: nick-fields/retry@v3
env:
DOCKER_BUILDKIT: 1
BUILDKIT_PROGRESS: plain
with:
timeout_minutes: 60
max_attempts: 3
retry_wait_seconds: 30
command: |
cd ${{ matrix.tutorial }}/brev

# Show buildx version and check for cache images
docker buildx version
echo "Checking for existing cache images..."
docker manifest inspect "${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" > /dev/null 2>&1 && echo "✓ Branch cache exists" || echo "✗ Branch cache not found"
docker manifest inspect "${IMAGE_NAME}:buildcache-main" > /dev/null 2>&1 && echo "✓ Main cache exists" || echo "✗ Main cache not found"

docker buildx bake \
--progress=plain \
--allow=fs.read=/home/runner \
--set "base.output=type=registry" \
--set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-latest" \
--set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-git-${GIT_SHORT_SHA}" \
$([ "${GIT_BRANCH_NAME}" = "main" ] && echo "--set base.tags=${IMAGE_NAME}:latest") \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},oci-mediatypes=true" \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main,oci-mediatypes=true" \
--set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,oci-mediatypes=true,compression=zstd,compression-level=3" \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" \
--set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main" \
--set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,image-manifest=true,compression=zstd,compression-level=3" \
--set "base.platform=linux/amd64" \
-f docker-compose.yml \
base
8 changes: 5 additions & 3 deletions .github/workflows/mirror-base-images.yml
@@ -59,9 +59,11 @@ jobs:

- name: Pull source image from Docker Hub
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
env:
BUILDKIT_PROGRESS: quiet
run: |
echo "Pulling ${{ matrix.image.source }} from Docker Hub..."
docker pull ${{ matrix.image.source }}
docker pull --quiet ${{ matrix.image.source }}

- name: Tag image for GHCR
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
@@ -72,7 +74,7 @@ jobs:
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
run: |
echo "Pushing ${TARGET_IMAGE} to GHCR..."
docker push ${TARGET_IMAGE}
docker push --quiet ${TARGET_IMAGE}
echo "✓ Successfully mirrored ${{ matrix.image.source }}"

- name: Skipped (image exists)
@@ -84,6 +86,6 @@ jobs:
- name: Verify mirrored image
if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true'
run: |
docker pull ${TARGET_IMAGE}
docker pull --quiet ${TARGET_IMAGE}
docker images ${TARGET_IMAGE}
echo "✓ Image verified successfully"
93 changes: 92 additions & 1 deletion .github/workflows/test-brev-tutorial-docker-images.yml
@@ -19,12 +19,15 @@ on:
jobs:
test-tutorial:
name: test-tutorial (${{ inputs.tutorial }})
runs-on: linux-amd64-gpu-t4-latest-1
runs-on: linux-amd64-gpu-l4-latest-1
defaults:
run:
working-directory: ${{ github.workspace }}
permissions:
statuses: write
env:
BUILDKIT_PROGRESS: plain
DOCKER_CLI_HINTS: false
steps:
- name: Show runner info
run: |
@@ -63,6 +66,94 @@ jobs:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}

- name: Stop DCGM to allow NCU profiling
run: |
# DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
# Stop it before running the container tests.
echo "Stopping DCGM services..."

# Stop the dcgm-exporter Docker container
echo "Stopping dcgm-exporter Docker container..."
docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"

# Stop systemd services
sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"

# Kill any remaining dcgm processes
sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
sudo pkill -9 dcgm || echo "No dcgm processes found"

# Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
echo "Relaxing profiling permissions..."
sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"

echo "DCGM services stopped and profiling permissions relaxed."

- name: Debug GPU and NCU configuration
run: |
echo "=== GPU Information ==="
nvidia-smi || echo "nvidia-smi failed"
echo ""

echo "=== NVIDIA Driver Version ==="
cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
echo ""

echo "=== GPU Processes ==="
nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
echo ""

echo "=== DCGM/NCU Blocking Processes ==="
ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
echo ""

echo "=== Systemd Services (nvidia/dcgm related) ==="
systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
echo ""

echo "=== Profiling Permissions ==="
cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
echo ""

echo "=== NVIDIA Kernel Modules ==="
lsmod | grep nvidia || echo "No nvidia modules loaded"
echo ""

echo "=== /dev/nvidia* devices ==="
ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
echo ""

- name: Pre-pull Docker images
run: |
COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml"

# Extract all unique images from the compose file
# The main image is defined with a YAML anchor like: image: &image ghcr.io/...
MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
if [ -z "$MAIN_IMAGE" ]; then
# Fallback: try to find any ghcr.io image reference
MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
fi

# Extract the nsight image (nvcr.io)
NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1)

echo "Pre-pulling main image: $MAIN_IMAGE"
docker pull "$MAIN_IMAGE"

if [ -n "$NSIGHT_IMAGE" ]; then
echo "Pre-pulling nsight image: $NSIGHT_IMAGE"
docker pull "$NSIGHT_IMAGE"
fi

echo "All images pulled successfully"

- name: Test Docker Compose
id: test
run: |
2 changes: 1 addition & 1 deletion brev/test-docker-compose.bash
@@ -115,7 +115,7 @@ export ACH_RUN_TESTS=1
# Start container
echo "📦 Starting containers..."
echo ""
if docker compose -f "${COMPOSE_FILE}" up -d; then
if docker compose -f "${COMPOSE_FILE}" up -d --quiet-pull; then
echo ""
echo -e "${GREEN}✅ Containers started successfully${NC}"
echo ""
2 changes: 1 addition & 1 deletion tutorials/accelerated-python/brev/requirements.txt
@@ -23,7 +23,7 @@ nvidia-nvshmem-cu12 == 3.3.20
nvidia-cuda-nvcc-cu12 == 12.8.*
nvidia-cuda-nvrtc-cu12 == 12.8.*

# NVIDIA devtools
# NVIDIA developer tools
nvtx
nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git

2 changes: 1 addition & 1 deletion tutorials/accelerated-python/test/pytest.ini
@@ -1,2 +1,2 @@
[pytest]
addopts = -v --durations=0 --durations-min=0.0
addopts = -v -s --durations=0 --durations-min=0.0
95 changes: 88 additions & 7 deletions tutorials/accelerated-python/test/test_notebooks.py
@@ -4,27 +4,79 @@

import pytest
from pathlib import Path
import time
import nbformat
from nbclient import NotebookClient
from nbclient.exceptions import CellExecutionError

# Define the path to the notebooks directory
NOTEBOOKS_DIR = Path(__file__).resolve().parent.parent / 'notebooks'

# Discover all solution notebooks
solution_notebooks = sorted(NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb'))
# Discover all solution notebooks (excluding checkpoint files)
solution_notebooks = sorted([
nb for nb in NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb')
if '.ipynb_checkpoints' not in str(nb)
])

# Create test IDs from notebook paths for better test output
notebook_ids = [nb.relative_to(NOTEBOOKS_DIR).as_posix() for nb in solution_notebooks]


def extract_cell_outputs(nb, cell_times=None):
"""Extract stdout/stderr from all executed cells for debugging."""
outputs = []
for i, cell in enumerate(nb.cells):
if cell.cell_type != 'code':
continue
cell_outputs = []
for output in cell.get('outputs', []):
if output.get('output_type') == 'stream':
stream_name = output.get('name', 'stdout')
text = output.get('text', '')
cell_outputs.append(f"[{stream_name}] {text}")
elif output.get('output_type') == 'error':
ename = output.get('ename', 'Error')
evalue = output.get('evalue', '')
cell_outputs.append(f"[error] {ename}: {evalue}")
if cell_outputs:
source_preview = cell.source[:100].replace('\n', ' ')
time_str = f" ({cell_times.get(i, 0):.2f}s)" if cell_times else ""
outputs.append(f"--- Cell {i}{time_str}: {source_preview}... ---\n" + ''.join(cell_outputs))
return '\n'.join(outputs)


def check_gpu_state():
"""Print GPU state for debugging slow execution."""
import subprocess
try:
result = subprocess.run(
['nvidia-smi', '--query-gpu=name,compute_mode,clocks.current.sm,clocks.current.memory,power.draw,temperature.gpu,utilization.gpu', '--format=csv,noheader'],
capture_output=True, text=True, timeout=5
)
if result.returncode == 0:
print(f" GPU State: {result.stdout.strip()}")
# Also check for any processes using the GPU
result2 = subprocess.run(
['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader'],
capture_output=True, text=True, timeout=5
)
if result2.returncode == 0 and result2.stdout.strip():
print(f" GPU Processes: {result2.stdout.strip()}")
except Exception as e:
print(f" GPU State check failed: {e}")


@pytest.mark.parametrize('notebook_path', solution_notebooks, ids=notebook_ids)
def test_solution_notebook_executes(notebook_path):
"""
Test that a solution notebook executes without errors.

Uses nbclient to execute all cells in the notebook.
"""
print(f"\n=== Starting notebook: {notebook_path.name} ===")
check_gpu_state()
notebook_start = time.time()

# Read the notebook
with open(notebook_path, 'r', encoding='utf-8') as f:
nb = nbformat.read(f, as_version=4)
@@ -44,13 +96,42 @@ def test_solution_notebook_executes(notebook_path):
resources={'metadata': {'path': str(notebook_path.parent)}}
)

# Execute the notebook
# Execute the notebook cell by cell to get timing
cell_times = {}
try:
client.execute()
with client.setup_kernel():
for i, cell in enumerate(nb.cells):
if cell.cell_type != 'code':
continue
cell_start = time.time()
source_preview = cell.source[:60].replace('\n', ' ')
print(f" Cell {i}: {source_preview}...", end='', flush=True)

# Check kernel is alive before executing
if not client.kc.is_alive():
print(" [KERNEL DEAD!]")
raise RuntimeError(f"Kernel died before cell {i}")

client.execute_cell(cell, i)
cell_time = time.time() - cell_start
cell_times[i] = cell_time
print(f" [{cell_time:.2f}s]")

# Flush any pending output
import sys
sys.stdout.flush()

except CellExecutionError as e:
# Provide detailed error information
# CellExecutionError stores the error message in str(e)
pytest.fail(f"Notebook execution failed:\n{str(e)}")
# Include output from ALL cells, not just the failing one
all_outputs = extract_cell_outputs(nb, cell_times)
total_time = time.time() - notebook_start
pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s):\n{str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}")
except Exception as e:
# Catch any other execution errors
pytest.fail(f"Notebook execution failed: {str(e)}")
all_outputs = extract_cell_outputs(nb, cell_times)
total_time = time.time() - notebook_start
pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s): {str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}")

total_time = time.time() - notebook_start
print(f"=== Completed {notebook_path.name} in {total_time:.2f}s ===")