From 36d658f24c1c6125eb5d20bbf3617a80083bd102 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Mon, 8 Dec 2025 15:52:44 -0500 Subject: [PATCH 01/15] CI: Add debugging to the notebooks using nsightful ncu that are failing for mysterious reasons in CI. --- ...40__kernel_authoring__copy__SOLUTION.ipynb | 20 +++++++++++++++++-- ..._authoring__book_histogram__SOLUTION.ipynb | 20 +++++++++++++++++-- 2 files changed, 36 insertions(+), 4 deletions(-) diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb index 4c16f24f..143d5cdc 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb @@ -163,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -299,6 +299,14 @@ "source": [ "import nsightful\n", "\n", + "# DEBUG: Print CSV info for CI debugging\n", + "print(f\"DEBUG: copy_blocked_csv type: {type(copy_blocked_csv)}\")\n", + "print(f\"DEBUG: copy_blocked_csv length: {len(copy_blocked_csv)}\")\n", + "if len(copy_blocked_csv) > 0:\n", + " print(f\"DEBUG: First 10 lines of copy_blocked_csv:\")\n", + " for i, line in enumerate(copy_blocked_csv[:10]):\n", + " print(f\" {i}: {repr(line)}\")\n", + "\n", "nsightful.display_ncu_csv_in_notebook(copy_blocked_csv)" ] }, @@ -473,7 +481,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -607,6 +615,14 @@ } ], "source": [ + "# DEBUG: Print CSV info for CI debugging\n", + "print(f\"DEBUG: copy_optimized_csv type: {type(copy_optimized_csv)}\")\n", + "print(f\"DEBUG: copy_optimized_csv length: {len(copy_optimized_csv)}\")\n", + "if len(copy_optimized_csv) > 0:\n", + " print(f\"DEBUG: First 10 lines of copy_optimized_csv:\")\n", + " for i, line in enumerate(copy_optimized_csv[:10]):\n", + " print(f\" {i}: {repr(line)}\")\n", + "\n", "nsightful.display_ncu_csv_in_notebook(copy_optimized_csv)" ] } diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb index 733d392e..838b55e8 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb @@ -281,7 +281,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "id": "ad12380e-253b-4410-ab34-9479411fdf81", "metadata": { "colab": { @@ -418,6 +418,14 @@ } ], "source": [ + "# DEBUG: Print CSV info for CI debugging\n", + "print(f\"DEBUG: histogram_global_csv type: {type(histogram_global_csv)}\")\n", + "print(f\"DEBUG: histogram_global_csv length: {len(histogram_global_csv)}\")\n", + "if len(histogram_global_csv) > 0:\n", + " print(f\"DEBUG: First 10 lines of histogram_global_csv:\")\n", + " for i, line in enumerate(histogram_global_csv[:10]):\n", + " print(f\" {i}: {repr(line)}\")\n", + "\n", "nsightful.display_ncu_csv_in_notebook(histogram_global_csv)" ] }, @@ -635,7 +643,7 @@ }, { "cell_type": "code", - 
"execution_count": 13, + "execution_count": null, "id": "114e8ff7-b6fb-42ad-abda-f6d53479c052", "metadata": { "colab": { @@ -772,6 +780,14 @@ } ], "source": [ + "# DEBUG: Print CSV info for CI debugging\n", + "print(f\"DEBUG: histogram_localized_csv type: {type(histogram_localized_csv)}\")\n", + "print(f\"DEBUG: histogram_localized_csv length: {len(histogram_localized_csv)}\")\n", + "if len(histogram_localized_csv) > 0:\n", + " print(f\"DEBUG: First 10 lines of histogram_localized_csv:\")\n", + " for i, line in enumerate(histogram_localized_csv[:10]):\n", + " print(f\" {i}: {repr(line)}\")\n", + "\n", "nsightful.display_ncu_csv_in_notebook(histogram_localized_csv)" ] }, From 8edf00e386f44268e709c9f62edd93d48ff000a8 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Mon, 8 Dec 2025 16:32:09 -0500 Subject: [PATCH 02/15] CI: Print out ncu command output. --- ...40__kernel_authoring__copy__SOLUTION.ipynb | 32 ++++++++++++++++--- ..._authoring__book_histogram__SOLUTION.ipynb | 32 ++++++++++++++++--- 2 files changed, 56 insertions(+), 8 deletions(-) diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb index 143d5cdc..84a8db90 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -146,7 +146,19 @@ } ], "source": [ - "!ncu -f --kernel-name regex:copy_blocked --set full -o copy_blocked python copy_blocked.py\n", + "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", + "ncu_profile_output = !ncu -f --kernel-name regex:copy_blocked --set full -o copy_blocked python copy_blocked.py\n", + "print(\"DEBUG: NCU profiling output:\")\n", + "for line in ncu_profile_output:\n", + " print(f\" {line}\")\n", + "\n", + "# Check if report file was created\n", + "import os\n", + "report_exists = os.path.exists(\"copy_blocked.ncu-rep\")\n", + "print(f\"DEBUG: copy_blocked.ncu-rep exists: {report_exists}\")\n", + "if report_exists:\n", + " print(f\"DEBUG: copy_blocked.ncu-rep size: {os.path.getsize('copy_blocked.ncu-rep')} bytes\")\n", + "\n", "copy_blocked_csv = !ncu --import copy_blocked.ncu-rep --csv" ] }, @@ -445,7 +457,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -466,7 +478,19 @@ } ], "source": [ - "!ncu -f --kernel-name regex:copy_optimized --set full -o copy_optimized python copy_optimized.py\n", + "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", + "ncu_profile_output = !ncu -f --kernel-name regex:copy_optimized --set full -o copy_optimized python copy_optimized.py\n", + "print(\"DEBUG: NCU profiling output:\")\n", + "for line in ncu_profile_output:\n", + " print(f\" {line}\")\n", + "\n", + "# Check if report file was created\n", + "import os\n", + "report_exists = os.path.exists(\"copy_optimized.ncu-rep\")\n", + "print(f\"DEBUG: copy_optimized.ncu-rep exists: {report_exists}\")\n", + "if report_exists:\n", + " print(f\"DEBUG: copy_optimized.ncu-rep size: {os.path.getsize('copy_optimized.ncu-rep')} bytes\")\n", + "\n", "copy_optimized_csv = !ncu --import copy_optimized.ncu-rep 
--csv" ] }, diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb index 838b55e8..e16a0f14 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb @@ -253,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "id": "8dbd226c-66f2-43df-868a-6b024b1de24c", "metadata": { "colab": { @@ -275,7 +275,19 @@ } ], "source": [ - "!ncu -f --kernel-name regex:histogram_global --set full -o histogram_global python histogram_global.py\n", + "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", + "ncu_profile_output = !ncu -f --kernel-name regex:histogram_global --set full -o histogram_global python histogram_global.py\n", + "print(\"DEBUG: NCU profiling output:\")\n", + "for line in ncu_profile_output:\n", + " print(f\" {line}\")\n", + "\n", + "# Check if report file was created\n", + "import os\n", + "report_exists = os.path.exists(\"histogram_global.ncu-rep\")\n", + "print(f\"DEBUG: histogram_global.ncu-rep exists: {report_exists}\")\n", + "if report_exists:\n", + " print(f\"DEBUG: histogram_global.ncu-rep size: {os.path.getsize('histogram_global.ncu-rep')} bytes\")\n", + "\n", "histogram_global_csv = !ncu --import histogram_global.ncu-rep --csv" ] }, @@ -615,7 +627,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "id": "d637b6b1-fb0b-4807-b70b-c80227c0fd6f", "metadata": { "colab": { @@ -637,7 +649,19 @@ } ], "source": [ - "!ncu -f --kernel-name regex:histogram_localized --set full -o histogram_localized python histogram_localized.py\n", + "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", + "ncu_profile_output = !ncu -f --kernel-name regex:histogram_localized --set full -o histogram_localized python histogram_localized.py\n", + "print(\"DEBUG: NCU profiling output:\")\n", + "for line in ncu_profile_output:\n", + " print(f\" {line}\")\n", + "\n", + "# Check if report file was created\n", + "import os\n", + "report_exists = os.path.exists(\"histogram_localized.ncu-rep\")\n", + "print(f\"DEBUG: histogram_localized.ncu-rep exists: {report_exists}\")\n", + "if report_exists:\n", + " print(f\"DEBUG: histogram_localized.ncu-rep size: {os.path.getsize('histogram_localized.ncu-rep')} bytes\")\n", + "\n", "histogram_localized_csv = !ncu --import histogram_localized.ncu-rep --csv" ] }, From bffb081b5ded4be335b345ff4b03a3913e1901d9 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Mon, 8 Dec 2025 23:17:32 -0500 Subject: [PATCH 03/15] CI: Print out all cell outputs if a notebook fails. 
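The new extract_cell_outputs helper walks every executed code cell and
collects its stream and error output. For illustration, a minimal sketch of
what the helper reports, built around a hypothetical cell constructed with
nbformat (already a dependency of the test harness); the sample source and
output below are made up:

    import nbformat.v4 as nbf

    nb = nbf.new_notebook()
    cell = nbf.new_code_cell("print('hi')")
    # Attach a fake stream output, as if the cell had been executed.
    cell.outputs = [nbf.new_output("stream", name="stdout", text="hi\n")]
    nb.cells.append(cell)

    print(extract_cell_outputs(nb))
    # --- Cell 0: print('hi')... ---
    # [stdout] hi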
--- .../accelerated-python/test/test_notebooks.py | 30 +++++++++++++++++-- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/tutorials/accelerated-python/test/test_notebooks.py b/tutorials/accelerated-python/test/test_notebooks.py index 9ef7b510..7ea9f26a 100644 --- a/tutorials/accelerated-python/test/test_notebooks.py +++ b/tutorials/accelerated-python/test/test_notebooks.py @@ -18,6 +18,28 @@ notebook_ids = [nb.relative_to(NOTEBOOKS_DIR).as_posix() for nb in solution_notebooks] +def extract_cell_outputs(nb): + """Extract stdout/stderr from all executed cells for debugging.""" + outputs = [] + for i, cell in enumerate(nb.cells): + if cell.cell_type != 'code': + continue + cell_outputs = [] + for output in cell.get('outputs', []): + if output.get('output_type') == 'stream': + stream_name = output.get('name', 'stdout') + text = output.get('text', '') + cell_outputs.append(f"[{stream_name}] {text}") + elif output.get('output_type') == 'error': + ename = output.get('ename', 'Error') + evalue = output.get('evalue', '') + cell_outputs.append(f"[error] {ename}: {evalue}") + if cell_outputs: + source_preview = cell.source[:100].replace('\n', ' ') + outputs.append(f"--- Cell {i}: {source_preview}... ---\n" + ''.join(cell_outputs)) + return '\n'.join(outputs) + + @pytest.mark.parametrize('notebook_path', solution_notebooks, ids=notebook_ids) def test_solution_notebook_executes(notebook_path): """ @@ -49,8 +71,10 @@ def test_solution_notebook_executes(notebook_path): client.execute() except CellExecutionError as e: # Provide detailed error information - # CellExecutionError stores the error message in str(e) - pytest.fail(f"Notebook execution failed:\n{str(e)}") + # Include output from ALL cells, not just the failing one + all_outputs = extract_cell_outputs(nb) + pytest.fail(f"Notebook execution failed:\n{str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") except Exception as e: # Catch any other execution errors - pytest.fail(f"Notebook execution failed: {str(e)}") + all_outputs = extract_cell_outputs(nb) + pytest.fail(f"Notebook execution failed: {str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") From aaf7114506f521f651f709089d6ef914f690fd35 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Mon, 8 Dec 2025 23:32:32 -0500 Subject: [PATCH 04/15] CI: Try to fix Docker caching and add caching diagnostics. --- .../workflows/build-brev-tutorial-docker-images.yml | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-brev-tutorial-docker-images.yml b/.github/workflows/build-brev-tutorial-docker-images.yml index 9686b734..1d24fb07 100644 --- a/.github/workflows/build-brev-tutorial-docker-images.yml +++ b/.github/workflows/build-brev-tutorial-docker-images.yml @@ -181,15 +181,22 @@ jobs: command: | cd ${{ matrix.tutorial }}/brev + # Debug: Show buildx version and check for cache images + docker buildx version + echo "Checking for existing cache images..." 
+          docker manifest inspect "${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" > /dev/null 2>&1 && echo "✓ Branch cache exists" || echo "✗ Branch cache not found"
+          docker manifest inspect "${IMAGE_NAME}:buildcache-main" > /dev/null 2>&1 && echo "✓ Main cache exists" || echo "✗ Main cache not found"
+
           docker buildx bake \
+            --progress=plain \
             --allow=fs.read=/home/runner \
             --set "base.output=type=registry" \
             --set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-latest" \
             --set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-git-${GIT_SHORT_SHA}" \
             $([ "${GIT_BRANCH_NAME}" = "main" ] && echo "--set base.tags=${IMAGE_NAME}:latest") \
-            --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},oci-mediatypes=true" \
-            --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main,oci-mediatypes=true" \
-            --set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,oci-mediatypes=true,compression=zstd,compression-level=3" \
+            --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" \
+            --set "base.cache-from=type=registry,ref=${IMAGE_NAME}:buildcache-main" \
+            --set "base.cache-to=type=registry,ref=${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH},mode=max,image-manifest=true,compression=zstd,compression-level=3" \
             --set "base.platform=linux/amd64" \
             -f docker-compose.yml \
             base

From 23e6483316ba96b73cbed755cdb560dcc79e6218 Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach aka wash
Date: Mon, 8 Dec 2025 23:47:48 -0500
Subject: [PATCH 05/15] CI: DCGM strikes again; it appears to be running on
 the nv-gha-runners.

---
 .../workflows/test-brev-tutorial-docker-images.yml | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml
index a4f0c86a..750317ff 100644
--- a/.github/workflows/test-brev-tutorial-docker-images.yml
+++ b/.github/workflows/test-brev-tutorial-docker-images.yml
@@ -63,6 +63,20 @@ jobs:
           username: ${{ github.actor }}
           password: ${{ secrets.GITHUB_TOKEN }}

+      - name: Stop DCGM to allow NCU profiling
+        run: |
+          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
+          # Stop it before running the container tests.
+          echo "Stopping DCGM services..."
+          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
+          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
+          # Also try nv-hostengine which DCGM uses
+          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"
+          # Kill any remaining dcgm processes
+          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
+          sudo pkill -9 dcgm || echo "No dcgm processes found"
+          echo "DCGM services stopped."
+
       - name: Test Docker Compose
         id: test
         run: |

From a69ae0974bed3b43af5fee72e86a176e44148167 Mon Sep 17 00:00:00 2001
From: Bryce Adelstein Lelbach aka wash
Date: Tue, 9 Dec 2025 11:01:05 -0500
Subject: [PATCH 06/15] CI: Clean up CI logs by disabling Docker progress
 output, and add more debugging output to get to the bottom of why NCU can't
 run on the NV GHA T4 runners.
--- .../build-brev-tutorial-docker-images.yml | 1 + .github/workflows/mirror-base-images.yml | 8 +- .../test-brev-tutorial-docker-images.yml | 107 ++++++++++++++++++ brev/test-docker-compose.bash | 2 +- 4 files changed, 114 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build-brev-tutorial-docker-images.yml b/.github/workflows/build-brev-tutorial-docker-images.yml index 1d24fb07..ff462f80 100644 --- a/.github/workflows/build-brev-tutorial-docker-images.yml +++ b/.github/workflows/build-brev-tutorial-docker-images.yml @@ -174,6 +174,7 @@ jobs: uses: nick-fields/retry@v3 env: DOCKER_BUILDKIT: 1 + BUILDKIT_PROGRESS: plain with: timeout_minutes: 60 max_attempts: 3 diff --git a/.github/workflows/mirror-base-images.yml b/.github/workflows/mirror-base-images.yml index 5bde54cf..2e127b4b 100644 --- a/.github/workflows/mirror-base-images.yml +++ b/.github/workflows/mirror-base-images.yml @@ -59,9 +59,11 @@ jobs: - name: Pull source image from Docker Hub if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' + env: + BUILDKIT_PROGRESS: plain run: | echo "Pulling ${{ matrix.image.source }} from Docker Hub..." - docker pull ${{ matrix.image.source }} + docker pull --quiet ${{ matrix.image.source }} - name: Tag image for GHCR if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' @@ -72,7 +74,7 @@ jobs: if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' run: | echo "Pushing ${TARGET_IMAGE} to GHCR..." - docker push ${TARGET_IMAGE} + docker push --quiet ${TARGET_IMAGE} echo "✓ Successfully mirrored ${{ matrix.image.source }}" - name: Skipped (image exists) @@ -84,6 +86,6 @@ jobs: - name: Verify mirrored image if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' run: | - docker pull ${TARGET_IMAGE} + docker pull --quiet ${TARGET_IMAGE} docker images ${TARGET_IMAGE} echo "✓ Image verified successfully" diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 750317ff..3867086c 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -25,6 +25,9 @@ jobs: working-directory: ${{ github.workspace }} permissions: statuses: write + env: + BUILDKIT_PROGRESS: plain + DOCKER_CLI_HINTS: false steps: - name: Show runner info run: | @@ -77,6 +80,110 @@ jobs: sudo pkill -9 dcgm || echo "No dcgm processes found" echo "DCGM services stopped." 
+ - name: Debug GPU and NCU configuration + run: | + echo "=== GPU Information ===" + nvidia-smi || echo "nvidia-smi failed" + echo "" + + echo "=== NVIDIA Driver Version ===" + cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version" + echo "" + + echo "=== GPU Processes ===" + nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed" + echo "" + + echo "=== DCGM/NCU Blocking Processes ===" + ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found" + echo "" + + echo "=== Systemd Services (nvidia/dcgm related) ===" + systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services" + echo "" + + echo "=== NCU on Host ===" + which ncu 2>/dev/null && ncu --version 2>/dev/null || echo "NCU not found on host" + echo "" + + echo "=== Profiling Permissions ===" + cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid" + cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict" + echo "" + + echo "=== NVIDIA Kernel Modules ===" + lsmod | grep nvidia || echo "No nvidia modules loaded" + echo "" + + echo "=== /dev/nvidia* devices ===" + ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found" + echo "" + + echo "=== Docker GPU Access Test ===" + docker run --rm --gpus all nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi 2>&1 || echo "Docker GPU test failed" + echo "" + + - name: Test NCU inside container + run: | + echo "=== Testing NCU profiling inside container ===" + # Pull the tutorial image and test NCU directly + COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml" + + # Extract the image name from the compose file + IMAGE=$(grep -A5 "base:" "$COMPOSE_FILE" | grep "image:" | head -1 | awk '{print $2}') + echo "Testing with image: $IMAGE" + + # Create a simple test script + cat > /tmp/test_ncu.py << 'EOF' + from numba import cuda + import numpy as np + + @cuda.jit + def simple_add(a, b, c): + i = cuda.grid(1) + if i < a.size: + c[i] = a[i] + b[i] + + n = 1024 + a = np.ones(n, dtype=np.float32) + b = np.ones(n, dtype=np.float32) + c = np.zeros(n, dtype=np.float32) + + d_a = cuda.to_device(a) + d_b = cuda.to_device(b) + d_c = cuda.to_device(c) + + simple_add[4, 256](d_a, d_b, d_c) + cuda.synchronize() + print("Kernel executed successfully") + EOF + + # Test NCU inside the container + echo "--- Running NCU profiling test ---" + docker run --rm --gpus all \ + -v /tmp/test_ncu.py:/test_ncu.py:ro \ + "$IMAGE" \ + bash -c ' + echo "NCU version:" + ncu --version || echo "NCU not found" + echo "" + echo "Running simple kernel without profiling:" + python /test_ncu.py + echo "" + echo "Running NCU profiling:" + ncu --set full -o /tmp/test_profile python /test_ncu.py 2>&1 + NCU_EXIT=$? + echo "NCU exit code: $NCU_EXIT" + echo "" + echo "Checking if profile was created:" + ls -la /tmp/test_profile.ncu-rep 2>&1 || echo "Profile file not created" + echo "" + if [ -f /tmp/test_profile.ncu-rep ]; then + echo "Profile file exists, testing CSV export:" + ncu --import /tmp/test_profile.ncu-rep --csv 2>&1 | head -20 + fi + ' 2>&1 || echo "Container NCU test failed" + - name: Test Docker Compose id: test run: | diff --git a/brev/test-docker-compose.bash b/brev/test-docker-compose.bash index 16417f1a..3a7fd34d 100755 --- a/brev/test-docker-compose.bash +++ b/brev/test-docker-compose.bash @@ -115,7 +115,7 @@ export ACH_RUN_TESTS=1 # Start container echo "📦 Starting containers..." 
echo "" -if docker compose -f "${COMPOSE_FILE}" up -d; then +if docker compose -f "${COMPOSE_FILE}" up -d --quiet-pull; then echo "" echo -e "${GREEN}✅ Containers started successfully${NC}" echo "" From 10d56056276d7111da6313ec3934cdbb17d7b100 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 11:17:06 -0500 Subject: [PATCH 07/15] CI: Use the buildkitd caching config setup on NVIDIA runners. --- .github/workflows/build-brev-tutorial-docker-images.yml | 8 +++++--- .github/workflows/mirror-base-images.yml | 2 +- .github/workflows/test-brev-tutorial-docker-images.yml | 2 +- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build-brev-tutorial-docker-images.yml b/.github/workflows/build-brev-tutorial-docker-images.yml index ff462f80..441688c1 100644 --- a/.github/workflows/build-brev-tutorial-docker-images.yml +++ b/.github/workflows/build-brev-tutorial-docker-images.yml @@ -162,6 +162,8 @@ jobs: - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 + with: + buildkitd-config: /etc/buildkit/buildkitd.toml - name: Log in to GitHub Container Registry uses: docker/login-action@v3 @@ -174,7 +176,7 @@ jobs: uses: nick-fields/retry@v3 env: DOCKER_BUILDKIT: 1 - BUILDKIT_PROGRESS: plain + BUILDKIT_PROGRESS: quiet with: timeout_minutes: 60 max_attempts: 3 @@ -182,14 +184,14 @@ jobs: command: | cd ${{ matrix.tutorial }}/brev - # Debug: Show buildx version and check for cache images + # Show buildx version and check for cache images docker buildx version echo "Checking for existing cache images..." docker manifest inspect "${IMAGE_NAME}:buildcache-${DOCKER_TAG_BRANCH}" > /dev/null 2>&1 && echo "✓ Branch cache exists" || echo "✗ Branch cache not found" docker manifest inspect "${IMAGE_NAME}:buildcache-main" > /dev/null 2>&1 && echo "✓ Main cache exists" || echo "✗ Main cache not found" docker buildx bake \ - --progress=plain \ + --progress=quiet \ --allow=fs.read=/home/runner \ --set "base.output=type=registry" \ --set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-latest" \ diff --git a/.github/workflows/mirror-base-images.yml b/.github/workflows/mirror-base-images.yml index 2e127b4b..ec3ce8d7 100644 --- a/.github/workflows/mirror-base-images.yml +++ b/.github/workflows/mirror-base-images.yml @@ -60,7 +60,7 @@ jobs: - name: Pull source image from Docker Hub if: steps.check_image.outputs.exists != 'true' || github.event.inputs.force_pull == 'true' env: - BUILDKIT_PROGRESS: plain + BUILDKIT_PROGRESS: quiet run: | echo "Pulling ${{ matrix.image.source }} from Docker Hub..." docker pull --quiet ${{ matrix.image.source }} diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 3867086c..01685492 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -26,7 +26,7 @@ jobs: permissions: statuses: write env: - BUILDKIT_PROGRESS: plain + BUILDKIT_PROGRESS: quiet DOCKER_CLI_HINTS: false steps: - name: Show runner info From b3a8c1eea540e17f01f7a5bc28d34d0565118ee3 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 11:20:26 -0500 Subject: [PATCH 08/15] CI: Fix an issue in the NCU test image extraction. 
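The compose file declares the image with a YAML anchor, as in
"image: &image ghcr.io/...", so the old awk pipeline captured the literal
"&image" token rather than the registry URL. As a design note, here is a
sketch of an alternative that avoids the text munging by letting a YAML
parser resolve the anchor; it assumes PyYAML is available on the runner and
that the compose file defines a services.base.image key:

    import yaml  # assumption: PyYAML is installed on the runner

    with open("docker-compose.yml") as f:
        compose = yaml.safe_load(f)

    # safe_load resolves the &image anchor, so this prints the actual
    # registry URL rather than the anchor reference.
    print(compose["services"]["base"]["image"])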
--- .github/workflows/test-brev-tutorial-docker-images.yml | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 01685492..06955878 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -130,7 +130,13 @@ jobs: COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml" # Extract the image name from the compose file - IMAGE=$(grep -A5 "base:" "$COMPOSE_FILE" | grep "image:" | head -1 | awk '{print $2}') + # The image is defined with a YAML anchor like: image: &image ghcr.io/... + # We need to extract the actual URL, not the anchor reference + IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //') + if [ -z "$IMAGE" ]; then + # Fallback: try to find any ghcr.io image reference + IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1) + fi echo "Testing with image: $IMAGE" # Create a simple test script From 31d5a43c9d82a82ea263e9660bb51135c27ec388 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 11:37:47 -0500 Subject: [PATCH 09/15] CI: Go back to plain Docker progress because quiet shows nothing. --- .github/workflows/build-brev-tutorial-docker-images.yml | 4 ++-- .github/workflows/test-brev-tutorial-docker-images.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-brev-tutorial-docker-images.yml b/.github/workflows/build-brev-tutorial-docker-images.yml index 441688c1..274cc059 100644 --- a/.github/workflows/build-brev-tutorial-docker-images.yml +++ b/.github/workflows/build-brev-tutorial-docker-images.yml @@ -176,7 +176,7 @@ jobs: uses: nick-fields/retry@v3 env: DOCKER_BUILDKIT: 1 - BUILDKIT_PROGRESS: quiet + BUILDKIT_PROGRESS: plain with: timeout_minutes: 60 max_attempts: 3 @@ -191,7 +191,7 @@ jobs: docker manifest inspect "${IMAGE_NAME}:buildcache-main" > /dev/null 2>&1 && echo "✓ Main cache exists" || echo "✗ Main cache not found" docker buildx bake \ - --progress=quiet \ + --progress=plain \ --allow=fs.read=/home/runner \ --set "base.output=type=registry" \ --set "base.tags=${IMAGE_NAME}:${DOCKER_TAG_BRANCH}-latest" \ diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 06955878..406f1a73 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -26,7 +26,7 @@ jobs: permissions: statuses: write env: - BUILDKIT_PROGRESS: quiet + BUILDKIT_PROGRESS: plain DOCKER_CLI_HINTS: false steps: - name: Show runner info From cd21d3d194d0d7f93b6385c36f7f7f54e54195e4 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 11:49:53 -0500 Subject: [PATCH 10/15] CI: Properly kill DCGM and fix permissions on NV GHA runners. --- .../test-brev-tutorial-docker-images.yml | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 406f1a73..8aabeda0 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -71,14 +71,28 @@ jobs: # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling. # Stop it before running the container tests. 
echo "Stopping DCGM services..." + + # Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners) + echo "Stopping dcgm-exporter Docker container..." + docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running" + docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed" + + # Stop systemd services + sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped" sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped" sudo systemctl stop dcgm || echo "dcgm service not found or already stopped" - # Also try nv-hostengine which DCGM uses sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped" + # Kill any remaining dcgm processes sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found" sudo pkill -9 dcgm || echo "No dcgm processes found" - echo "DCGM services stopped." + + # Relax profiling permissions (perf_event_paranoid=4 is very restrictive) + echo "Relaxing profiling permissions..." + sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid" + sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict" + + echo "DCGM services stopped and profiling permissions relaxed." - name: Debug GPU and NCU configuration run: | From c91bcf2ceaf4c64ed21aa761138e27bb7b7c8cd4 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 12:27:32 -0500 Subject: [PATCH 11/15] CI: Disable NCU sanity test that is improperly implemented. --- .../test-brev-tutorial-docker-images.yml | 77 +------------------ 1 file changed, 1 insertion(+), 76 deletions(-) diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 8aabeda0..8da78a86 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -72,7 +72,7 @@ jobs: # Stop it before running the container tests. echo "Stopping DCGM services..." - # Stop the dcgm-exporter Docker container (this is how it runs on GitHub runners) + # Stop the dcgm-exporter Docker container echo "Stopping dcgm-exporter Docker container..." 
docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running" docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed" @@ -116,10 +116,6 @@ jobs: systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services" echo "" - echo "=== NCU on Host ===" - which ncu 2>/dev/null && ncu --version 2>/dev/null || echo "NCU not found on host" - echo "" - echo "=== Profiling Permissions ===" cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid" cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict" @@ -133,77 +129,6 @@ jobs: ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found" echo "" - echo "=== Docker GPU Access Test ===" - docker run --rm --gpus all nvidia/cuda:12.8.0-base-ubuntu24.04 nvidia-smi 2>&1 || echo "Docker GPU test failed" - echo "" - - - name: Test NCU inside container - run: | - echo "=== Testing NCU profiling inside container ===" - # Pull the tutorial image and test NCU directly - COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml" - - # Extract the image name from the compose file - # The image is defined with a YAML anchor like: image: &image ghcr.io/... - # We need to extract the actual URL, not the anchor reference - IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //') - if [ -z "$IMAGE" ]; then - # Fallback: try to find any ghcr.io image reference - IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1) - fi - echo "Testing with image: $IMAGE" - - # Create a simple test script - cat > /tmp/test_ncu.py << 'EOF' - from numba import cuda - import numpy as np - - @cuda.jit - def simple_add(a, b, c): - i = cuda.grid(1) - if i < a.size: - c[i] = a[i] + b[i] - - n = 1024 - a = np.ones(n, dtype=np.float32) - b = np.ones(n, dtype=np.float32) - c = np.zeros(n, dtype=np.float32) - - d_a = cuda.to_device(a) - d_b = cuda.to_device(b) - d_c = cuda.to_device(c) - - simple_add[4, 256](d_a, d_b, d_c) - cuda.synchronize() - print("Kernel executed successfully") - EOF - - # Test NCU inside the container - echo "--- Running NCU profiling test ---" - docker run --rm --gpus all \ - -v /tmp/test_ncu.py:/test_ncu.py:ro \ - "$IMAGE" \ - bash -c ' - echo "NCU version:" - ncu --version || echo "NCU not found" - echo "" - echo "Running simple kernel without profiling:" - python /test_ncu.py - echo "" - echo "Running NCU profiling:" - ncu --set full -o /tmp/test_profile python /test_ncu.py 2>&1 - NCU_EXIT=$? - echo "NCU exit code: $NCU_EXIT" - echo "" - echo "Checking if profile was created:" - ls -la /tmp/test_profile.ncu-rep 2>&1 || echo "Profile file not created" - echo "" - if [ -f /tmp/test_profile.ncu-rep ]; then - echo "Profile file exists, testing CSV export:" - ncu --import /tmp/test_profile.ncu-rep --csv 2>&1 | head -20 - fi - ' 2>&1 || echo "Container NCU test failed" - - name: Test Docker Compose id: test run: | From b19a0b92e3bbdb13cd0842c1a40173b972c80481 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 13:51:02 -0500 Subject: [PATCH 12/15] CI: Remove debugging output from notebooks. 
--- ...40__kernel_authoring__copy__SOLUTION.ipynb | 52 +++---------------- ..._authoring__book_histogram__SOLUTION.ipynb | 52 +++---------------- 2 files changed, 12 insertions(+), 92 deletions(-) diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb index 84a8db90..4c16f24f 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/40__kernel_authoring__copy__SOLUTION.ipynb @@ -125,7 +125,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -146,19 +146,7 @@ } ], "source": [ - "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", - "ncu_profile_output = !ncu -f --kernel-name regex:copy_blocked --set full -o copy_blocked python copy_blocked.py\n", - "print(\"DEBUG: NCU profiling output:\")\n", - "for line in ncu_profile_output:\n", - " print(f\" {line}\")\n", - "\n", - "# Check if report file was created\n", - "import os\n", - "report_exists = os.path.exists(\"copy_blocked.ncu-rep\")\n", - "print(f\"DEBUG: copy_blocked.ncu-rep exists: {report_exists}\")\n", - "if report_exists:\n", - " print(f\"DEBUG: copy_blocked.ncu-rep size: {os.path.getsize('copy_blocked.ncu-rep')} bytes\")\n", - "\n", + "!ncu -f --kernel-name regex:copy_blocked --set full -o copy_blocked python copy_blocked.py\n", "copy_blocked_csv = !ncu --import copy_blocked.ncu-rep --csv" ] }, @@ -175,7 +163,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -311,14 +299,6 @@ "source": [ "import nsightful\n", "\n", - "# DEBUG: Print CSV info for CI debugging\n", - "print(f\"DEBUG: copy_blocked_csv type: {type(copy_blocked_csv)}\")\n", - "print(f\"DEBUG: copy_blocked_csv length: {len(copy_blocked_csv)}\")\n", - "if len(copy_blocked_csv) > 0:\n", - " print(f\"DEBUG: First 10 lines of copy_blocked_csv:\")\n", - " for i, line in enumerate(copy_blocked_csv[:10]):\n", - " print(f\" {i}: {repr(line)}\")\n", - "\n", "nsightful.display_ncu_csv_in_notebook(copy_blocked_csv)" ] }, @@ -457,7 +437,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "metadata": { "colab": { "base_uri": "https://localhost:8080/" @@ -478,19 +458,7 @@ } ], "source": [ - "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", - "ncu_profile_output = !ncu -f --kernel-name regex:copy_optimized --set full -o copy_optimized python copy_optimized.py\n", - "print(\"DEBUG: NCU profiling output:\")\n", - "for line in ncu_profile_output:\n", - " print(f\" {line}\")\n", - "\n", - "# Check if report file was created\n", - "import os\n", - "report_exists = os.path.exists(\"copy_optimized.ncu-rep\")\n", - "print(f\"DEBUG: copy_optimized.ncu-rep exists: {report_exists}\")\n", - "if report_exists:\n", - " print(f\"DEBUG: copy_optimized.ncu-rep size: {os.path.getsize('copy_optimized.ncu-rep')} bytes\")\n", - "\n", + "!ncu -f --kernel-name regex:copy_optimized --set full -o copy_optimized python copy_optimized.py\n", "copy_optimized_csv = !ncu --import copy_optimized.ncu-rep --csv" ] }, @@ -505,7 +473,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": { "colab": { "base_uri": "https://localhost:8080/", @@ -639,14 +607,6 @@ } 
], "source": [ - "# DEBUG: Print CSV info for CI debugging\n", - "print(f\"DEBUG: copy_optimized_csv type: {type(copy_optimized_csv)}\")\n", - "print(f\"DEBUG: copy_optimized_csv length: {len(copy_optimized_csv)}\")\n", - "if len(copy_optimized_csv) > 0:\n", - " print(f\"DEBUG: First 10 lines of copy_optimized_csv:\")\n", - " for i, line in enumerate(copy_optimized_csv[:10]):\n", - " print(f\" {i}: {repr(line)}\")\n", - "\n", "nsightful.display_ncu_csv_in_notebook(copy_optimized_csv)" ] } diff --git a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb index e16a0f14..733d392e 100644 --- a/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb +++ b/tutorials/accelerated-python/notebooks/kernels/solutions/41__kernel_authoring__book_histogram__SOLUTION.ipynb @@ -253,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 7, "id": "8dbd226c-66f2-43df-868a-6b024b1de24c", "metadata": { "colab": { @@ -275,25 +275,13 @@ } ], "source": [ - "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", - "ncu_profile_output = !ncu -f --kernel-name regex:histogram_global --set full -o histogram_global python histogram_global.py\n", - "print(\"DEBUG: NCU profiling output:\")\n", - "for line in ncu_profile_output:\n", - " print(f\" {line}\")\n", - "\n", - "# Check if report file was created\n", - "import os\n", - "report_exists = os.path.exists(\"histogram_global.ncu-rep\")\n", - "print(f\"DEBUG: histogram_global.ncu-rep exists: {report_exists}\")\n", - "if report_exists:\n", - " print(f\"DEBUG: histogram_global.ncu-rep size: {os.path.getsize('histogram_global.ncu-rep')} bytes\")\n", - "\n", + "!ncu -f --kernel-name regex:histogram_global --set full -o histogram_global python histogram_global.py\n", "histogram_global_csv = !ncu --import histogram_global.ncu-rep --csv" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "ad12380e-253b-4410-ab34-9479411fdf81", "metadata": { "colab": { @@ -430,14 +418,6 @@ } ], "source": [ - "# DEBUG: Print CSV info for CI debugging\n", - "print(f\"DEBUG: histogram_global_csv type: {type(histogram_global_csv)}\")\n", - "print(f\"DEBUG: histogram_global_csv length: {len(histogram_global_csv)}\")\n", - "if len(histogram_global_csv) > 0:\n", - " print(f\"DEBUG: First 10 lines of histogram_global_csv:\")\n", - " for i, line in enumerate(histogram_global_csv[:10]):\n", - " print(f\" {i}: {repr(line)}\")\n", - "\n", "nsightful.display_ncu_csv_in_notebook(histogram_global_csv)" ] }, @@ -627,7 +607,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "id": "d637b6b1-fb0b-4807-b70b-c80227c0fd6f", "metadata": { "colab": { @@ -649,25 +629,13 @@ } ], "source": [ - "# DEBUG: Capture NCU profiling output to diagnose CI failures\n", - "ncu_profile_output = !ncu -f --kernel-name regex:histogram_localized --set full -o histogram_localized python histogram_localized.py\n", - "print(\"DEBUG: NCU profiling output:\")\n", - "for line in ncu_profile_output:\n", - " print(f\" {line}\")\n", - "\n", - "# Check if report file was created\n", - "import os\n", - "report_exists = os.path.exists(\"histogram_localized.ncu-rep\")\n", - "print(f\"DEBUG: histogram_localized.ncu-rep exists: {report_exists}\")\n", - "if report_exists:\n", - " print(f\"DEBUG: histogram_localized.ncu-rep 
size: {os.path.getsize('histogram_localized.ncu-rep')} bytes\")\n", - "\n", + "!ncu -f --kernel-name regex:histogram_localized --set full -o histogram_localized python histogram_localized.py\n", "histogram_localized_csv = !ncu --import histogram_localized.ncu-rep --csv" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "id": "114e8ff7-b6fb-42ad-abda-f6d53479c052", "metadata": { "colab": { @@ -804,14 +772,6 @@ } ], "source": [ - "# DEBUG: Print CSV info for CI debugging\n", - "print(f\"DEBUG: histogram_localized_csv type: {type(histogram_localized_csv)}\")\n", - "print(f\"DEBUG: histogram_localized_csv length: {len(histogram_localized_csv)}\")\n", - "if len(histogram_localized_csv) > 0:\n", - " print(f\"DEBUG: First 10 lines of histogram_localized_csv:\")\n", - " for i, line in enumerate(histogram_localized_csv[:10]):\n", - " print(f\" {i}: {repr(line)}\")\n", - "\n", "nsightful.display_ncu_csv_in_notebook(histogram_localized_csv)" ] }, From 5b280499e3295563568b8ea20405ceb37e6d5761 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 15:17:43 -0500 Subject: [PATCH 13/15] CI: Switch to L4 runners to see if that helps with timeouts and flakiness. --- .github/workflows/test-brev-tutorial-docker-images.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 8da78a86..53611d93 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -19,7 +19,7 @@ on: jobs: test-tutorial: name: test-tutorial (${{ inputs.tutorial }}) - runs-on: linux-amd64-gpu-t4-latest-1 + runs-on: linux-amd64-gpu-l4-latest-1 defaults: run: working-directory: ${{ github.workspace }} From dacfed4b6752d70a31f80d7010230d03d4094df6 Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 19:19:10 -0500 Subject: [PATCH 14/15] CI: Pre-pull docker images on test node before running `docker compose` to execute the tests. --- .../test-brev-tutorial-docker-images.yml | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/.github/workflows/test-brev-tutorial-docker-images.yml b/.github/workflows/test-brev-tutorial-docker-images.yml index 53611d93..72237b97 100644 --- a/.github/workflows/test-brev-tutorial-docker-images.yml +++ b/.github/workflows/test-brev-tutorial-docker-images.yml @@ -129,6 +129,31 @@ jobs: ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found" echo "" + - name: Pre-pull Docker images + run: | + COMPOSE_FILE="artifacts/commit-specific/${{ inputs.tutorial }}/brev/docker-compose.yml" + + # Extract all unique images from the compose file + # The main image is defined with a YAML anchor like: image: &image ghcr.io/... 
+ MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //') + if [ -z "$MAIN_IMAGE" ]; then + # Fallback: try to find any ghcr.io image reference + MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1) + fi + + # Extract the nsight image (nvcr.io) + NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1) + + echo "Pre-pulling main image: $MAIN_IMAGE" + docker pull "$MAIN_IMAGE" + + if [ -n "$NSIGHT_IMAGE" ]; then + echo "Pre-pulling nsight image: $NSIGHT_IMAGE" + docker pull "$NSIGHT_IMAGE" + fi + + echo "All images pulled successfully" + - name: Test Docker Compose id: test run: | From e58cc618f85aa861667b19e9fc23ad63c43b96bb Mon Sep 17 00:00:00 2001 From: Bryce Adelstein Lelbach aka wash Date: Tue, 9 Dec 2025 19:20:38 -0500 Subject: [PATCH 15/15] Tutorials/Accelerated Python: During testing, output timing for each cell, perform a GPU status check, and never execute .ipynb_checkpoints notebooks. --- .../accelerated-python/brev/requirements.txt | 2 +- tutorials/accelerated-python/test/pytest.ini | 2 +- .../accelerated-python/test/test_notebooks.py | 77 ++++++++++++++++--- 3 files changed, 69 insertions(+), 12 deletions(-) diff --git a/tutorials/accelerated-python/brev/requirements.txt b/tutorials/accelerated-python/brev/requirements.txt index 894481dc..e5d12e35 100644 --- a/tutorials/accelerated-python/brev/requirements.txt +++ b/tutorials/accelerated-python/brev/requirements.txt @@ -23,7 +23,7 @@ nvidia-nvshmem-cu12 == 3.3.20 nvidia-cuda-nvcc-cu12 == 12.8.* nvidia-cuda-nvrtc-cu12 == 12.8.* -# NVIDIA devtools +# NVIDIA developer tools nvtx nsightful[notebook] @ git+https://github.com/brycelelbach/nsightful.git diff --git a/tutorials/accelerated-python/test/pytest.ini b/tutorials/accelerated-python/test/pytest.ini index aa23ff1f..1dd0dacc 100644 --- a/tutorials/accelerated-python/test/pytest.ini +++ b/tutorials/accelerated-python/test/pytest.ini @@ -1,2 +1,2 @@ [pytest] -addopts = -v --durations=0 --durations-min=0.0 +addopts = -v -s --durations=0 --durations-min=0.0 diff --git a/tutorials/accelerated-python/test/test_notebooks.py b/tutorials/accelerated-python/test/test_notebooks.py index 7ea9f26a..1e60c52f 100644 --- a/tutorials/accelerated-python/test/test_notebooks.py +++ b/tutorials/accelerated-python/test/test_notebooks.py @@ -4,6 +4,7 @@ import pytest from pathlib import Path +import time import nbformat from nbclient import NotebookClient from nbclient.exceptions import CellExecutionError @@ -11,14 +12,17 @@ # Define the path to the notebooks directory NOTEBOOKS_DIR = Path(__file__).resolve().parent.parent / 'notebooks' -# Discover all solution notebooks -solution_notebooks = sorted(NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb')) +# Discover all solution notebooks (excluding checkpoint files) +solution_notebooks = sorted([ + nb for nb in NOTEBOOKS_DIR.rglob('*SOLUTION*.ipynb') + if '.ipynb_checkpoints' not in str(nb) +]) # Create test IDs from notebook paths for better test output notebook_ids = [nb.relative_to(NOTEBOOKS_DIR).as_posix() for nb in solution_notebooks] -def extract_cell_outputs(nb): +def extract_cell_outputs(nb, cell_times=None): """Extract stdout/stderr from all executed cells for debugging.""" outputs = [] for i, cell in enumerate(nb.cells): @@ -36,10 +40,32 @@ def extract_cell_outputs(nb): cell_outputs.append(f"[error] {ename}: {evalue}") if cell_outputs: source_preview = cell.source[:100].replace('\n', ' ') - outputs.append(f"--- Cell {i}: {source_preview}... 
---\n" + ''.join(cell_outputs)) + time_str = f" ({cell_times.get(i, 0):.2f}s)" if cell_times else "" + outputs.append(f"--- Cell {i}{time_str}: {source_preview}... ---\n" + ''.join(cell_outputs)) return '\n'.join(outputs) +def check_gpu_state(): + """Print GPU state for debugging slow execution.""" + import subprocess + try: + result = subprocess.run( + ['nvidia-smi', '--query-gpu=name,compute_mode,clocks.current.sm,clocks.current.memory,power.draw,temperature.gpu,utilization.gpu', '--format=csv,noheader'], + capture_output=True, text=True, timeout=5 + ) + if result.returncode == 0: + print(f" GPU State: {result.stdout.strip()}") + # Also check for any processes using the GPU + result2 = subprocess.run( + ['nvidia-smi', '--query-compute-apps=pid,name,used_memory', '--format=csv,noheader'], + capture_output=True, text=True, timeout=5 + ) + if result2.returncode == 0 and result2.stdout.strip(): + print(f" GPU Processes: {result2.stdout.strip()}") + except Exception as e: + print(f" GPU State check failed: {e}") + + @pytest.mark.parametrize('notebook_path', solution_notebooks, ids=notebook_ids) def test_solution_notebook_executes(notebook_path): """ @@ -47,6 +73,10 @@ def test_solution_notebook_executes(notebook_path): Uses nbclient to execute all cells in the notebook. """ + print(f"\n=== Starting notebook: {notebook_path.name} ===") + check_gpu_state() + notebook_start = time.time() + # Read the notebook with open(notebook_path, 'r', encoding='utf-8') as f: nb = nbformat.read(f, as_version=4) @@ -66,15 +96,42 @@ def test_solution_notebook_executes(notebook_path): resources={'metadata': {'path': str(notebook_path.parent)}} ) - # Execute the notebook + # Execute the notebook cell by cell to get timing + cell_times = {} try: - client.execute() + with client.setup_kernel(): + for i, cell in enumerate(nb.cells): + if cell.cell_type != 'code': + continue + cell_start = time.time() + source_preview = cell.source[:60].replace('\n', ' ') + print(f" Cell {i}: {source_preview}...", end='', flush=True) + + # Check kernel is alive before executing + if not client.kc.is_alive(): + print(" [KERNEL DEAD!]") + raise RuntimeError(f"Kernel died before cell {i}") + + client.execute_cell(cell, i) + cell_time = time.time() - cell_start + cell_times[i] = cell_time + print(f" [{cell_time:.2f}s]") + + # Flush any pending output + import sys + sys.stdout.flush() + except CellExecutionError as e: # Provide detailed error information # Include output from ALL cells, not just the failing one - all_outputs = extract_cell_outputs(nb) - pytest.fail(f"Notebook execution failed:\n{str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") + all_outputs = extract_cell_outputs(nb, cell_times) + total_time = time.time() - notebook_start + pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s):\n{str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") except Exception as e: # Catch any other execution errors - all_outputs = extract_cell_outputs(nb) - pytest.fail(f"Notebook execution failed: {str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") + all_outputs = extract_cell_outputs(nb, cell_times) + total_time = time.time() - notebook_start + pytest.fail(f"Notebook execution failed (total time: {total_time:.2f}s): {str(e)}\n\n=== ALL CELL OUTPUTS ===\n{all_outputs}") + + total_time = time.time() - notebook_start + print(f"=== Completed {notebook_path.name} in {total_time:.2f}s ===")
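For reference, with the per-cell timing and GPU-state checks above in place,
a single solution notebook's test can be run locally through pytest's Python
entry point. A minimal sketch, assuming the repository root as the working
directory; the -k substring is taken from one of the notebook ids used in
the earlier patches:

    import sys
    import pytest

    sys.exit(pytest.main([
        "tutorials/accelerated-python/test/test_notebooks.py",
        "-k", "40__kernel_authoring__copy",  # substring of the test id
    ]))

In both cases pytest discovers tutorials/accelerated-python/test/pytest.ini,
so the -v -s --durations=0 options apply automatically.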