Test Brev Tutorial Docker Images #490

Workflow file for this run

.github/workflows/test-brev-tutorial-docker-images.yml at 75e6b07

	# Tests the Docker Compose stack of a single Brev tutorial on the
	# `linux-amd64-gpu-l4-latest-1` runner and posts the outcome back to a commit
	# as a status check. Started through `workflow_dispatch` with the tutorial
	# name, the commit SHA to report on, and the run ID whose artifacts to use.
	name: Test Brev Tutorial Docker Images

	on:
	  workflow_dispatch:
	    inputs:
	      tutorial:
	        description: 'Tutorial name to test'
	        required: true
	        type: string
	      git_sha:
	        description: 'Git commit SHA to update status for'
	        required: true
	        type: string
	      workflow_run_id:
	        description: 'Workflow run ID to download artifacts from'
	        required: true
	        type: string

	jobs:
	  test-tutorial:
	    name: test-tutorial (${{ inputs.tutorial }})
	    runs-on: linux-amd64-gpu-l4-latest-1
	    defaults:
	      run:
	        working-directory: ${{ github.workspace }}
	    # NOTE(review): declaring `permissions` sets every scope not listed here
	    # to `none`. Confirm that checkout, the artifact download and the GHCR
	    # pulls work with only `statuses: write` in this repository; otherwise add
	    # `contents: read`, `actions: read` and/or `packages: read`.
	    permissions:
	      statuses: write
	    env:
	      BUILDKIT_PROGRESS: plain
	      # Environment values are strings; quoted so no YAML loader retypes it.
	      DOCKER_CLI_HINTS: "false"
	    steps:
	      - name: Show runner info
	        run: |
	          echo "Runner name: ${{ runner.name }}"
	          echo "Runner OS: ${{ runner.os }}"
	          echo "Runner arch: ${{ runner.arch }}"
	          echo "Runner uname: $(uname -a)"

	      # NOTE(review): no `ref` is given, so this checks out the ref the
	      # workflow was dispatched on, not `inputs.git_sha` - confirm that the
	      # test script is meant to come from that ref.
	      - name: Checkout repository
	        uses: actions/checkout@v4

	      # Derives the branch / SHA variables used by later steps and publishes
	      # them through GITHUB_ENV.
	      - name: Set Git branch variables
	        env:
	          # Passed through the environment instead of being interpolated into
	          # the script, so the input value is never parsed as shell code.
	          GIT_SHA: ${{ inputs.git_sha }}
	        run: |
	          GIT_BRANCH_NAME="${GITHUB_REF#refs/heads/}"
	          # Sanitize branch name for Docker tags (replace invalid characters with hyphens and convert to lowercase)
	          DOCKER_TAG_BRANCH=$(echo "${GIT_BRANCH_NAME}" | sed 's/[^a-zA-Z0-9._-]/-/g' | tr '[:upper:]' '[:lower:]')
	          GIT_SHORT_SHA="${GIT_SHA:0:7}"
	          {
	            echo "GIT_BRANCH_NAME=${GIT_BRANCH_NAME}"
	            echo "DOCKER_TAG_BRANCH=${DOCKER_TAG_BRANCH}"
	            echo "GIT_SHA=${GIT_SHA}"
	            echo "GIT_SHORT_SHA=${GIT_SHORT_SHA}"
	          } >> "$GITHUB_ENV"

	      - name: Download commit-specific Docker Compose artifact
	        uses: dawidd6/action-download-artifact@v6
	        with:
	          workflow: build-brev-tutorial-docker-images.yml
	          run_id: ${{ inputs.workflow_run_id }}
	          name: docker-compose-${{ inputs.tutorial }}-${{ env.DOCKER_TAG_BRANCH }}-git-${{ env.GIT_SHORT_SHA }}
	          path: artifacts/commit-specific/${{ inputs.tutorial }}/

	      - name: Log in to GitHub Container Registry
	        uses: docker/login-action@v3
	        with:
	          registry: ghcr.io
	          username: ${{ github.actor }}
	          password: ${{ secrets.GITHUB_TOKEN }}

	      # Every command below is best-effort (`|| echo ...`): a missing service
	      # or process must not fail the job.
	      - name: Stop DCGM to allow NCU profiling
	        run: |
	          # DCGM (Data Center GPU Manager) locks the GPU and prevents NCU from profiling.
	          # Stop it before running the container tests.
	          echo "Stopping DCGM services..."

	          # Stop the dcgm-exporter Docker container
	          echo "Stopping dcgm-exporter Docker container..."
	          docker stop dcgm-exporter 2>/dev/null && echo "Stopped dcgm-exporter container" || echo "dcgm-exporter container not running"
	          docker rm -f dcgm-exporter 2>/dev/null || echo "dcgm-exporter container already removed"

	          # Stop systemd services
	          sudo systemctl stop dcgm-exporter || echo "dcgm-exporter service not found or already stopped"
	          sudo systemctl stop nvidia-dcgm || echo "nvidia-dcgm service not found or already stopped"
	          sudo systemctl stop dcgm || echo "dcgm service not found or already stopped"
	          sudo systemctl stop nv-hostengine || echo "nv-hostengine service not found or already stopped"

	          # Kill any remaining dcgm processes
	          sudo pkill -9 nv-hostengine || echo "No nv-hostengine processes found"
	          sudo pkill -9 dcgm || echo "No dcgm processes found"

	          # Relax profiling permissions (perf_event_paranoid=4 is very restrictive)
	          echo "Relaxing profiling permissions..."
	          sudo sysctl -w kernel.perf_event_paranoid=2 || echo "Could not set perf_event_paranoid"
	          sudo sysctl -w kernel.kptr_restrict=0 || echo "Could not set kptr_restrict"

	          echo "DCGM services stopped and profiling permissions relaxed."

	      # Diagnostics only: every probe falls back to an echo so this step
	      # cannot fail the job.
	      - name: Debug GPU and NCU configuration
	        run: |
	          echo "=== GPU Information ==="
	          nvidia-smi || echo "nvidia-smi failed"
	          echo ""

	          echo "=== NVIDIA Driver Version ==="
	          cat /proc/driver/nvidia/version 2>/dev/null || echo "Could not read driver version"
	          echo ""

	          echo "=== GPU Processes ==="
	          nvidia-smi pmon -c 1 2>/dev/null || echo "nvidia-smi pmon failed"
	          echo ""

	          echo "=== DCGM/NCU Blocking Processes ==="
	          ps aux | grep -E "(dcgm|nv-hostengine|ncu)" | grep -v grep || echo "No DCGM/NCU processes found"
	          echo ""

	          echo "=== Systemd Services (nvidia/dcgm related) ==="
	          systemctl list-units --type=service | grep -iE "(nvidia|dcgm|gpu)" || echo "No matching services"
	          echo ""

	          echo "=== Profiling Permissions ==="
	          cat /proc/sys/kernel/perf_event_paranoid 2>/dev/null || echo "Could not read perf_event_paranoid"
	          cat /proc/sys/kernel/kptr_restrict 2>/dev/null || echo "Could not read kptr_restrict"
	          echo ""

	          echo "=== NVIDIA Kernel Modules ==="
	          lsmod | grep nvidia || echo "No nvidia modules loaded"
	          echo ""

	          echo "=== /dev/nvidia* devices ==="
	          ls -la /dev/nvidia* 2>/dev/null || echo "No /dev/nvidia* devices found"
	          echo ""

	      - name: Pre-pull Docker images
	        env:
	          # Read as a shell variable so the input is never parsed as code.
	          TUTORIAL: ${{ inputs.tutorial }}
	        run: |
	          COMPOSE_FILE="artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"

	          # Fail with a clear message instead of a confusing `docker pull ""`.
	          if [ ! -f "$COMPOSE_FILE" ]; then
	            echo "::error::Compose file not found: $COMPOSE_FILE"
	            exit 1
	          fi

	          # Extract all unique images from the compose file
	          # The main image is defined with a YAML anchor like: image: &image ghcr.io/...
	          MAIN_IMAGE=$(grep "image: &image" "$COMPOSE_FILE" | sed 's/.*image: &image //')
	          if [ -z "$MAIN_IMAGE" ]; then
	            # Fallback: try to find any ghcr.io image reference
	            MAIN_IMAGE=$(grep -o 'ghcr.io/[^"]*' "$COMPOSE_FILE" | head -1)
	          fi

	          if [ -z "$MAIN_IMAGE" ]; then
	            echo "::error::Could not determine the main image from $COMPOSE_FILE"
	            exit 1
	          fi

	          # Extract the nsight image (nvcr.io)
	          NSIGHT_IMAGE=$(grep -o 'nvcr.io/[^"]*' "$COMPOSE_FILE" | head -1)

	          echo "Pre-pulling main image: $MAIN_IMAGE"
	          docker pull "$MAIN_IMAGE"

	          # The nsight image is optional; pull it only when the file names one.
	          if [ -n "$NSIGHT_IMAGE" ]; then
	            echo "Pre-pulling nsight image: $NSIGHT_IMAGE"
	            docker pull "$NSIGHT_IMAGE"
	          fi

	          echo "All images pulled successfully"

	      - name: Test Docker Compose
	        id: test
	        env:
	          TUTORIAL: ${{ inputs.tutorial }}
	        run: |
	          ./brev/test-docker-compose.bash "artifacts/commit-specific/${TUTORIAL}/brev/docker-compose.yml"

	      # GIT_SHA comes from GITHUB_ENV, written by "Set Git branch variables".
	      - name: Update commit status to success
	        if: success()
	        run: |
	          gh api \
	            --method POST \
	            -H "Accept: application/vnd.github+json" \
	            "/repos/${{ github.repository }}/statuses/${GIT_SHA}" \
	            -f state='success' \
	            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
	            -f description='Tests passed' \
	            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL})"
	        env:
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	          TUTORIAL: ${{ inputs.tutorial }}

	      # Runs only when an earlier step failed; reports under the same status
	      # context as the success step.
	      - name: Update commit status to failure
	        if: failure()
	        run: |
	          gh api \
	            --method POST \
	            -H "Accept: application/vnd.github+json" \
	            "/repos/${{ github.repository }}/statuses/${GIT_SHA}" \
	            -f state='failure' \
	            -f target_url="${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}" \
	            -f description='Tests failed' \
	            -f context="Test Brev Tutorial Docker Images / test-tutorial (tutorials/${TUTORIAL})"
	        env:
	          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
	          TUTORIAL: ${{ inputs.tutorial }}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Test Brev Tutorial Docker Images #490

Workflow file

Test Brev Tutorial Docker Images #490

Uh oh!

Workflow file for this run