From a3414c5c22544a225b5e4b5752051a854362b8e6 Mon Sep 17 00:00:00 2001 From: Dennis Klein Date: Mon, 3 Nov 2025 18:27:29 +0100 Subject: [PATCH 1/4] refactor: remove legacy Vagrant development environment Remove Vagrant-based development setup in favor of Docker Compose runtime tests which provide better CI integration and maintainability. --- .gitignore | 1 - README.md | 29 ------------------ Vagrantfile | 84 --------------------------------------------------- containers.sh | 39 ------------------------ 4 files changed, 153 deletions(-) delete mode 100644 Vagrantfile delete mode 100755 containers.sh diff --git a/.gitignore b/.gitignore index 9258e82..c1a09d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,5 @@ singularity-exec.so *.rpm -.vagrant/ build/ install/ build-* diff --git a/README.md b/README.md index b245403..b067d2b 100644 --- a/README.md +++ b/README.md @@ -168,33 +168,6 @@ still find them interesting for reference or to fork from: Traditional manually maintained Release and Changelog fields. -## Development - -Build the required singularity containers with the script [`containers.sh`][97]. -(This requires the `singularity` command installed on the host). The containers -generated by the script are stored under `/tmp/*.sif`. - -Start a test environment using the included [`Vagrantfile`][96]: - -* Installs the `apptainer` package from Fedora EPEL -* Copies the SIF container images to `/tmp` -* Builds, installs and configures the Slurm Singularity plug-in - -Start a Vagrant box to build an RPM package: - -```sh -./containers.sh && vagrant up el8 && vagrant ssh el8 # for example... - -# synced from the host -cd /vagrant - -cmake -S . -B build # configure the project and choose a build dir -cmake --build build # build the Singularity SPANK plug-in -sudo cmake --install build # install the binary and configuration files - -sudo systemctl enable --now munge slurmctld slurmd -``` - ## License ``` @@ -243,8 +216,6 @@ along with this program. If not, see . 
[99]: singularity-exec.conf.in [98]: slurm-singularity-wrapper.sh -[97]: containers.sh -[96]: Vagrantfile [95]: https://singularity.hpcng.org/user-docs/master/bind_paths_and_mounts.html#user-defined-bind-paths [94]: https://singularity.hpcng.org/user-docs/master/cli/singularity_exec.html [93]: https://singularity.hpcng.org/user-docs/master/cli/singularity.html#options diff --git a/Vagrantfile b/Vagrantfile deleted file mode 100644 index ba4f655..0000000 --- a/Vagrantfile +++ /dev/null @@ -1,84 +0,0 @@ -# -*- mode: ruby -*- -# vi: set ft=ruby : - -Vagrant.configure("2") do |config| - - slurm_conf = %q( - # vim:ft=bash - ClusterName=tester - SlurmUser=slurm - SlurmctldHost=localhost - SlurmctldPidFile=/var/run/slurmctld.pid - SlurmctldDebug=3 - SlurmctldLogFile=/var/log/slurmctld.log - StateSaveLocation=/var/spool/slurm/ctld - ReturnToService=1 - SlurmdPidFile=/var/run/slurmd.pid - SlurmdSpoolDir=/var/spool/slurm/d - SlurmdDebug=3 - SlurmdLogFile=/var/log/slurmd.log - AuthType=auth/munge - MpiDefault=none - ProctrackType=proctrack/pgid - SwitchType=switch/none - TaskPlugin=task/affinity - FastSchedule=2 # version prior to 20.04 - SchedulerType=sched/builtin - SelectType=select/cons_res - SelectTypeParameters=CR_CPU - JobAcctGatherType=jobacct_gather/none - JobCompType=jobcomp/none - AccountingStorageType=accounting_storage/none - NodeName=localhost Sockets=1 CoresPerSocket=8 ThreadsPerCore=2 State=UNKNOWN - PartitionName=debug Nodes=localhost Default=YES MaxTime=INFINITE State=UP - ).gsub(/^ */,'') - - plugin = '/etc/slurm/spank/singularity-exec.so' - wrapper = '/etc/slurm/spank/slurm-singularity-wrapper.sh' - bind = '/etc/slurm,/var/run/munge,/var/spool/slurm' - - singularity_conf = %Q(required #{plugin} default= script=#{wrapper} bind=#{bind} args="") - - config.vm.box_check_update = false - config.vm.synced_folder ".", "/vagrant", type: "rsync" - - # Copy test container into the box - # - %w( - /tmp/debian10.sif - /tmp/centos7.sif - /tmp/centos_stream8.sif - ).each do |file| - name = File.basename file - config.vm.provision "file", source: "#{file}", destination: "/tmp/#{name}" - end - - ## - # Enterprise Linux 8 - # - config.vm.define "el8" do |config| - - config.vm.hostname = "el8" - config.vm.box = "almalinux/8" - - config.vm.provider :libvirt do |libvirt| - libvirt.memory = 2048 - libvirt.cpus = 2 - end - - config.vm.provision "shell" do |s| - s.privileged = true, - s.inline = %q( - dnf install -y epel-release - dnf config-manager --set-enabled powertools - dnf install -y munge slurm-slurmctld slurm-slurmd apptainer \ - rpm-build rpmdevtools slurm-devel make gcc gcc-c++ libstdc++-static - echo 123456789123456781234567812345678 > /etc/munge/munge.key - chown munge:munge /etc/munge/munge.key - chmod 600 /etc/munge/munge.key - ) - end - - end - -end diff --git a/containers.sh b/containers.sh deleted file mode 100755 index 6d3cb68..0000000 --- a/containers.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -cd /tmp - -cat > debian10.def < centos7.def < centos_stream8.def </dev/null From 9b31988b9d12bc2f9c256d508f0c1425b476cfd1 Mon Sep 17 00:00:00 2001 From: Dennis Klein Date: Mon, 3 Nov 2025 18:29:59 +0100 Subject: [PATCH 2/4] feat: add Docker Compose runtime test infrastructure Implement comprehensive runtime testing infrastructure using Docker Compose with a three-service architecture optimized for CI environments. 
Architecture: - plugin-builder: Builds plugin once with custom bind mount configuration - slurmctld: Slurm controller with Munge authentication - slurmd: Compute node with privileged access for containers Configuration: - Slurm 24.11 with proctrack/linuxproc and task/none - Cgroup plugin disabled for container compatibility - Logging to stdout/stderr for visibility - Version-aware configuration system Test Suite: - Plugin installation and CLI option verification - Job submission and execution validation - Containerized workload testing with Singularity - Multi-argument command support verification - Comprehensive error handling and retry logic --- tests/runtime/Dockerfile | 43 ++++ tests/runtime/README.md | 109 +++++++++ tests/runtime/cgroup.conf | 4 + tests/runtime/docker-compose.yml | 71 ++++++ tests/runtime/entrypoint-plugin-builder.sh | 19 ++ tests/runtime/entrypoint-slurmctld.sh | 53 ++++ tests/runtime/entrypoint-slurmd.sh | 84 +++++++ tests/runtime/plugstack.conf | 2 + tests/runtime/run-tests.sh | 96 ++++++++ tests/runtime/setup-slurm-config.sh | 42 ++++ tests/runtime/slurm-24.11.conf | 10 + tests/runtime/slurm-common.conf | 40 +++ tests/runtime/test-integration.sh | 270 +++++++++++++++++++++ tests/runtime/validate-setup.sh | 62 +++++ 14 files changed, 905 insertions(+) create mode 100644 tests/runtime/Dockerfile create mode 100644 tests/runtime/README.md create mode 100644 tests/runtime/cgroup.conf create mode 100644 tests/runtime/docker-compose.yml create mode 100755 tests/runtime/entrypoint-plugin-builder.sh create mode 100755 tests/runtime/entrypoint-slurmctld.sh create mode 100755 tests/runtime/entrypoint-slurmd.sh create mode 100644 tests/runtime/plugstack.conf create mode 100755 tests/runtime/run-tests.sh create mode 100644 tests/runtime/setup-slurm-config.sh create mode 100644 tests/runtime/slurm-24.11.conf create mode 100644 tests/runtime/slurm-common.conf create mode 100755 tests/runtime/test-integration.sh create mode 100755 tests/runtime/validate-setup.sh diff --git a/tests/runtime/Dockerfile b/tests/runtime/Dockerfile new file mode 100644 index 0000000..9971bbc --- /dev/null +++ b/tests/runtime/Dockerfile @@ -0,0 +1,43 @@ +# Dockerfile for runtime testing of slurm-singularity-exec +# Sets up a complete Slurm cluster with controller and compute nodes + +ARG UBUNTU_VERSION=25.04 +FROM ubuntu:${UBUNTU_VERSION} + +# Install Slurm, Munge, and dependencies +RUN apt-get update -y && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y \ + slurm-wlm \ + slurm-wlm-basic-plugins \ + slurmd \ + slurmctld \ + munge \ + cmake \ + g++ \ + ninja-build \ + libslurm-dev \ + curl \ + sudo \ + retry \ + && rm -rf /var/lib/apt/lists/* + +# Try to install singularity-container if available (may not be in all Ubuntu versions) +RUN apt-get update -y && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y singularity-container || true \ + && rm -rf /var/lib/apt/lists/* + +# Create necessary directories and files +RUN mkdir -p /var/spool/slurmctld \ + /var/spool/slurmd \ + /var/spool/slurm \ + /var/run/slurm \ + /etc/slurm \ + /etc/slurm/plugstack.conf.d \ + && chown -R slurm:slurm /var/spool/slurmctld /var/spool/slurmd /var/spool/slurm /var/run/slurm \ + && touch /etc/localtime + +# Set working directory +WORKDIR /workspace + +# Default command will be overridden in docker-compose +CMD ["/bin/bash"] diff --git a/tests/runtime/README.md b/tests/runtime/README.md new file mode 100644 index 0000000..f252248 --- /dev/null +++ b/tests/runtime/README.md @@ -0,0 +1,109 @@ +# Runtime Integration 
Tests + +This directory contains integration tests that verify the slurm-singularity-exec plugin works with actual Slurm daemons. + +## Overview + +The runtime tests: +1. Start a minimal Slurm cluster (slurmctld + slurmd) using Docker Compose +2. Build and install the slurm-singularity-exec plugin +3. Verify plugin files are installed (library and configuration) +4. Verify plugin CLI options appear in `sbatch --help` and `srun --help` +5. Verify SPANK plugin loads when jobs run (check slurmd logs) +6. Submit and run a containerized test job (if singularity/apptainer is available) + +## Docker Compose Architecture + +### Services + +The test infrastructure consists of three services orchestrated by Docker Compose: + +| Service | Purpose | Startup Order | +|---------|---------|---------------| +| **plugin-builder** | Builds the plugin once using CMake/Ninja | 1st (runs to completion) | +| **slurmctld** | Slurm controller - manages scheduling and cluster state | 2nd (after builder) | +| **slurmd** | Slurm compute node - executes jobs | 3rd (after builder and slurmctld) | + +### Volumes + +| Volume | Containers | Access | Purpose | +|--------|------------|--------|---------| +| `../..` → `/workspace` | All | Read-only (`:z`) | Source code and build scripts | +| `plugin-build` | All | Read-write | Shared build artifacts (plugin binaries) | +| `slurmctld-state` | slurmctld | Read-write | Controller state persistence | +| `slurmd-state` | slurmd | Read-write | Daemon state persistence | +| `munge-key` | slurmctld, slurmd | Read-write | Shared Munge authentication key | +| `job-spool` | slurmctld, slurmd | Read-write | Shared job output files | + +### Build Flow + +1. **plugin-builder** service: + - Runs `entrypoint-plugin-builder.sh` + - Configures and builds plugin with CMake to `/var/lib/slurm-plugin-build` + - Exits when build completes (dependency satisfied) + +2. **slurmctld** service (waits for plugin-builder): + - Runs `entrypoint-slurmctld.sh` + - Installs pre-built plugin from shared volume + - Generates Munge authentication key + - Starts slurmctld daemon + +3. **slurmd** service (waits for plugin-builder and slurmctld): + - Runs `entrypoint-slurmd.sh` + - Installs pre-built plugin from shared volume + - Waits for Munge key and slurmctld connectivity + - Starts slurmd daemon + +### Network + +All services communicate via the `slurm-net` bridge network, allowing hostname-based service discovery. 
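+
+For manual debugging, the cluster can also be brought up directly with Docker Compose. This is a
+minimal sketch; the service and network names are the ones defined in `docker-compose.yml`, and the
+exact commands are only illustrative:
+
+```bash
+docker compose up -d                                 # plugin-builder runs to completion, then the daemons start
+docker compose exec slurmd getent hosts slurmctld    # hostname resolution over slurm-net
+docker compose exec slurmctld sinfo                  # controller should list the slurmd node
+docker compose down -v                               # tear down containers and named volumes
+```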
+ +## Quick Start + +```bash +# Validate the setup +./validate-setup.sh + +# Run the full integration tests +./run-tests.sh +``` + +## Files + +- `Dockerfile` - Container image for Slurm cluster nodes +- `docker-compose.yml` - Orchestrates the Slurm cluster (plugin-builder, slurmctld, slurmd) +- `slurm-common.conf` - Common Slurm configuration settings +- `slurm-24.11.conf` - Version-specific Slurm configuration +- `plugstack.conf` - Plugin loading configuration +- `cgroup.conf` - Cgroup configuration +- `setup-slurm-config.sh` - Version detection and config selection +- `entrypoint-plugin-builder.sh` - Builds the plugin (runs once) +- `entrypoint-slurmctld.sh` - Startup script for controller node +- `entrypoint-slurmd.sh` - Startup script for compute node +- `test-integration.sh` - Integration test suite +- `run-tests.sh` - Test orchestration script +- `validate-setup.sh` - Quick validation of the setup + +## Requirements + +- Docker +- Docker Compose + +## CI/CD + +These tests run automatically in GitHub Actions for each push and pull request, testing against: +- Slurm 24.11 (Ubuntu 25.04 Plucky) + +## Troubleshooting + +If tests fail, check the logs: +```bash +cd tests/runtime +docker compose logs slurmctld +docker compose logs slurmd +``` + +Clean up containers: +```bash +docker compose down -v +``` diff --git a/tests/runtime/cgroup.conf b/tests/runtime/cgroup.conf new file mode 100644 index 0000000..2d6c08c --- /dev/null +++ b/tests/runtime/cgroup.conf @@ -0,0 +1,4 @@ +# Cgroup configuration for Slurm +# Disable cgroup completely for containerized testing + +CgroupPlugin=disabled diff --git a/tests/runtime/docker-compose.yml b/tests/runtime/docker-compose.yml new file mode 100644 index 0000000..037f8af --- /dev/null +++ b/tests/runtime/docker-compose.yml @@ -0,0 +1,71 @@ +# Docker Compose configuration for runtime testing +# Sets up a minimal Slurm cluster with controller and compute node + +# Common configuration using YAML anchors +x-common-build: &common-build + context: . 
+ dockerfile: Dockerfile + args: + UBUNTU_VERSION: ${UBUNTU_VERSION:-25.04} + +services: + # Plugin builder service - builds the plugin once before starting Slurm services + plugin-builder: + image: runtime-slurmctld:latest + build: *common-build + volumes: + - ../..:/workspace:z + - plugin-build:/var/lib/slurm-plugin-build + entrypoint: /workspace/tests/runtime/entrypoint-plugin-builder.sh + + # Slurm controller + slurmctld: + image: runtime-slurmctld:latest + build: *common-build + depends_on: + plugin-builder: + condition: service_completed_successfully + volumes: + - ../..:/workspace:z + - plugin-build:/var/lib/slurm-plugin-build + - slurmctld-state:/var/spool/slurmctld + - munge-key:/etc/munge + - job-spool:/var/spool/slurm-jobs + networks: + - slurm-net + hostname: slurmctld + container_name: slurmctld + entrypoint: /workspace/tests/runtime/entrypoint-slurmctld.sh + + # Slurm compute node + slurmd: + image: runtime-slurmd:latest + build: *common-build + depends_on: + plugin-builder: + condition: service_completed_successfully + slurmctld: + condition: service_started + volumes: + - ../..:/workspace:z + - plugin-build:/var/lib/slurm-plugin-build + - slurmd-state:/var/spool/slurmd + - munge-key:/etc/munge + - job-spool:/var/spool/slurm-jobs + privileged: true + networks: + - slurm-net + hostname: slurmd + container_name: slurmd + entrypoint: /workspace/tests/runtime/entrypoint-slurmd.sh + +networks: + slurm-net: + driver: bridge + +volumes: + slurmctld-state: + slurmd-state: + munge-key: + job-spool: + plugin-build: diff --git a/tests/runtime/entrypoint-plugin-builder.sh b/tests/runtime/entrypoint-plugin-builder.sh new file mode 100755 index 0000000..ff3197c --- /dev/null +++ b/tests/runtime/entrypoint-plugin-builder.sh @@ -0,0 +1,19 @@ +#!/bin/bash +# Entrypoint script for plugin-builder service +# Builds the slurm-singularity-exec plugin once for installation in all Slurm containers + +set -e + +echo "Building slurm-singularity-exec plugin..." +SRC_DIR="/workspace" +BUILD_DIR="/var/lib/slurm-plugin-build" + +cmake -GNinja -S "$SRC_DIR" -B "$BUILD_DIR" \ + -DCMAKE_INSTALL_PREFIX=/usr \ + -DSLURM_SYSCONFDIR=/etc/slurm \ + -DINSTALL_PLUGSTACK_CONF=ON \ + -DPLUGIN_BIND_ARG="/etc/slurm,/var/spool/slurm,/var/spool/slurmd,/var/run/munge" + +cmake --build "$BUILD_DIR" + +echo "Plugin built successfully" diff --git a/tests/runtime/entrypoint-slurmctld.sh b/tests/runtime/entrypoint-slurmctld.sh new file mode 100755 index 0000000..c49c38d --- /dev/null +++ b/tests/runtime/entrypoint-slurmctld.sh @@ -0,0 +1,53 @@ +#!/bin/bash +# Entrypoint script for slurmctld container + +set -e + +echo "Starting slurmctld container..." + +# Setup Slurm configuration (version detection and config files) +source /workspace/tests/runtime/setup-slurm-config.sh slurmctld + +# Install the pre-built plugin +echo "Installing slurm-singularity-exec plugin..." +BUILD_DIR="/var/lib/slurm-plugin-build" +cmake --install "$BUILD_DIR" +echo "Plugin installed in slurmctld" + +echo "Plugin configuration:" +cat /etc/slurm/plugstack.conf.d/singularity-exec.conf + +# Create Munge key if it doesn't exist (shared volume) +if [ ! -f /etc/munge/munge.key ]; then + echo "Creating Munge key..." + dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key + chown munge:munge /etc/munge/munge.key + chmod 400 /etc/munge/munge.key +else + echo "Munge key already exists, ensuring correct permissions..." + chown munge:munge /etc/munge/munge.key + chmod 400 /etc/munge/munge.key +fi + +# Start Munge +echo "Starting Munge..." 
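+# Note: the container has no init system, so munged is launched directly as the munge user;
+# --force lets it start even if a stale socket is left over from a previous run.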
+mkdir -p /var/run/munge +chown munge:munge /var/run/munge +sudo -u munge /usr/sbin/munged --force + +# Wait for Munge to be ready and verify it works +echo "Verifying Munge functionality..." +if ! retry --times=10 --delay=1 -- bash -c 'echo "test" | munge | unmunge >/dev/null 2>&1'; then + echo "ERROR: Munge failed to start properly" + exit 1 +fi +echo "✓ Munge is operational" + +# Start slurmctld +echo "Starting slurmctld..." +mkdir -p /var/spool/slurmctld /var/run/slurm +chown -R slurm:slurm /var/spool/slurmctld /var/run/slurm + +# Start slurmctld in foreground +echo "Starting slurmctld daemon..." +exec /usr/sbin/slurmctld -D -vvvv diff --git a/tests/runtime/entrypoint-slurmd.sh b/tests/runtime/entrypoint-slurmd.sh new file mode 100755 index 0000000..e5f0af1 --- /dev/null +++ b/tests/runtime/entrypoint-slurmd.sh @@ -0,0 +1,84 @@ +#!/bin/bash +# Entrypoint script for slurmd container + +set -e + +echo "Starting slurmd container..." + +# Setup Slurm configuration (version detection and config files) +source /workspace/tests/runtime/setup-slurm-config.sh slurmd + +# Install the pre-built plugin +echo "Installing slurm-singularity-exec plugin..." +BUILD_DIR="/var/lib/slurm-plugin-build" +cmake --install "$BUILD_DIR" +echo "Plugin installed in slurmd" + +# Create loop devices for Singularity +echo "Creating loop devices for Singularity..." +for i in {0..7}; do + mknod -m 0660 "/dev/loop$i" b 7 "$i" 2>/dev/null || true +done +chgrp disk /dev/loop* 2>/dev/null || true + +# Wait for Munge key to be created by slurmctld +echo "Waiting for Munge key..." +if ! retry --times=30 --delay=1 -- test -f /etc/munge/munge.key; then + echo "ERROR: Munge key not found" + exit 1 +fi + +echo "Munge key found, ensuring correct permissions..." +chown munge:munge /etc/munge/munge.key +chmod 400 /etc/munge/munge.key + +# Verify munge key +echo "Munge key info:" +ls -la /etc/munge/munge.key + +# Start Munge +echo "Starting Munge..." +mkdir -p /var/run/munge /var/log/munge +chown munge:munge /var/run/munge /var/log/munge +sudo -u munge /usr/sbin/munged --force + +# Wait for Munge to be ready +echo "Waiting for Munge to be ready..." +sleep 3 + +# Test Munge +echo "Testing Munge..." +munge -n | unmunge || echo "Warning: Munge test failed" + +# Wait for slurmctld to be ready +echo "Waiting for slurmctld to respond..." +if ! retry --times=30 --delay=1 -- scontrol ping >/dev/null 2>&1; then + echo "ERROR: slurmctld not responding" + exit 1 +fi +echo "✓ slurmctld is responding" + +# Start slurmd +echo "Starting slurmd..." +mkdir -p /var/spool/slurmd /var/run/slurm /run/slurm +chown -R slurm:slurm /var/spool/slurmd /var/run/slurm +chmod 755 /var/spool/slurmd +chmod 755 /run/slurm + +echo "Slurm configuration:" +grep -E "^(ClusterName|SlurmctldHost|NodeName|ProctrackType|TaskPlugin)" /etc/slurm/slurm.conf || true + +# Verify slurmstepd exists and is executable +echo "Checking slurmstepd..." +if [ -f /usr/sbin/slurmstepd ]; then + echo "✓ slurmstepd found at /usr/sbin/slurmstepd" + ls -lh /usr/sbin/slurmstepd + /usr/sbin/slurmstepd -V || echo "Warning: Could not get slurmstepd version" +else + echo "✗ ERROR: slurmstepd not found!" + exit 1 +fi + +# Start slurmd in foreground +echo "Starting slurmd daemon..." 
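+# exec replaces this shell so slurmd becomes the container's main process and receives its signals;
+# -D keeps the daemon in the foreground and -vvvv raises verbosity so the output is visible via
+# `docker compose logs slurmd`.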
+exec /usr/sbin/slurmd -D -vvvv diff --git a/tests/runtime/plugstack.conf b/tests/runtime/plugstack.conf new file mode 100644 index 0000000..62221f9 --- /dev/null +++ b/tests/runtime/plugstack.conf @@ -0,0 +1,2 @@ +# Include plugstack configuration from subdirectory +include /etc/slurm/plugstack.conf.d/*.conf diff --git a/tests/runtime/run-tests.sh b/tests/runtime/run-tests.sh new file mode 100755 index 0000000..e4c3690 --- /dev/null +++ b/tests/runtime/run-tests.sh @@ -0,0 +1,96 @@ +#!/bin/bash +# Runner script for integration tests +# This script orchestrates the docker-compose cluster and runs tests + +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +cd "$SCRIPT_DIR" + +echo "::group::Clean up previous containers" +docker compose down -v 2>/dev/null || true +echo "::endgroup::" + +echo "::group::Build Docker images with buildx cache" +# Build both services using buildx bake for proper cache support +# Set BUILDX_BAKE_ENTITLEMENTS_FS=0 to allow filesystem access without explicit --allow flags +export BUILDX_BAKE_ENTITLEMENTS_FS=0 + +# If BUILDX_CACHE_FROM/TO are set, use them; otherwise build without cache +if [ -n "${BUILDX_CACHE_FROM}" ] && [ -n "${BUILDX_CACHE_TO}" ]; then + docker buildx bake \ + --file docker-compose.yml \ + --set "*.cache-from=${BUILDX_CACHE_FROM}" \ + --set "*.cache-to=${BUILDX_CACHE_TO}" \ + --load \ + slurmctld slurmd +else + docker buildx bake \ + --file docker-compose.yml \ + --load \ + slurmctld slurmd +fi +echo "::endgroup::" + +echo "::group::Start Slurm cluster" +docker compose up -d --no-build +echo "::endgroup::" + +echo "::group::Wait for services" +echo "Waiting for slurmctld to be ready..." +# Give slurmctld up to 30 seconds to start (15 retries * 2 seconds) +RETRIES=15 +DELAY=2 +for i in $(seq 1 $RETRIES); do + if docker compose exec -T slurmctld scontrol ping >/dev/null 2>&1; then + echo "✓ Slurm cluster is ready (attempt $i/$RETRIES)" + break + fi + if [ $i -eq $RETRIES ]; then + echo "ERROR: slurmctld not ready after $((RETRIES * DELAY)) seconds" + docker compose logs slurmctld + exit 1 + fi + sleep $DELAY +done +echo "::endgroup::" + +echo "::group::Run integration tests" +set +e # Temporarily disable exit on error +docker compose exec -T slurmctld /workspace/tests/runtime/test-integration.sh +TEST_EXIT_CODE=$? 
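+# Keep the test exit code so logs can be collected and the cluster torn down before this script
+# finally exits with it.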
+set -e # Re-enable exit on error +echo "::endgroup::" + +# Additional verification: Check for SPANK plugin loading in slurmd container logs +if [ $TEST_EXIT_CODE -eq 0 ]; then + echo "::group::Verify SPANK plugin loading in logs" + if docker compose logs slurmd 2>&1 | grep -q "Loaded plugin slurm-singularity-exec.so"; then + echo "✓ Found SPANK plugin loading message in slurmd container logs" + else + echo "⚠ Warning: SPANK plugin loading message not found in slurmd logs" + echo " This may indicate the plugin is not being loaded by slurmstepd" + fi + echo "::endgroup::" +fi + +# Show logs if tests failed +if [ $TEST_EXIT_CODE -ne 0 ]; then + echo "::group::slurmctld logs (last 100 lines)" + docker compose logs --tail=100 slurmctld + echo "::endgroup::" + + echo "::group::slurmd logs (last 100 lines)" + docker compose logs --tail=100 slurmd + echo "::endgroup::" + + echo "::group::Container status" + docker compose ps + echo "::endgroup::" +fi + +echo "::group::Clean up" +docker compose down -v +echo "::endgroup::" + +exit $TEST_EXIT_CODE diff --git a/tests/runtime/setup-slurm-config.sh b/tests/runtime/setup-slurm-config.sh new file mode 100644 index 0000000..d72ada5 --- /dev/null +++ b/tests/runtime/setup-slurm-config.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Common script to setup Slurm configuration based on version +# Usage: source setup-slurm-config.sh +# daemon-name: slurmctld or slurmd + +set -e + +DAEMON_NAME="$1" + +if [[ -z "$DAEMON_NAME" ]]; then + echo "ERROR: Daemon name required (slurmctld or slurmd)" + exit 1 +fi + +# Detect Slurm version and use appropriate config +# Handle both "slurm X.Y.Z" and "slurm-wlm X.Y.Z" formats +SLURM_VERSION=$($DAEMON_NAME -V | grep -oP 'slurm(-wlm)? \K[0-9]+\.[0-9]+' || echo "unknown") +echo "Detected Slurm version: $SLURM_VERSION" + +# Copy common config first +cp /workspace/tests/runtime/slurm-common.conf /etc/slurm/slurm-common.conf + +# Copy version-specific config +if [[ "$SLURM_VERSION" == "24.11" ]]; then + echo "Using Slurm 24.11 configuration (proctrack/linuxproc)" + cp /workspace/tests/runtime/slurm-24.11.conf /etc/slurm/slurm.conf +elif [[ "$SLURM_VERSION" == "unknown" ]]; then + echo "ERROR: Could not detect Slurm version" + echo "The daemon '$DAEMON_NAME -V' command failed or produced unexpected output" + echo "Please ensure Slurm is properly installed and accessible" + exit 1 +else + echo "ERROR: Unsupported Slurm version: $SLURM_VERSION" + echo "Only version 24.11 is currently supported" + echo "If you are using a newer version, please add configuration in tests/runtime/" + exit 1 +fi + +cp /workspace/tests/runtime/plugstack.conf /etc/slurm/plugstack.conf +cp /workspace/tests/runtime/cgroup.conf /etc/slurm/cgroup.conf + +echo "Slurm configuration setup complete" diff --git a/tests/runtime/slurm-24.11.conf b/tests/runtime/slurm-24.11.conf new file mode 100644 index 0000000..b0b380c --- /dev/null +++ b/tests/runtime/slurm-24.11.conf @@ -0,0 +1,10 @@ +# Slurm 24.11 specific configuration +# Uses linuxproc for CI compatibility (GitHub Actions uses cgroup v1) + +# Include common configuration +include /etc/slurm/slurm-common.conf + +# Process tracking: Use linuxproc for broader compatibility +# Disable task plugin to avoid cgroup/systemd requirements in containers +ProctrackType=proctrack/linuxproc +TaskPlugin=task/none diff --git a/tests/runtime/slurm-common.conf b/tests/runtime/slurm-common.conf new file mode 100644 index 0000000..5283622 --- /dev/null +++ b/tests/runtime/slurm-common.conf @@ -0,0 +1,40 @@ +# Common Slurm configuration 
for all versions +ClusterName=test +SlurmctldHost=slurmctld + +# Logging - log to stdout/stderr for container visibility +SlurmctldDebug=debug +SlurmdDebug=debug + +# Scheduler +SchedulerType=sched/backfill +SelectType=select/cons_tres +SelectTypeParameters=CR_Core + +# State preservation +StateSaveLocation=/var/spool/slurmctld +SlurmdSpoolDir=/var/spool/slurmd + +# Authentication +AuthType=auth/munge +CryptoType=crypto/munge + +# Timeouts +SlurmctldTimeout=300 +SlurmdTimeout=300 +InactiveLimit=0 +MinJobAge=300 +KillWait=30 +Waittime=0 + +# Return to service +ReturnToService=2 + +# MPI +MpiDefault=none + +# Node configuration (CPUs=4 to match typical GitHub Actions runner: 2 cores * 2 threads) +NodeName=slurmd Gres=gpu:0 CPUs=4 Boards=1 SocketsPerBoard=1 CoresPerSocket=2 ThreadsPerCore=2 RealMemory=1000 State=UNKNOWN + +# Partition configuration +PartitionName=debug Nodes=slurmd Default=YES MaxTime=INFINITE State=UP diff --git a/tests/runtime/test-integration.sh b/tests/runtime/test-integration.sh new file mode 100755 index 0000000..1ec6079 --- /dev/null +++ b/tests/runtime/test-integration.sh @@ -0,0 +1,270 @@ +#!/bin/bash +# Integration test script for runtime functionality testing +# This script runs inside a container and tests the Slurm-Singularity plugin + +set -e + +echo "=== Slurm Singularity Plugin Runtime Tests ===" +echo + +# Test 1: Verify plugin files are installed +echo "Test 1: Verifying plugin installation..." +if [ -f "/usr/libexec/slurm-singularity-exec.so" ]; then + echo "✓ Found plugin library: /usr/libexec/slurm-singularity-exec.so" +else + echo "✗ ERROR: Plugin library not found at /usr/libexec/slurm-singularity-exec.so" + exit 1 +fi + +if [ -f "/etc/slurm/plugstack.conf.d/singularity-exec.conf" ]; then + echo "✓ Found plugin config: /etc/slurm/plugstack.conf.d/singularity-exec.conf" +else + echo "✗ ERROR: Plugin config not found at /etc/slurm/plugstack.conf.d/singularity-exec.conf" + exit 1 +fi +echo + +# Test 2: Check plugin CLI options in sbatch --help +echo "Test 2: Checking plugin CLI options in sbatch --help..." +if sbatch --help 2>&1 | grep -q "singularity-container"; then + echo "✓ Found --singularity-container option" +else + echo "✗ ERROR: --singularity-container option not found in sbatch --help" + exit 1 +fi + +if sbatch --help 2>&1 | grep -q "singularity-bind"; then + echo "✓ Found --singularity-bind option" +else + echo "✗ ERROR: --singularity-bind option not found in sbatch --help" + exit 1 +fi + +if sbatch --help 2>&1 | grep -q "singularity-args"; then + echo "✓ Found --singularity-args option" +else + echo "✗ ERROR: --singularity-args option not found in sbatch --help" + exit 1 +fi + +if sbatch --help 2>&1 | grep -q "singularity-no-bind-defaults"; then + echo "✓ Found --singularity-no-bind-defaults option" +else + echo "✗ ERROR: --singularity-no-bind-defaults option not found in sbatch --help" + exit 1 +fi +echo + +# Test 3: Check plugin CLI options in srun --help +echo "Test 3: Checking plugin CLI options in srun --help..." +if srun --help 2>&1 | grep -q "singularity-container"; then + echo "✓ Found --singularity-container option in srun" +else + echo "✗ ERROR: --singularity-container option not found in srun --help" + exit 1 +fi +echo + +# Test 4: Check if singularity/apptainer is available +echo "Test 4: Checking for singularity/apptainer..." 
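+# Prefer the singularity CLI and fall back to apptainer; the two are command-line compatible for
+# the only subcommand used here (pull).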
+SINGULARITY_CMD="" +if command -v singularity >/dev/null 2>&1; then + SINGULARITY_CMD="singularity" + echo "✓ Found singularity command" +elif command -v apptainer >/dev/null 2>&1; then + SINGULARITY_CMD="apptainer" + echo "✓ Found apptainer command" +else + echo "⚠ Warning: Neither singularity nor apptainer found. Skipping container job test." + SKIP_CONTAINER_TEST=true +fi +echo + +# Test 5: Create a simple test container (if singularity/apptainer available) +if [ "$SKIP_CONTAINER_TEST" != "true" ]; then + echo "Test 5: Creating a test container image..." + # Use shared directory so container is accessible from both slurmctld and slurmd + TEST_CONTAINER="/var/spool/slurm-jobs/test-debian.sif" + if [ ! -f "$TEST_CONTAINER" ]; then + # Create a minimal Debian container + $SINGULARITY_CMD pull "$TEST_CONTAINER" docker://debian:stable-slim + if [ $? -eq 0 ]; then + echo "✓ Test container created: $TEST_CONTAINER" + else + echo "⚠ Warning: Failed to create test container. Skipping container job test." + SKIP_CONTAINER_TEST=true + fi + else + echo "✓ Test container already exists: $TEST_CONTAINER" + fi + echo +fi + +# Test 6: Wait for Slurm to be ready +echo "Test 6: Waiting for Slurm cluster to be ready..." +if ! retry --times=30 --delay=2 -- scontrol ping >/dev/null 2>&1; then + echo "✗ ERROR: Slurm controller not responding" + exit 1 +fi +echo "✓ Slurm controller is responding" + +# Wait for node to be ready +if ! retry --times=30 --delay=2 -- bash -c 'sinfo -h -o "%T" 2>/dev/null | grep -qE "idle|mixed|alloc"'; then + echo "✗ ERROR: No compute nodes are ready" + echo "Showing sinfo output:" + sinfo + echo + echo "Showing last 50 lines of slurmd logs:" + tail -50 /var/log/slurm/slurmd.log 2>/dev/null || echo "Could not read slurmd logs" + echo + echo "Showing last 50 lines of slurmctld logs:" + tail -50 /var/log/slurm/slurmctld.log 2>/dev/null || echo "Could not read slurmctld logs" + exit 1 +fi +echo "✓ Compute node is ready" +echo + +# Show cluster status +echo "Cluster status:" +sinfo +echo + +# Test 7: Verify job submission works (triggers SPANK plugin) +echo "Test 7: Verifying job submission works..." +# Submit a simple test job to verify Slurm is functional and trigger plugin loading +TEST_JOB_ID=$(sbatch --wrap="echo 'Test job running'; sleep 1" --output=/dev/null 2>&1 | awk '{print $NF}') +if [ -z "$TEST_JOB_ID" ]; then + echo "✗ ERROR: Failed to submit test job" + exit 1 +fi + +# Wait for job to complete +echo " Waiting for job $TEST_JOB_ID to complete..." +retry --times=30 --delay=1 -- bash -c "scontrol show job $TEST_JOB_ID 2>/dev/null | grep -qE 'JobState=(COMPLETED|FAILED|CANCELLED)'" >/dev/null 2>&1 + +JOB_STATE=$(scontrol show job "$TEST_JOB_ID" 2>/dev/null | grep "JobState" | awk '{print $1}' | cut -d= -f2) +if [ "$JOB_STATE" = "COMPLETED" ]; then + echo "✓ Test job completed successfully (JobID: $TEST_JOB_ID)" +elif [ "$JOB_STATE" = "COMPLETING" ]; then + echo "✓ Test job completed (JobID: $TEST_JOB_ID)" +else + echo "✗ ERROR: Test job did not complete properly (State: $JOB_STATE)" + scontrol show job "$TEST_JOB_ID" + exit 1 +fi +echo + +# Test 8: Submit a containerized test job (if container available) +if [ "$SKIP_CONTAINER_TEST" != "true" ]; then + echo "Test 8: Submitting a containerized test job..." 
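+# Generate a throwaway batch script; submitting it with --singularity-container (the option added
+# by this SPANK plugin) runs the whole script inside the test image, so the /etc/os-release it
+# reads is the container's, not the host's.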
+JOB_SCRIPT=$(mktemp /tmp/test_job.XXXXXX.sh) +cat > "$JOB_SCRIPT" <<'JOBEOF' +#!/bin/bash +#SBATCH --job-name=test-singularity +#SBATCH --output=/var/spool/slurm-jobs/test_job_%j.out +#SBATCH --error=/var/spool/slurm-jobs/test_job_%j.err +#SBATCH --partition=debug +#SBATCH --time=00:01:00 +#SBATCH --nodes=1 +#SBATCH --ntasks=1 + +echo "Job started at: $(date)" +echo "Running on node: $(hostname)" +echo "Job ID: $SLURM_JOB_ID" + +# Test command inside container +cat /etc/os-release | grep -i pretty + +echo "Job completed at: $(date)" +JOBEOF + +chmod +x "$JOB_SCRIPT" + +# Submit the job with the container +JOB_ID=$(sbatch --singularity-container="$TEST_CONTAINER" "$JOB_SCRIPT" | awk '{print $NF}') +if [ -n "$JOB_ID" ]; then + echo "✓ Job submitted successfully: Job ID $JOB_ID" +else + echo "✗ ERROR: Failed to submit job" + exit 1 +fi +echo + +# Test 9: Wait for job to complete +echo "Test 9: Waiting for job to complete..." +max_wait=120 +waited=0 +while true; do + JOB_STATE=$(scontrol show job "$JOB_ID" 2>/dev/null | grep "JobState=" | sed 's/.*JobState=\([^ ]*\).*/\1/') + + if [ "$JOB_STATE" = "COMPLETED" ]; then + echo "✓ Job completed successfully" + break + elif [ "$JOB_STATE" = "FAILED" ] || [ "$JOB_STATE" = "CANCELLED" ] || [ "$JOB_STATE" = "TIMEOUT" ]; then + echo "✗ ERROR: Job failed with state: $JOB_STATE" + scontrol show job "$JOB_ID" + exit 1 + elif [ $waited -ge $max_wait ]; then + echo "✗ ERROR: Job did not complete within ${max_wait}s" + scontrol show job "$JOB_ID" + scancel "$JOB_ID" + exit 1 + fi + + echo " Job state: $JOB_STATE (${waited}s/${max_wait}s)" + sleep 3 + waited=$((waited + 3)) +done +echo + +# Test 10: Check job output +echo "Test 10: Checking job output..." +JOB_OUTPUT="/var/spool/slurm-jobs/test_job_${JOB_ID}.out" +if [ -f "$JOB_OUTPUT" ]; then + echo "Job output:" + cat "$JOB_OUTPUT" + echo + + if grep -q "PRETTY_NAME" "$JOB_OUTPUT"; then + echo "✓ Job produced expected output (found PRETTY_NAME)" + else + echo "✗ ERROR: Job output does not contain expected content" + exit 1 + fi +else + echo "✗ ERROR: Job output file not found: $JOB_OUTPUT" + exit 1 +fi +echo + +# Test 11: Run containerized job via srun with multi-argument command +echo "Test 11: Testing srun with multi-argument command (bugfix from v3.2.0)..." +# This tests the fix for properly handling multi-argument commands in containerized srun jobs +SRUN_OUTPUT=$(mktemp /tmp/srun_output.XXXXXX) +if srun --singularity-container="$TEST_CONTAINER" /bin/bash -c 'echo "arg1 arg2 arg3"' > "$SRUN_OUTPUT" 2>&1; then + if grep -q "arg1 arg2 arg3" "$SRUN_OUTPUT"; then + echo "✓ srun multi-argument command executed successfully" + echo " Output: $(cat $SRUN_OUTPUT)" + else + echo "✗ ERROR: srun output does not contain expected content" + echo " Expected: 'arg1 arg2 arg3'" + echo " Got: $(cat $SRUN_OUTPUT)" + rm -f "$SRUN_OUTPUT" + exit 1 + fi +else + echo "✗ ERROR: srun command failed" + echo " Output: $(cat $SRUN_OUTPUT)" + rm -f "$SRUN_OUTPUT" + exit 1 +fi +rm -f "$SRUN_OUTPUT" +echo + +else + echo "Skipping containerized job tests (no singularity/apptainer available)" + echo +fi + +echo "=== All tests passed! 
===" +exit 0 diff --git a/tests/runtime/validate-setup.sh b/tests/runtime/validate-setup.sh new file mode 100755 index 0000000..4891d55 --- /dev/null +++ b/tests/runtime/validate-setup.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# Quick validation script to check the runtime test setup + +set -e + +echo "=== Runtime Test Setup Validation ===" +echo + +# Check 1: Validate Docker is available +echo "Check 1: Docker availability..." +if ! command -v docker >/dev/null 2>&1; then + echo "✗ ERROR: Docker not found" + exit 1 +fi +echo "✓ Docker is available" +echo + +# Check 2: Validate Docker Compose is available +echo "Check 2: Docker Compose availability..." +if ! docker compose version >/dev/null 2>&1; then + echo "✗ ERROR: Docker Compose not found" + exit 1 +fi +echo "✓ Docker Compose is available" +echo + +# Check 3: Validate docker-compose.yml syntax +echo "Check 3: Validating docker-compose.yml..." +cd "$(dirname "$0")" +if ! docker compose config >/dev/null 2>&1; then + echo "✗ ERROR: docker-compose.yml has syntax errors" + exit 1 +fi +echo "✓ docker-compose.yml is valid" +echo + +# Check 4: Validate Dockerfile can be built +echo "Check 4: Building Docker image..." +if ! docker build -f Dockerfile -t slurm-test:validation --build-arg UBUNTU_VERSION=${UBUNTU_VERSION:-noble} ../.. 2>&1 | tail -5; then + echo "✗ ERROR: Failed to build Docker image" + exit 1 +fi +echo "✓ Docker image builds successfully" +echo + +# Check 5: Validate shell scripts syntax +echo "Check 5: Validating shell scripts..." +for script in entrypoint-slurmctld.sh entrypoint-slurmd.sh test-integration.sh run-tests.sh; do + if ! bash -n "$script"; then + echo "✗ ERROR: $script has syntax errors" + exit 1 + fi + echo " ✓ $script syntax is valid" +done +echo + +echo "=== All validation checks passed! ===" +echo +echo "The runtime test infrastructure is ready." +echo "To run the full integration tests, execute: ./run-tests.sh" +echo +exit 0 From db72e0762bda6cfcbef83ce53e9129a42632702c Mon Sep 17 00:00:00 2001 From: Dennis Klein Date: Mon, 3 Nov 2025 18:30:24 +0100 Subject: [PATCH 3/4] ci: add GitHub Actions workflow for runtime tests Add automated runtime testing that runs on every push and pull request: - Tests Slurm 24.11 on Ubuntu 25.04 (plucky) - Uses Docker Buildx with GitHub Actions caching - Runs full integration test suite with live Slurm cluster - Validates plugin installation, job execution, and containerized workloads Updates documentation and adds Runtime Tests badge to README. 
--- .github/workflows/runtime-tests.yml | 37 +++++++++++++++++++++++++++++ README.md | 2 +- tests/README.md | 21 +++++++++++++--- 3 files changed, 56 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/runtime-tests.yml diff --git a/.github/workflows/runtime-tests.yml b/.github/workflows/runtime-tests.yml new file mode 100644 index 0000000..3c17b8b --- /dev/null +++ b/.github/workflows/runtime-tests.yml @@ -0,0 +1,37 @@ +name: Runtime Tests + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +permissions: + contents: read + +jobs: + runtime-test: + name: Slurm ${{ matrix.slurm }} + runs-on: ubuntu-latest + + strategy: + fail-fast: false + matrix: + include: + - ubuntu_version: plucky + slurm: 24.11 + + steps: + - uses: actions/checkout@v4 + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Run runtime integration tests + run: | + cd tests/runtime + UBUNTU_VERSION=${{ matrix.ubuntu_version }} ./run-tests.sh + env: + UBUNTU_VERSION: ${{ matrix.ubuntu_version }} + BUILDX_CACHE_FROM: type=gha,scope=${{ matrix.ubuntu_version }} + BUILDX_CACHE_TO: type=gha,mode=max,scope=${{ matrix.ubuntu_version }} diff --git a/README.md b/README.md index b067d2b..83e1256 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Slurm Singularity SPANK Plugin -[![Build](https://github.com/GSI-HPC/slurm-singularity-exec/actions/workflows/build.yml/badge.svg)](https://github.com/GSI-HPC/slurm-singularity-exec/actions/workflows/build.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10719223.svg)](https://doi.org/10.5281/zenodo.10719223) +[![Build](https://github.com/GSI-HPC/slurm-singularity-exec/actions/workflows/build.yml/badge.svg)](https://github.com/GSI-HPC/slurm-singularity-exec/actions/workflows/build.yml) [![Runtime Tests](https://github.com/GSI-HPC/slurm-singularity-exec/actions/workflows/runtime-tests.yml/badge.svg)](https://github.com/GSI-HPC/slurm-singularity-exec/actions/workflows/runtime-tests.yml) [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.10719223.svg)](https://doi.org/10.5281/zenodo.10719223) The Singularity SPANK plugin provides the users with an interface to launch an application within a Linux container. The plug-in adds multiple command-line diff --git a/tests/README.md b/tests/README.md index 7f451d8..fbfac1f 100644 --- a/tests/README.md +++ b/tests/README.md @@ -46,11 +46,26 @@ bats test_wrapper.bats - Error handling (missing files, invalid paths, special characters) - Command construction (argument ordering, bind mount syntax, environment propagation) +## Runtime Integration Tests + +Runtime tests verify the plugin works with actual Slurm daemons: + +```bash +cd tests/runtime +./run-tests.sh +``` + +See [runtime/README.md](runtime/README.md) for detailed documentation on the Docker Compose architecture, test flow, and troubleshooting. 
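+
+To mirror what the CI workflow does, the Ubuntu base image can be selected via `UBUNTU_VERSION`
+(consumed by `tests/runtime/docker-compose.yml`), for example:
+
+```bash
+cd tests/runtime
+UBUNTU_VERSION=plucky ./run-tests.sh   # same Ubuntu 25.04 base the workflow uses
+```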
+ ## Continuous Integration -GitHub Actions tests on every push/PR with: -- Slurm 23.11 (Ubuntu 24.04 Noble) -- Slurm 24.11 (Ubuntu 25.04 Plucky) +GitHub Actions runs tests on every push/PR: + +| Test Type | Slurm Version | Ubuntu Version | Description | +|-----------|---------------|----------------|-------------| +| Build | 23.11 | 24.04 Noble | Compile-time compatibility check | +| Build | 24.11 | 25.04 Plucky | Compile-time compatibility check | +| Runtime | 24.11 | 25.04 Plucky | Full integration tests with live cluster | ## Writing Tests From 71599d31cc71b6ddd5028b50a517517f1ae33010 Mon Sep 17 00:00:00 2001 From: Dennis Klein Date: Mon, 3 Nov 2025 18:51:31 +0100 Subject: [PATCH 4/4] refactor: deduplicate hard-coded values in test scripts Extract configuration values to the top of each script with environment variable overrides. run-tests.sh configuration (passed to test-integration.sh): - RETRY_TIMES (default: 15) - cluster readiness retry attempts - RETRY_DELAY (default: 2) - delay between retries in seconds - JOB_RETRY_DELAY (default: 1) - delay for job state checks - JOB_MAX_WAIT (default: 120) - maximum job wait time in seconds - JOB_POLL_INTERVAL (default: 3) - job polling interval in seconds - LOG_TAIL_LINES (default: 100) - lines to show in failure logs test-integration.sh configuration (container-specific defaults): - PLUGIN_LIBEXEC_DIR (default: /usr/libexec) - SLURM_SYSCONFDIR (default: /etc/slurm) - SLURM_JOB_SPOOL (default: /var/spool/slurm-jobs) - SLURM_LOG_DIR (default: /var/log/slurm) - SLURM_PARTITION (default: debug) All timing parameters from run-tests.sh are passed to test-integration.sh via docker exec -e flags for consistency. --- tests/runtime/README.md | 39 +++++++++++++++- tests/runtime/run-tests.sh | 38 ++++++++++------ tests/runtime/test-integration.sh | 74 ++++++++++++++++++------------- 3 files changed, 106 insertions(+), 45 deletions(-) diff --git a/tests/runtime/README.md b/tests/runtime/README.md index f252248..0caaf0f 100644 --- a/tests/runtime/README.md +++ b/tests/runtime/README.md @@ -9,7 +9,7 @@ The runtime tests: 2. Build and install the slurm-singularity-exec plugin 3. Verify plugin files are installed (library and configuration) 4. Verify plugin CLI options appear in `sbatch --help` and `srun --help` -5. Verify SPANK plugin loads when jobs run (check slurmd logs) +5. Verify SPANK plugin loads when jobs run (check container logs) 6. Submit and run a containerized test job (if singularity/apptainer is available) ## Docker Compose Architecture @@ -28,7 +28,7 @@ The test infrastructure consists of three services orchestrated by Docker Compos | Volume | Containers | Access | Purpose | |--------|------------|--------|---------| -| `../..` → `/workspace` | All | Read-only (`:z`) | Source code and build scripts | +| `../..` → `/workspace` | All | Read-write (`:z`) | Source code and build scripts | | `plugin-build` | All | Read-write | Shared build artifacts (plugin binaries) | | `slurmctld-state` | slurmctld | Read-write | Controller state persistence | | `slurmd-state` | slurmd | Read-write | Daemon state persistence | @@ -58,6 +58,41 @@ The test infrastructure consists of three services orchestrated by Docker Compos All services communicate via the `slurm-net` bridge network, allowing hostname-based service discovery. 
+## Configuration + +The test infrastructure uses environment variables for configuration, allowing customization without modifying scripts: + +### Timing Configuration (set in run-tests.sh, passed to test-integration.sh) + +| Variable | Default | Description | +|----------|---------|-------------| +| `RETRY_TIMES` | 15 | Number of retry attempts for cluster readiness | +| `RETRY_DELAY` | 2 | Delay in seconds between retry attempts | +| `JOB_RETRY_DELAY` | 1 | Delay in seconds between job state checks | +| `JOB_MAX_WAIT` | 120 | Maximum wait time in seconds for job completion | +| `JOB_POLL_INTERVAL` | 3 | Interval in seconds between job status polls | +| `LOG_TAIL_LINES` | 100 | Number of log lines to show on failure | + +### Container Path Configuration (test-integration.sh only) + +| Variable | Default | Description | +|----------|---------|-------------| +| `PLUGIN_LIBEXEC_DIR` | `/usr/libexec` | Plugin library directory | +| `SLURM_SYSCONFDIR` | `/etc/slurm` | Slurm configuration directory | +| `SLURM_JOB_SPOOL` | `/var/spool/slurm-jobs` | Job output spool directory | +| `SLURM_LOG_DIR` | `/var/log/slurm` | Slurm log directory | +| `SLURM_PARTITION` | `debug` | Default Slurm partition name | + +### Example: Custom Timing + +```bash +# Faster retries for local testing +RETRY_TIMES=5 RETRY_DELAY=1 ./run-tests.sh + +# Longer timeouts for slow environments +JOB_MAX_WAIT=300 ./run-tests.sh +``` + ## Quick Start ```bash diff --git a/tests/runtime/run-tests.sh b/tests/runtime/run-tests.sh index e4c3690..0346ce8 100755 --- a/tests/runtime/run-tests.sh +++ b/tests/runtime/run-tests.sh @@ -7,6 +7,14 @@ set -e SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" cd "$SCRIPT_DIR" +# Configuration - can be overridden via environment variables +: "${RETRY_TIMES:=15}" +: "${RETRY_DELAY:=2}" +: "${JOB_RETRY_DELAY:=1}" +: "${JOB_MAX_WAIT:=120}" +: "${JOB_POLL_INTERVAL:=3}" +: "${LOG_TAIL_LINES:=100}" + echo "::group::Clean up previous containers" docker compose down -v 2>/dev/null || true echo "::endgroup::" @@ -38,26 +46,30 @@ echo "::endgroup::" echo "::group::Wait for services" echo "Waiting for slurmctld to be ready..." -# Give slurmctld up to 30 seconds to start (15 retries * 2 seconds) -RETRIES=15 -DELAY=2 -for i in $(seq 1 $RETRIES); do +# Give slurmctld up to RETRY_TIMES * RETRY_DELAY seconds to start +for i in $(seq 1 $RETRY_TIMES); do if docker compose exec -T slurmctld scontrol ping >/dev/null 2>&1; then - echo "✓ Slurm cluster is ready (attempt $i/$RETRIES)" + echo "✓ Slurm cluster is ready (attempt $i/$RETRY_TIMES)" break fi - if [ $i -eq $RETRIES ]; then - echo "ERROR: slurmctld not ready after $((RETRIES * DELAY)) seconds" + if [ $i -eq $RETRY_TIMES ]; then + echo "ERROR: slurmctld not ready after $((RETRY_TIMES * RETRY_DELAY)) seconds" docker compose logs slurmctld exit 1 fi - sleep $DELAY + sleep $RETRY_DELAY done echo "::endgroup::" echo "::group::Run integration tests" set +e # Temporarily disable exit on error -docker compose exec -T slurmctld /workspace/tests/runtime/test-integration.sh +docker compose exec -T \ + -e RETRY_TIMES="$RETRY_TIMES" \ + -e RETRY_DELAY="$RETRY_DELAY" \ + -e JOB_RETRY_DELAY="$JOB_RETRY_DELAY" \ + -e JOB_MAX_WAIT="$JOB_MAX_WAIT" \ + -e JOB_POLL_INTERVAL="$JOB_POLL_INTERVAL" \ + slurmctld /workspace/tests/runtime/test-integration.sh TEST_EXIT_CODE=$? 
set -e # Re-enable exit on error echo "::endgroup::" @@ -76,12 +88,12 @@ fi # Show logs if tests failed if [ $TEST_EXIT_CODE -ne 0 ]; then - echo "::group::slurmctld logs (last 100 lines)" - docker compose logs --tail=100 slurmctld + echo "::group::slurmctld logs (last $LOG_TAIL_LINES lines)" + docker compose logs --tail="$LOG_TAIL_LINES" slurmctld echo "::endgroup::" - echo "::group::slurmd logs (last 100 lines)" - docker compose logs --tail=100 slurmd + echo "::group::slurmd logs (last $LOG_TAIL_LINES lines)" + docker compose logs --tail="$LOG_TAIL_LINES" slurmd echo "::endgroup::" echo "::group::Container status" diff --git a/tests/runtime/test-integration.sh b/tests/runtime/test-integration.sh index 1ec6079..c478b7d 100755 --- a/tests/runtime/test-integration.sh +++ b/tests/runtime/test-integration.sh @@ -4,22 +4,37 @@ set -e +# Configuration - can be overridden via environment variables +: "${PLUGIN_LIBEXEC_DIR:=/usr/libexec}" +: "${SLURM_SYSCONFDIR:=/etc/slurm}" +: "${SLURM_JOB_SPOOL:=/var/spool/slurm-jobs}" +: "${SLURM_LOG_DIR:=/var/log/slurm}" +: "${SLURM_PARTITION:=debug}" +: "${RETRY_TIMES:=30}" +: "${RETRY_DELAY:=2}" +: "${JOB_RETRY_DELAY:=1}" +: "${JOB_MAX_WAIT:=120}" +: "${JOB_POLL_INTERVAL:=3}" + +PLUGIN_SO="${PLUGIN_LIBEXEC_DIR}/slurm-singularity-exec.so" +PLUGSTACK_CONF="${SLURM_SYSCONFDIR}/plugstack.conf.d/singularity-exec.conf" + echo "=== Slurm Singularity Plugin Runtime Tests ===" echo # Test 1: Verify plugin files are installed echo "Test 1: Verifying plugin installation..." -if [ -f "/usr/libexec/slurm-singularity-exec.so" ]; then - echo "✓ Found plugin library: /usr/libexec/slurm-singularity-exec.so" +if [ -f "$PLUGIN_SO" ]; then + echo "✓ Found plugin library: $PLUGIN_SO" else - echo "✗ ERROR: Plugin library not found at /usr/libexec/slurm-singularity-exec.so" + echo "✗ ERROR: Plugin library not found at $PLUGIN_SO" exit 1 fi -if [ -f "/etc/slurm/plugstack.conf.d/singularity-exec.conf" ]; then - echo "✓ Found plugin config: /etc/slurm/plugstack.conf.d/singularity-exec.conf" +if [ -f "$PLUGSTACK_CONF" ]; then + echo "✓ Found plugin config: $PLUGSTACK_CONF" else - echo "✗ ERROR: Plugin config not found at /etc/slurm/plugstack.conf.d/singularity-exec.conf" + echo "✗ ERROR: Plugin config not found at $PLUGSTACK_CONF" exit 1 fi echo @@ -84,7 +99,7 @@ echo if [ "$SKIP_CONTAINER_TEST" != "true" ]; then echo "Test 5: Creating a test container image..." # Use shared directory so container is accessible from both slurmctld and slurmd - TEST_CONTAINER="/var/spool/slurm-jobs/test-debian.sif" + TEST_CONTAINER="${SLURM_JOB_SPOOL}/test-debian.sif" if [ ! -f "$TEST_CONTAINER" ]; then # Create a minimal Debian container $SINGULARITY_CMD pull "$TEST_CONTAINER" docker://debian:stable-slim @@ -102,23 +117,23 @@ fi # Test 6: Wait for Slurm to be ready echo "Test 6: Waiting for Slurm cluster to be ready..." -if ! retry --times=30 --delay=2 -- scontrol ping >/dev/null 2>&1; then +if ! retry --times="$RETRY_TIMES" --delay="$RETRY_DELAY" -- scontrol ping >/dev/null 2>&1; then echo "✗ ERROR: Slurm controller not responding" exit 1 fi echo "✓ Slurm controller is responding" # Wait for node to be ready -if ! retry --times=30 --delay=2 -- bash -c 'sinfo -h -o "%T" 2>/dev/null | grep -qE "idle|mixed|alloc"'; then +if ! 
retry --times="$RETRY_TIMES" --delay="$RETRY_DELAY" -- bash -c 'sinfo -h -o "%T" 2>/dev/null | grep -qE "idle|mixed|alloc"'; then echo "✗ ERROR: No compute nodes are ready" echo "Showing sinfo output:" sinfo echo echo "Showing last 50 lines of slurmd logs:" - tail -50 /var/log/slurm/slurmd.log 2>/dev/null || echo "Could not read slurmd logs" + tail -50 "${SLURM_LOG_DIR}/slurmd.log" 2>/dev/null || echo "Could not read slurmd logs" echo echo "Showing last 50 lines of slurmctld logs:" - tail -50 /var/log/slurm/slurmctld.log 2>/dev/null || echo "Could not read slurmctld logs" + tail -50 "${SLURM_LOG_DIR}/slurmctld.log" 2>/dev/null || echo "Could not read slurmctld logs" exit 1 fi echo "✓ Compute node is ready" @@ -140,7 +155,7 @@ fi # Wait for job to complete echo " Waiting for job $TEST_JOB_ID to complete..." -retry --times=30 --delay=1 -- bash -c "scontrol show job $TEST_JOB_ID 2>/dev/null | grep -qE 'JobState=(COMPLETED|FAILED|CANCELLED)'" >/dev/null 2>&1 +retry --times="$RETRY_TIMES" --delay="$JOB_RETRY_DELAY" -- bash -c "scontrol show job $TEST_JOB_ID 2>/dev/null | grep -qE 'JobState=(COMPLETED|FAILED|CANCELLED)'" >/dev/null 2>&1 JOB_STATE=$(scontrol show job "$TEST_JOB_ID" 2>/dev/null | grep "JobState" | awk '{print $1}' | cut -d= -f2) if [ "$JOB_STATE" = "COMPLETED" ]; then @@ -158,24 +173,24 @@ echo if [ "$SKIP_CONTAINER_TEST" != "true" ]; then echo "Test 8: Submitting a containerized test job..." JOB_SCRIPT=$(mktemp /tmp/test_job.XXXXXX.sh) -cat > "$JOB_SCRIPT" <<'JOBEOF' +cat > "$JOB_SCRIPT" </dev/null | grep "JobState=" | sed 's/.*JobState=\([^ ]*\).*/\1/') - + if [ "$JOB_STATE" = "COMPLETED" ]; then echo "✓ Job completed successfully" break @@ -204,27 +218,27 @@ while true; do echo "✗ ERROR: Job failed with state: $JOB_STATE" scontrol show job "$JOB_ID" exit 1 - elif [ $waited -ge $max_wait ]; then - echo "✗ ERROR: Job did not complete within ${max_wait}s" + elif [ $waited -ge $JOB_MAX_WAIT ]; then + echo "✗ ERROR: Job did not complete within ${JOB_MAX_WAIT}s" scontrol show job "$JOB_ID" scancel "$JOB_ID" exit 1 fi - - echo " Job state: $JOB_STATE (${waited}s/${max_wait}s)" - sleep 3 - waited=$((waited + 3)) + + echo " Job state: $JOB_STATE (${waited}s/${JOB_MAX_WAIT}s)" + sleep "$JOB_POLL_INTERVAL" + waited=$((waited + JOB_POLL_INTERVAL)) done echo # Test 10: Check job output echo "Test 10: Checking job output..." -JOB_OUTPUT="/var/spool/slurm-jobs/test_job_${JOB_ID}.out" +JOB_OUTPUT="${SLURM_JOB_SPOOL}/test_job_${JOB_ID}.out" if [ -f "$JOB_OUTPUT" ]; then echo "Job output:" cat "$JOB_OUTPUT" echo - + if grep -q "PRETTY_NAME" "$JOB_OUTPUT"; then echo "✓ Job produced expected output (found PRETTY_NAME)" else