From 3dd85b28c819b26a33890f1382366b8d45b5a3c0 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Thu, 29 May 2025 19:10:05 +0200 Subject: [PATCH 01/10] feat: Add improved Docker functionality and cluster workflow from moleworks_ext MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Replace simple Docker setup with comprehensive 2-container architecture - Add production container (ext) for cluster deployment - Add development container (ext-dev) with ROS2 and dual-mode support - Implement unified mount system for optional IsaacLab/RSL-RL mounts - Add cluster workflow scripts for SLURM/PBS job submission - Include mount configuration system with validation - Add container management script (container.sh) for easier usage - Support both root and rootless Docker operation modes - Add comprehensive documentation (MOUNT_SYSTEM_GUIDE.md, DOCKER_ARCHITECTURE.md) - Update .gitignore to exclude user-specific configuration files This brings the IsaacLabExtensionTemplate up to par with the advanced Docker and cluster capabilities developed in moleworks_ext. πŸ€– Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude --- .gitignore | 7 + docker/.env.base | 8 - docker/.env.ext_template-dev.template | 42 ++ docker/.env.ext_template.template | 40 ++ docker/.mount.config.template | 22 + docker/DOCKER_ARCHITECTURE.md | 310 ++++++++++++++ docker/Dockerfile | 21 - docker/Dockerfile.ext | 83 ++++ docker/Dockerfile.ext-dev | 234 ++++++++++ docker/MOUNT_SYSTEM_GUIDE.md | 389 +++++++++++++++++ docker/bashrc | 118 ++++++ docker/cluster/.env.cluster.template | 35 ++ docker/cluster/cluster_interface.sh | 320 ++++++++++++++ docker/cluster/run_singularity.sh | 338 +++++++++++++++ docker/cluster/submit_job_pbs.sh | 30 ++ docker/cluster/submit_job_slurm.sh | 32 ++ docker/cluster/sync_mounts.py | 141 +++++++ docker/container.py | 127 ++++++ docker/container.sh | 274 ++++++++++++ docker/docker-compose.override.yaml.template | 6 + docker/docker-compose.yaml | 189 ++++++++- docker/dynamic_entrypoint.sh | 117 +++++ docker/entrypoint.sh | 7 + docker/mount_config.py | 422 +++++++++++++++++++ docker/utils/__init__.py | 8 + docker/utils/container_interface.py | 295 +++++++++++++ docker/utils/state_file.py | 151 +++++++ docker/utils/x11_utils.py | 227 ++++++++++ docker/x11.yaml | 54 +++ 29 files changed, 4003 insertions(+), 44 deletions(-) delete mode 100644 docker/.env.base create mode 100644 docker/.env.ext_template-dev.template create mode 100644 docker/.env.ext_template.template create mode 100644 docker/.mount.config.template create mode 100644 docker/DOCKER_ARCHITECTURE.md delete mode 100644 docker/Dockerfile create mode 100644 docker/Dockerfile.ext create mode 100644 docker/Dockerfile.ext-dev create mode 100644 docker/MOUNT_SYSTEM_GUIDE.md create mode 100644 docker/bashrc create mode 100644 docker/cluster/.env.cluster.template create mode 100755 docker/cluster/cluster_interface.sh create mode 100755 docker/cluster/run_singularity.sh create mode 100755 docker/cluster/submit_job_pbs.sh create mode 100755 docker/cluster/submit_job_slurm.sh create mode 100755 docker/cluster/sync_mounts.py create mode 100755 docker/container.py create mode 100755 docker/container.sh create mode 100644 docker/docker-compose.override.yaml.template create mode 100755 docker/dynamic_entrypoint.sh create mode 100755 docker/entrypoint.sh create mode 100755 docker/mount_config.py create mode 100644 docker/utils/__init__.py create mode 100644 docker/utils/container_interface.py create mode 100644 docker/utils/state_file.py create mode 100644 docker/utils/x11_utils.py create mode 100644 docker/x11.yaml diff --git a/.gitignore b/.gitignore index 2c4ac5f9..8b1f2be4 100644 --- a/.gitignore +++ b/.gitignore @@ -29,6 +29,13 @@ docker/artifacts/ *.tmp +# Docker user-specific files +docker/.env.ext_template +docker/.env.ext_template-dev +docker/.mount.config +docker/docker-compose.override.yaml +docker/.container.cfg + # Isaac-Sim packman _isaac_sim* _repo diff --git a/docker/.env.base b/docker/.env.base deleted file mode 100644 index a837c476..00000000 --- a/docker/.env.base +++ /dev/null @@ -1,8 +0,0 @@ -### -# General settings -### - -# Isaac Lab base image -ISAACLAB_BASE_IMAGE=isaac-lab-base -# The Isaac Lab Extension Template path in the container -DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH=/workspace/isaaclab_extension_template diff --git a/docker/.env.ext_template-dev.template b/docker/.env.ext_template-dev.template new file mode 100644 index 00000000..725666ac --- /dev/null +++ b/docker/.env.ext_template-dev.template @@ -0,0 +1,42 @@ +# ========================= +# Extension Configuration +# ========================= +EXTENSION_NAME=ext_template +EXTENSION_FOLDER=/path/to/your/project/folder +EXT_PATH=$EXTENSION_FOLDER/$EXTENSION_NAME +DOCKER_EXT_PATH=/workspace/$EXTENSION_NAME + +# ========================= +# Docker User Configuration +# ========================= +HOST_HOME=/home/your_username +DOCKER_USER_NAME=your_username +DOCKER_USER_HOME=/home/your_username + +# ========================= +# Isaac Sim Configuration +# ========================= +DOCKER_ISAACSIM_ROOT_PATH=/isaac-sim + +# ========================= +# Built-in IsaacLab Configuration +# ========================= +# The built-in IsaacLab is always at /workspace/isaaclab +DOCKER_ISAACLAB_PATH=/workspace/isaaclab + +# ========================= +# External Codebase Mounting +# ========================= +# External mounts are now configured via the unified mount system. +# Run './container.sh mount-setup' to configure optional mounts for IsaacLab and RSL-RL. + +# ========================= +# NVIDIA Configuration +# ========================= +ACCEPT_EULA=Y + +# ========================= +# WANDB Configuration (Optional) +# ========================= +# WANDB_API_KEY=your_wandb_api_key +# WANDB_USERNAME=your_wandb_username \ No newline at end of file diff --git a/docker/.env.ext_template.template b/docker/.env.ext_template.template new file mode 100644 index 00000000..860b6a85 --- /dev/null +++ b/docker/.env.ext_template.template @@ -0,0 +1,40 @@ +# ========================= +# Extension Configuration +# ========================= +EXTENSION_NAME=ext_template +EXT_PATH=/path/to/your/extension +DOCKER_EXT_PATH=/workspace/$EXTENSION_NAME + +# ========================= +# Docker User Configuration +# ========================= +DOCKER_USER_NAME=root +DOCKER_USER_HOME=/root + +# ========================= +# Isaac Sim Configuration +# ========================= +DOCKER_ISAACSIM_ROOT_PATH=/isaac-sim + +# ========================= +# Built-in IsaacLab Configuration +# ========================= +# The built-in IsaacLab is always at /workspace/isaaclab +DOCKER_ISAACLAB_PATH=/workspace/isaaclab + +# ========================= +# External Codebase Mounting +# ========================= +# External mounts are now configured via the unified mount system. +# Run './container.sh mount-setup' to configure optional mounts for IsaacLab and RSL-RL. + +# ========================= +# NVIDIA Configuration +# ========================= +ACCEPT_EULA=Y + +# ========================= +# WANDB Configuration (Optional) +# ========================= +# WANDB_API_KEY=your_wandb_api_key +# WANDB_USERNAME=your_wandb_username \ No newline at end of file diff --git a/docker/.mount.config.template b/docker/.mount.config.template new file mode 100644 index 00000000..9f5ad653 --- /dev/null +++ b/docker/.mount.config.template @@ -0,0 +1,22 @@ +{ + "mounts": { + "isaaclab": { + "enabled": false, + "local_path": "/path/to/your/isaaclab", + "cluster_path": "", + "container_path": "/workspace/isaaclab", + "mount_type": "source", + "sync_to_cluster": true, + "description": "External IsaacLab installation (mounts only source/ subdirectory to preserve container's Python environment)" + }, + "rsl_rl": { + "enabled": false, + "local_path": "/path/to/your/rsl_rl", + "cluster_path": "", + "container_path": "/workspace/isaaclab/_isaac_sim/kit/python/lib/python3.10/site-packages/rsl_rl", + "mount_type": "full", + "sync_to_cluster": true, + "description": "External RSL-RL installation (completely overrides built-in version)" + } + } +} \ No newline at end of file diff --git a/docker/DOCKER_ARCHITECTURE.md b/docker/DOCKER_ARCHITECTURE.md new file mode 100644 index 00000000..06095e28 --- /dev/null +++ b/docker/DOCKER_ARCHITECTURE.md @@ -0,0 +1,310 @@ +# Docker Container Architecture + +This document describes the simplified Docker container architecture for IsaacLab extensions. + +## Overview + +The Docker setup has been simplified from 4 containers to 2 containers: + +1. **Production Container** (`isaac-lab-ext`) - For cluster deployment and training +2. **Development Container** (`isaac-lab-ext-dev`) - Unified development container with ROS2 and dual-mode support + +## Container Details + +### 1. Production Container (Dockerfile.ext) + +- **Purpose**: Minimal container for cluster deployment and training +- **Base Image**: `isaac-lab-base` +- **Features**: + - Minimal package installation for reduced size + - Includes rsl_rl for reinforcement learning + - Optimized for training performance + - No development tools or ROS2 + +**Usage**: +```bash +./container.sh -p ext build +./container.sh -p ext run +``` + +### 2. Development Container (Dockerfile.ext-dev) + +- **Purpose**: Unified development environment with all features +- **Base Image**: `isaac-lab-ros2` +- **Features**: + - All ROS2 packages and dependencies + - Development tools (Claude Code, git-lfs, pytest, etc.) + - Pinocchio robotics library + - CUDA toolkit + - Dual-mode support (root/rootless) + +**Usage**: +```bash +# Standard mode (with user switching) +./container.sh -p ext-dev build +./container.sh -p ext-dev run + +# Rootless mode (everyone runs as root) +./container.sh -p ext-dev-rootless run +``` + +## Dual-Mode Operation + +The development container supports two modes of operation: + +### Root Mode (Default) +- Traditional Docker behavior with user switching +- Uses gosu to switch to the host user inside container +- Preserves host user permissions +- Full access to host filesystem via bind mounts + +### Rootless Mode +- Everyone runs as root inside the container +- Simplified permission model +- Reduced host system access +- Suitable for environments where Docker daemon doesn't have root access + +### Mode Selection + +The mode is controlled by the `DOCKER_ROOTLESS_MODE` environment variable: + +```bash +# Force rootless mode +export DOCKER_ROOTLESS_MODE=true +./container.sh -p ext-dev run + +# Or use the rootless service +./container.sh -p ext-dev-rootless run +``` + +## Flexible Permission System + +The development container implements a sophisticated permission management system that handles various deployment scenarios and permission requirements. + +### How It Works + +#### 1. Dynamic User Creation (Root Mode) +When running in root mode (default), the container dynamically creates a user matching your host system: + +```bash +# Automatically detects host user +USER_ID=${LOCAL_UID:-$(id -u)} # Your host UID +GROUP_ID=${LOCAL_GID:-$(id -g)} # Your host GID + +# Creates matching user inside container +groupadd -g $GROUP_ID -o $USER_NAME +useradd -m -u $USER_ID -g $GROUP_ID -o -s /bin/bash $USER_NAME + +# Switches to that user with gosu +exec gosu $USER_NAME bash +``` + +This ensures files created in the container have the same ownership as on your host system. + +#### 2. Rootless Mode Operation +In rootless mode, everyone runs as root inside the container: +- No user switching occurs +- Simplified permission model +- Ideal for Docker installations without root access +- All operations happen as UID 0 within container + +#### 3. Automatic Permission Fixing +The `FIX_PERMISSIONS` feature automatically corrects file ownership when the container exits: + +```bash +# Enable permission fixing +export FIX_PERMISSIONS=true +./docker/run_dev.sh + +# On container exit, automatically runs: +find "/workspace/${EXTENSION_NAME}" -user root -exec chown ${USER_ID}:${GROUP_ID} {} \; +``` + +#### 4. Background Permission Setup +Large directories are fixed in the background to avoid startup delays: +- `/isaac-sim/kit` permissions are corrected asynchronously +- Container is immediately usable while permissions are being fixed +- Check `~/.permissions_done` file to verify completion + +### Usage Scenarios + +#### Scenario 1: Personal Development Machine +```bash +# Standard mode - preserves your user permissions +./docker/run_dev.sh +# Files created as: youruser:yourgroup +``` + +#### Scenario 2: Student PC (No Root Docker) +```bash +# Rootless mode - everyone is root inside +./docker/run_dev.sh --rootless +# Files created as: root:root (inside container) +``` + +#### Scenario 3: Shared Development Server +```bash +# Custom UID/GID with permission fixing +./docker/run_dev.sh -u 2000 -g 2000 --fix-perms +# Files created as: uid=2000:gid=2000 +``` + +#### Scenario 4: CI/CD Pipeline +```bash +# Rootless with no permission concerns +export DOCKER_ROOTLESS_MODE=true +./docker/container.sh -p ext-dev run python scripts/test.py +``` + +### Environment Variables + +| Variable | Default | Description | +|----------|---------|-------------| +| `DOCKER_ROOTLESS_MODE` | `false` | Enable rootless mode (true/false) | +| `FIX_PERMISSIONS` | `false` | Auto-fix permissions on exit | +| `LOCAL_UID` | Current user | Override user ID | +| `LOCAL_GID` | Current group | Override group ID | +| `DOCKER_USER_NAME` | `user` | Username inside container | +| `DOCKER_USER_HOME` | `/home/$USER` | Home directory path | + +### Permission Decision Tree + +``` +Start Container + β”‚ + β”œβ”€ DOCKER_ROOTLESS_MODE=true? + β”‚ β”‚ + β”‚ └─ Yes β†’ Run as root (UID 0) + β”‚ No user switching + β”‚ Simplified permissions + β”‚ + └─ No β†’ Create user with LOCAL_UID/LOCAL_GID + β”‚ + β”œβ”€ Switch to user with gosu + β”‚ + └─ FIX_PERMISSIONS=true? + β”‚ + └─ Yes β†’ Install exit trap + Fix ownership on exit +``` + +## Migration Guide + +### From Old Setup + +If you were using an older multi-container setup: + +1. **ext** β†’ No changes needed +2. **ext-dev** β†’ Use new unified ext-dev +3. **ext-ros2** β†’ Use new unified ext-dev (includes ROS2) +4. **ext-dev-rootless** β†’ Use ext-dev-rootless service + +### Building Containers + +```bash +# Build production container +./container.sh -p ext build + +# Build development container +./container.sh -p ext-dev build +``` + +### Environment Files + +The following environment files are still used: +- `.env.ext_template` - Production container +- `.env.ext_template-dev` - Development container + +## Container Features Comparison + +| Feature | Production (ext) | Development (ext-dev) | +|---------|-----------------|---------------------| +| Base Image | isaac-lab-base | isaac-lab-ros2 | +| ROS2 | ❌ | βœ… | +| Claude Code | ❌ | βœ… | +| Development Tools | ❌ | βœ… | +| Pinocchio | ❌ | βœ… | +| CUDA Toolkit | ❌ | βœ… | +| Size | Minimal | Full | +| Dual-Mode | ❌ | βœ… | + +## Best Practices + +1. **Use Production Container for**: + - Cluster training jobs + - Performance testing + - Deployment scenarios + +2. **Use Development Container for**: + - Local development + - ROS2 integration work + - Testing and debugging + - Running with different permission models + +3. **Permission Management**: + - Enable `FIX_PERMISSIONS` when working with mounted volumes + - Use rootless mode on systems without root Docker access + - Check `.permissions_done` file to verify background tasks completed + +## Troubleshooting + +### Common Permission Issues and Solutions + +#### Issue 1: "Permission denied" errors +**Symptom**: Can't write to mounted directories +```bash +# Solution 1: Use rootless mode +./docker/run_dev.sh --rootless + +# Solution 2: Enable permission fixing +./docker/run_dev.sh --fix-perms + +# Solution 3: Manually fix permissions +sudo chown -R $(id -u):$(id -g) /workspace/ext_template +``` + +#### Issue 2: Files created as root on host +**Symptom**: After running container, files are owned by root +```bash +# Prevention: Always use FIX_PERMISSIONS +export FIX_PERMISSIONS=true +./docker/run_dev.sh + +# Fix existing files +sudo chown -R $(id -u):$(id -g) . +``` + +#### Issue 3: Container startup is slow +**Symptom**: Long wait before container is usable +```bash +# Check if permissions are still being fixed +./docker/container.sh -p ext-dev exec cat ~/.permissions_done +# If file exists, background setup is complete +``` + +#### Issue 4: Can't access GPU in rootless mode +**Symptom**: nvidia-smi fails in container +```bash +# Ensure NVIDIA_DRIVER_CAPABILITIES is set +export NVIDIA_DRIVER_CAPABILITIES=all +./docker/run_dev.sh --rootless +``` + +### Rootless Mode Issues +```bash +# Verify rootless mode is active +echo $DOCKER_ROOTLESS_MODE + +# Check container user +./container.sh -p ext-dev-rootless exec whoami +# Should output: root +``` + +### Build Issues +```bash +# Clean build with no cache +docker compose build --no-cache isaac-lab-ext-dev + +# Remove old images +docker image prune -f +``` \ No newline at end of file diff --git a/docker/Dockerfile b/docker/Dockerfile deleted file mode 100644 index 1785c440..00000000 --- a/docker/Dockerfile +++ /dev/null @@ -1,21 +0,0 @@ -ARG ISAACLAB_BASE_IMAGE_ARG - -# we use the basic isaaclab image as the base -FROM ${ISAACLAB_BASE_IMAGE_ARG} AS base - -ARG DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH_ARG -ENV DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH=${DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH_ARG} - -USER root - -# Copy the Isaac Lab Extension Template directory (files to exclude are defined in .dockerignore) -COPY ../ ${DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH} - -# # Install whatever you need as additional dependencies. -RUN bash -i -c "source ${HOME}/.bashrc && \ - cd ${DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH}/source/ext_template && \ - pip install -e ." - -# make working directory as the Isaac Lab directory -# this is the default directory when the container is run -WORKDIR /workspace diff --git a/docker/Dockerfile.ext b/docker/Dockerfile.ext new file mode 100644 index 00000000..a0233e49 --- /dev/null +++ b/docker/Dockerfile.ext @@ -0,0 +1,83 @@ +# we use the basic isaaclab image as the base +FROM isaac-lab-base AS base + +# Declare build arguments +ARG EXTENSION_NAME_ARG +ARG DOCKER_EXT_PATH_ARG +ARG DOCKER_USER_NAME_ARG +ARG DOCKER_USER_HOME_ARG + +# Set environment variables +ENV EXTENSION_NAME=${EXTENSION_NAME_ARG} +ENV DOCKER_EXT_PATH=${DOCKER_EXT_PATH_ARG} +ENV DOCKER_USER_NAME=${DOCKER_USER_NAME_ARG} +ENV DOCKER_USER_HOME=${DOCKER_USER_HOME_ARG} + +# Set the home directory for the user +ENV HOME=${DOCKER_USER_HOME} + +# Create necessary directories for the extension with proper permissions +RUN mkdir -p ${DOCKER_EXT_PATH}/data && \ + mkdir -p ${DOCKER_EXT_PATH}/logs && \ + chmod -R 777 ${DOCKER_EXT_PATH} + + +# Copy the entire extension directory +COPY --chown=root:root . ${DOCKER_EXT_PATH} + +# Install required build dependencies +RUN --mount=type=cache,target=/root/.cache/pip \ + /workspace/isaaclab/isaaclab.sh -p -m pip install toml setuptools wheel build + +# Install extension with explicit setup dependencies +RUN --mount=type=cache,target=/root/.cache/pip \ + cd ${DOCKER_EXT_PATH} && \ + /workspace/isaaclab/isaaclab.sh -p -m pip install -e source/${EXTENSION_NAME} --no-build-isolation + +# Create required symlinks for sensors +RUN mkdir -p /workspace/isaaclab/source/exts/ && \ + ln -s /workspace/isaaclab/_isaac_sim/exts/isaacsim.sensors.rtx /workspace/isaaclab/source/exts/isaacsim.sensors.rtx + +# Clone and Install rsl_rl +RUN git clone https://github.com/leggedrobotics/rsl_rl.git /tmp/rsl_rl && \ + cd /tmp/rsl_rl && \ + ${ISAACLAB_PATH}/isaaclab.sh -p -m pip install . + +# (Do not remove the DOCKER_EXT_PATH folder; keep it for the bind mount) + +# System packages installation +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + figlet \ + gosu \ + apt-utils \ + libeigen3-dev \ + locate \ + wget \ + pkg-config \ + dialog \ + tasksel \ + curl \ + python3-pip \ + rsync + +# Python packages installation +RUN ${ISAACLAB_PATH}/_isaac_sim/python.sh -m pip install warp-lang ruamel.yaml + +#== +# Environment +#== +COPY docker/bashrc /home/bash.bashrc +RUN chmod a+rwx /home/bash.bashrc +COPY docker/entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# Set up Python alias +RUN echo "alias python3='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc && \ + echo "alias python='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc + +# Set working directory +WORKDIR ${DOCKER_USER_HOME} + +# Set the entry point +ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/docker/Dockerfile.ext-dev b/docker/Dockerfile.ext-dev new file mode 100644 index 00000000..96a929bd --- /dev/null +++ b/docker/Dockerfile.ext-dev @@ -0,0 +1,234 @@ +# Unified Development Container with ROS2 and Dual-Mode Support +# Supports both root and rootless operation modes +FROM isaac-lab-ros2 AS base + +# ========================= +# Build Arguments and ENV +# ========================= +ARG EXTENSION_NAME_ARG +ARG EXT_PATH_ARG +ARG DOCKER_EXT_PATH_ARG +ARG DOCKER_USER_NAME_ARG +ARG DOCKER_USER_HOME_ARG + +ENV EXT_PATH=${EXT_PATH_ARG} \ + EXTENSION_NAME=${EXTENSION_NAME_ARG} \ + DOCKER_EXT_PATH=${DOCKER_EXT_PATH_ARG} \ + DOCKER_USER_NAME=${DOCKER_USER_NAME_ARG} \ + DOCKER_USER_HOME=${DOCKER_USER_HOME_ARG} \ + HOME=${DOCKER_USER_HOME} + +# ========================= +# Create User (for root mode) +# ========================= +RUN useradd -d ${DOCKER_USER_HOME} -s /bin/bash ${DOCKER_USER_NAME} || true + +# ========================= +# Install System Dependencies +# ========================= +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + figlet \ + gosu \ + apt-utils \ + libeigen3-dev \ + locate \ + wget \ + pkg-config \ + dialog \ + tasksel \ + curl \ + python3-pip \ + git \ + git-lfs \ + gnupg2 \ + tmux \ + libopen3d-dev \ + software-properties-common \ + rsync \ + python3-colcon-common-extensions \ + python3-colcon-mixin && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + rm -f cuda-keyring_1.0-1_all.deb && \ + apt-get update && \ + apt-get install -y --no-install-recommends cuda-toolkit-11-8 && \ + rm -rf /var/lib/apt/lists/* + +# ========================= +# Configure Git LFS +# ========================= +RUN git lfs install + +# ========================= +# Remove Conflicting Eigen Packages +# ========================= +RUN apt-get remove -y ros-humble-eigen3-cmake-module && \ + apt-get update && \ + apt-get install -y --no-install-recommends libeigen3-dev + +# ========================= +# Install Pinocchio via Robotpkg +# ========================= +RUN apt-get update && \ + apt-get install -y --no-install-recommends lsb-release curl && \ + mkdir -p /etc/apt/keyrings && \ + curl http://robotpkg.openrobots.org/packages/debian/robotpkg.asc \ + | tee /etc/apt/keyrings/robotpkg.asc && \ + echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/robotpkg.asc] http://robotpkg.openrobots.org/packages/debian/pub $(lsb_release -cs) robotpkg" \ + | tee /etc/apt/sources.list.d/robotpkg.list && \ + apt-get update && \ + apt-get install -y --no-install-recommends robotpkg-py3*-pinocchio +RUN rm -f /etc/apt/sources.list.d/robotpkg.list + +# ========================= +# Set CUDA Environment Variables +# ========================= +ENV PATH=/usr/local/cuda-11.8/bin${PATH:+:${PATH}} \ + LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + +# ========================= +# Install Colcon Mixin +# ========================= +RUN colcon mixin add default https://raw.githubusercontent.com/colcon/colcon-mixin-repository/master/index.yaml && \ + colcon mixin update default + +# ========================= +# Install Python Dependencies +# ========================= +RUN python3 -m pip install --no-cache-dir \ + simple-parsing \ + cupy-cuda11x \ + scipy \ + shapely \ + ros2-numpy \ + panda3d_viewer \ + ruamel.yaml \ + --upgrade transforms3d \ + torch + +# ========================= +# Install Development Tools +# ========================= +RUN ${ISAACLAB_PATH}/_isaac_sim/python.sh -m pip install --no-cache-dir \ + ruff \ + black \ + mypy \ + bandit \ + vulture \ + pre-commit \ + pytest \ + pytest-cov \ + warp-lang + +# ========================= +# Install Node.js and Claude Code +# ========================= +RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ + apt-get install -y nodejs && \ + npm config set registry https://registry.npmjs.org/ && \ + npm install -g @anthropic-ai/claude-code && \ + rm -rf /var/lib/apt/lists/* + +# ========================= +# Install ROS2 Packages +# ========================= +RUN rm -f /etc/apt/sources.list.d/deb.sury.org.list && \ + apt-get update && \ + apt-get install -y --allow-downgrades libbrotli1=1.0.9-2build6 && \ + apt-get install -y \ + libfontconfig1-dev libfreetype6-dev \ + ros-humble-xacro \ + ros-humble-vision-opencv \ + ros-humble-joint-state-publisher-gui \ + ros-humble-filters \ + ros-humble-nav2-msgs \ + ros-humble-tf-transformations \ + ros-humble-gazebo-ros-pkgs \ + ros-humble-gazebo-plugins \ + ros-humble-turtlebot3* \ + ros-humble-turtlebot3-simulations \ + ros-humble-octomap-msgs \ + ros-humble-octomap \ + ros-humble-octomap-rviz-plugins \ + ros-humble-octomap-server \ + ros-humble-tf2 \ + ros-humble-tf2-geometry-msgs \ + ros-humble-tf2-sensor-msgs \ + ros-humble-rqt-graph \ + ros-humble-pcl-ros \ + ros-humble-nav2-costmap-2d \ + ros-humble-robot-state-publisher \ + ros-humble-rviz-common \ + ros-humble-rviz2 \ + ros-humble-zenoh-cpp-vendor && \ + rm -rf /var/lib/apt/lists/* + +# ========================= +# Setup Extension Directories +# ========================= +RUN mkdir -p ${DOCKER_EXT_PATH}/data ${DOCKER_EXT_PATH}/logs && \ + chmod -R 777 ${DOCKER_EXT_PATH} + +# ========================= +# Copy and Install Extension +# ========================= +COPY --chown=root:root source/${EXTENSION_NAME} ${DOCKER_EXT_PATH}/source/${EXTENSION_NAME} +RUN --mount=type=cache,target=/root/.cache/pip \ + cd ${DOCKER_EXT_PATH} && \ + ${ISAACLAB_PATH}/isaaclab.sh -p -m pip install -e source/${EXTENSION_NAME} + +# ========================= +# Clean Up Extension Directory +# ========================= +RUN rm -rf ${DOCKER_EXT_PATH}/source/${EXTENSION_NAME} + +# ========================= +# Create Symlinks for Sensors +# ========================= +RUN mkdir -p /workspace/isaaclab/source/exts/ && \ + ln -s /workspace/isaaclab/_isaac_sim/exts/isaacsim.sensors.rtx /workspace/isaaclab/source/exts/isaacsim.sensors.rtx + +# ========================= +# Clone and Install rsl_rl +# ========================= +RUN git clone https://github.com/leggedrobotics/rsl_rl.git /tmp/rsl_rl && \ + cd /tmp/rsl_rl && \ + ${ISAACLAB_PATH}/isaaclab.sh -p -m pip install . && \ + rm -rf /tmp/rsl_rl + +# ========================= +# Set Ownership and Permissions +# ========================= +# Make kit directory accessible to all users +RUN chmod -R 777 /isaac-sim/kit + +# Create alternative home for rootless mode +RUN mkdir -p /root/project && chmod 777 /root/project + +# ========================= +# Environment Setup +# ========================= +COPY docker/bashrc /home/bash.bashrc +RUN chmod a+rwx /home/bash.bashrc + +# Copy bashrc also to root for rootless mode +RUN cp /home/bash.bashrc /root/.bashrc + +# Dynamic entrypoint that handles both modes +COPY docker/dynamic_entrypoint.sh /entrypoint.sh +RUN chmod +x /entrypoint.sh + +# ========================= +# Set Up Python Aliases +# ========================= +RUN echo "alias python3='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc && \ + echo "alias python='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc && \ + echo "alias python3='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /root/.bashrc && \ + echo "alias python='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /root/.bashrc + +# ========================= +# Final Configuration +# ========================= +WORKDIR ${DOCKER_USER_HOME} +ENTRYPOINT ["/entrypoint.sh"] \ No newline at end of file diff --git a/docker/MOUNT_SYSTEM_GUIDE.md b/docker/MOUNT_SYSTEM_GUIDE.md new file mode 100644 index 00000000..fa67b7d6 --- /dev/null +++ b/docker/MOUNT_SYSTEM_GUIDE.md @@ -0,0 +1,389 @@ +# Unified Mount System Guide + +## Overview + +The unified mount system provides a consistent and flexible way to optionally mount external codebases (IsaacLab and RSL-RL) in both Docker and Singularity containers. This solves the challenge of making mounts optional while maintaining compatibility across container runtimes. + +## Key Features + +- **Unified Configuration**: Single configuration file (`.mount.config`) that works for both Docker and Singularity +- **Optional Mounting**: Easily enable/disable mounts without modifying docker-compose files +- **Validation**: Automatic validation of mount paths before container startup +- **Interactive Setup**: User-friendly setup process for configuring mounts +- **Backward Compatibility**: Legacy environment variables still work for cluster operations + +## Quick Start + +### 1. Initial Setup + +Run the interactive setup to configure your mounts: + +```bash +cd docker +./container.sh mount-setup +``` + +This will: +- Create a `.mount.config` file with your mount preferences +- Validate the paths you provide +- Generate `docker-compose.override.yaml` automatically + +### 2. Running Containers + +Use the new container management script: + +```bash +# Run development container +./container.sh -p ext-dev run + +# Run with a specific script +./container.sh -p ext-ros2 run python scripts/train.py + +# Build container +./container.sh -p ext build +``` + +## Configuration File + +The `.mount.config` file stores your mount preferences: + +```json +{ + "mounts": { + "isaaclab": { + "enabled": false, + "local_path": "/path/to/your/isaaclab", + "container_path": "/workspace/isaaclab", + "mount_type": "source", + "description": "External IsaacLab installation" + }, + "rsl_rl": { + "enabled": false, + "local_path": "/path/to/your/rsl_rl", + "container_path": "/workspace/isaaclab/_isaac_sim/kit/python/lib/python3.10/site-packages/rsl_rl", + "mount_type": "full", + "description": "External RSL-RL installation" + } + } +} +``` + +### Mount Types + +- **source**: For IsaacLab - mounts only the `source/` subdirectory to preserve the container's Python environment +- **full**: For RSL-RL - completely overrides the built-in version + +## Mount Management Commands + +### Setup and Configuration + +```bash +# Interactive setup +./container.sh mount-setup + +# Show current configuration +./container.sh mount-show +./container.sh mount-show --format yaml +./container.sh mount-show --format docker +./container.sh mount-show --format singularity + +# Validate configuration +./container.sh mount-validate +``` + +### Enable/Disable Mounts + +```bash +# Enable a mount +./container.sh mount-enable isaaclab +./container.sh mount-enable rsl_rl + +# Disable a mount +./container.sh mount-disable isaaclab +./container.sh mount-disable rsl_rl +``` + +### Set Mount Paths + +```bash +# Set mount path +./container.sh mount-set isaaclab ~/my-isaaclab +./container.sh mount-set rsl_rl ~/my-rsl-rl +``` + +## Docker Usage + +The system automatically generates `docker-compose.override.yaml` based on your mount configuration. This file is used alongside the main `docker-compose.yaml`. + +### Manual Override Regeneration + +If you modify `.mount.config` directly, regenerate the override: + +```bash +./container.sh -r -p ext-dev run +# or +python3 mount_config.py generate +``` + +## Cluster/Singularity Usage + +The system supports both syncing from local to cluster and mount-only modes. + +### Mount Modes + +1. **Sync Mode** (default): Syncs codebase from local to cluster, then mounts +2. **Mount-Only Mode**: Mounts existing codebase on cluster without syncing + +### Basic Cluster Workflow (Sync Mode) + +1. Configure mounts locally: + ```bash + cd docker + ./container.sh mount-setup + # Enable mounts and set local paths + ``` + +2. Push container to cluster: + ```bash + cd cluster + ./cluster_interface.sh push ext_template + ``` + +3. Submit job (codebases are synced and mounted automatically): + ```bash + ./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 + ``` + +### Mount-Only Mode (No Sync) + +Perfect for when codebases already exist on the cluster: + +```bash +# Configure mount-only for IsaacLab +./container.sh mount-enable isaaclab +./container.sh mount-set-sync isaaclab off +./container.sh mount-set-cluster isaaclab /cluster/home/$USER/isaaclab + +# Configure mount-only for RSL-RL +./container.sh mount-enable rsl_rl +./container.sh mount-set-sync rsl_rl off +./container.sh mount-set-cluster rsl_rl /cluster/home/$USER/rsl_rl + +# Submit job (no sync, just mount) +cd cluster +./cluster_interface.sh job ext_template --task YourTask +``` + +### Mixed Mode Example + +Sync IsaacLab but mount existing RSL-RL: + +```bash +# IsaacLab: sync from local +./container.sh mount-enable isaaclab +./container.sh mount-set isaaclab ~/my-isaaclab +./container.sh mount-set-sync isaaclab on + +# RSL-RL: mount existing on cluster +./container.sh mount-enable rsl_rl +./container.sh mount-set-sync rsl_rl off +./container.sh mount-set-cluster rsl_rl /cluster/scratch/$USER/rsl_rl +``` + +## Advanced Usage + +### Using Python API + +```python +from mount_config import MountConfig + +# Load configuration +config = MountConfig() + +# Enable a mount programmatically +config.config["mounts"]["isaaclab"]["enabled"] = True +config.config["mounts"]["isaaclab"]["local_path"] = "/path/to/isaaclab" +config.save_config() + +# Generate docker-compose override +config.generate_docker_compose_override() + +# Get Singularity bind string +binds = config.get_singularity_binds() +print(binds) # -B /path/to/isaaclab/source:/workspace/isaaclab/source:rw +``` + +### Custom Mount Profiles + +You can create different mount configurations for different scenarios: + +```bash +# Save current config +cp .mount.config .mount.config.backup + +# Create development config +./container.sh mount-setup +cp .mount.config .mount.config.dev + +# Create production config +./container.sh mount-setup +cp .mount.config .mount.config.prod + +# Switch between configs +cp .mount.config.dev .mount.config +./container.sh -r -p ext-dev run +``` + +## Migration from Old System + +### Docker + +Old docker-compose.yaml approach: +```yaml +- type: bind + source: ${EXTERNAL_ISAACLAB_PATH:-/dev/null}/source + target: /workspace/isaaclab/source +``` + +New approach: +1. Remove external mount lines from docker-compose.yaml +2. Run `./container.sh mount-setup` +3. Use `./container.sh` to manage containers + +### Environment Files + +Old `.env` approach: +```bash +EXTERNAL_ISAACLAB_PATH=/path/to/isaaclab +EXTERNAL_RSL_RL_PATH=/path/to/rsl_rl +``` + +New approach: +- These variables are no longer needed in `.env` files +- Configuration is stored in `.mount.config` +- Use `./container.sh mount-set` to update paths + +## Troubleshooting + +### Mount Not Working + +1. Check configuration: + ```bash + ./container.sh mount-validate + ``` + +2. Verify override file exists: + ```bash + ls -la docker-compose.override.yaml + ``` + +3. Regenerate override: + ```bash + ./container.sh -r -p ext-dev run + ``` + +### Path Validation Errors + +- **IsaacLab**: Ensure the path contains a `source/` subdirectory +- **RSL-RL**: Ensure the path is either: + - A Python package with `__init__.py` + - A repository with `rsl_rl/` subdirectory + +### Container Can't Find Mounted Code + +1. Check mount is enabled: + ```bash + ./container.sh mount-show --format docker + ``` + +2. Verify paths inside container: + ```bash + ./container.sh -p ext-dev exec ls -la /workspace/isaaclab/source + ``` + +## Best Practices + +1. **Always Use Interactive Setup**: The `mount-setup` command validates paths and prevents common errors + +2. **Test Locally First**: Verify mounts work in Docker before pushing to cluster + +3. **Keep Built-in Versions**: When possible, use the built-in IsaacLab and RSL-RL for stability + +4. **Document Custom Setups**: If using external codebases, document the specific versions/branches required + +5. **Version Control**: Don't commit `.mount.config` or `docker-compose.override.yaml` - they're user-specific + +## Technical Details + +### How It Works + +1. **Configuration**: User preferences stored in `.mount.config` (JSON format) +2. **Docker**: `mount_config.py` generates `docker-compose.override.yaml` with bind mounts +3. **Singularity**: `run_singularity.sh` reads `.mount.config` and adds `-B` bind flags +4. **Validation**: Paths are validated before container startup to prevent runtime errors + +### Docker Compose Override Mechanism + +The system uses Docker Compose's built-in override feature to cleanly manage user-specific mounts: + +#### How Override Files Work + +When you run any docker-compose command, Docker Compose automatically: +1. Reads `docker-compose.yaml` (base configuration) +2. Looks for `docker-compose.override.yaml` in the same directory +3. **Merges** the configurations, with override values taking precedence + +#### Example Merge Process + +**Base `docker-compose.yaml`:** +```yaml +services: + isaac-lab-ext-dev: + image: isaac-lab-ext_template-dev + volumes: + - type: bind + source: ${EXT_PATH} + target: /workspace/${EXTENSION_NAME} +``` + +**Generated `docker-compose.override.yaml`:** +```yaml +services: + isaac-lab-ext-dev: + volumes: + - type: bind + source: /home/user/isaaclab/source + target: /workspace/isaaclab/source +``` + +**Result:** The container gets ALL volumes from both files merged together. + +#### Why Use Override? + +1. **Separation of Concerns**: Base config (tracked) vs user mounts (ignored) +2. **No Manual Editing**: Users never modify docker-compose.yaml +3. **Clean Git History**: No merge conflicts from different mount preferences +4. **Easy Disable**: Just delete override file to remove all custom mounts +5. **Standard Feature**: Works with all docker-compose commands automatically + +### File Structure + +``` +docker/ +β”œβ”€β”€ mount_config.py # Core mount management script +β”œβ”€β”€ container.sh # User-friendly wrapper script +β”œβ”€β”€ .mount.config # User's mount configuration (git-ignored) +β”œβ”€β”€ .mount.config.template # Template for new users +β”œβ”€β”€ docker-compose.yaml # Clean compose file without external mounts +β”œβ”€β”€ docker-compose.override.yaml # Generated mount overrides (git-ignored) +└── cluster/ + β”œβ”€β”€ run_singularity.sh # Updated to support unified config + └── sync_mounts.py # Handles selective syncing to cluster +``` + +### Integration Points + +- **Docker Compose**: Uses override mechanism to add mounts without modifying base file +- **Singularity**: Parses config file and generates bind mount arguments +- **Cluster Sync**: `.mount.config` is automatically included when syncing to cluster +- **Selective Sync**: Only syncs codebases marked with `sync_to_cluster: true` \ No newline at end of file diff --git a/docker/bashrc b/docker/bashrc new file mode 100644 index 00000000..e96ded13 --- /dev/null +++ b/docker/bashrc @@ -0,0 +1,118 @@ +# IsaacLab Extension Docker Environment Configuration + +export DOCKER=1 + +case $- in + *i*) ;; + *) return;; +esac + +HISTCONTROL=ignoreboth +shopt -s histappend +HISTSIZE=1000 +HISTFILESIZE=2000 +shopt -s checkwinsize +[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)" + +if [ -z "${debian_chroot:-}" ] && [ -r /etc/debian_chroot ]; then + debian_chroot=$(cat /etc/debian_chroot) +fi + +case "$TERM" in + xterm-color|*-256color) color_prompt=yes;; +esac + +force_color_prompt=yes + +if [ -n "$force_color_prompt" ]; then + if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then + color_prompt=yes + else + color_prompt= + fi +fi + +if [ "$color_prompt" = yes ]; then + PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\] ' +else + PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w ' +fi +unset color_prompt force_color_prompt + +case "$TERM" in +xterm*|rxvt*) + PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1" + ;; +*) + ;; +esac + +if [ -x /usr/bin/dircolors ]; then + test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)" + alias ls='ls --color=auto' + alias grep='grep --color=auto' + alias fgrep='fgrep --color=auto' + alias egrep='egrep --color=auto' +fi + +alias ll='ls -alF' +alias la='ls -A' +alias l='ls -CF' +alias alert='notify-send --urgency=low -i "$([ $? = 0 ] && echo terminal || echo error)" "$(history|tail -nsed -e '\''s/^\s*[0-9]\+\s*//;s/[;&|]\s*alert$//'\'')"' + +if [ -f ~/.bash_aliases ]; then + . ~/.bash_aliases +fi + +if ! shopt -oq posix; then + if [ -f /usr/share/bash-completion/bash_completion ]; then + . /usr/share/bash-completion/bash_completion + elif [ -f /etc/bash_completion ]; then + . /etc/bash_completion + fi +fi + +if [[ ! -d ~/.fzf ]] +then + git clone --depth 1 https://github.com/junegunn/fzf.git ~/.fzf && ~/.fzf/install --all +fi + +[ -f ~/.fzf.bash ] && source ~/.fzf.bash + +parse_git_branch() { + git branch 2> /dev/null | sed -e '/^[^*]/d' -e 's/* \(.*\)/(\1)/' +} + +export PS1=$PS1"\[\e[91m\]\$(parse_git_branch)\[\e[00m\]$ " +export PS1="(D) "$PS1 + +# Claude Code available - use 'claude' command for AI coding assistance +echo "πŸ€– Claude Code is available! Type 'claude' to start AI coding assistance." + +FILE_ROS2=/opt/ros/humble/setup.bash +source $FILE_ROS2 + +export PATH=/usr/local/cuda/bin${PATH:+:${PATH}} +export LD_LIBRARY_PATH=/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} + +# Robotpkg environment variables +export PATH=/opt/openrobots/bin:$PATH +export PKG_CONFIG_PATH=/opt/openrobots/lib/pkgconfig:$PKG_CONFIG_PATH +export LD_LIBRARY_PATH=/opt/openrobots/lib:$LD_LIBRARY_PATH +export PYTHONPATH=/opt/openrobots/lib/python3.10/site-packages:$PYTHONPATH +export CMAKE_PREFIX_PATH=/opt/openrobots:$CMAKE_PREFIX_PATH +export PYTHONPATH="/workspace/isaaclab/_isaac_sim/kit/python/lib/python3.10/site-packages:$PYTHONPATH" +# prevent the use of user packages +export PYTHONNOUSERSITE=1 + +__conda_setup="$('/software/conda/bin/conda' 'shell.bash' 'hook' 2> /dev/null)" +if [ $? -eq 0 ]; then + eval "$__conda_setup" +else + if [ -f "/software/conda/etc/profile.d/conda.sh" ]; then + . "/software/conda/etc/profile.d/conda.sh"cd + else + export PATH="/software/conda/bin:$PATH" + fi +fi +unset __conda_setup \ No newline at end of file diff --git a/docker/cluster/.env.cluster.template b/docker/cluster/.env.cluster.template new file mode 100644 index 00000000..302bcb8f --- /dev/null +++ b/docker/cluster/.env.cluster.template @@ -0,0 +1,35 @@ +### +# Cluster specific settings +### + +# Job scheduler used by cluster. +# Currently supports PBS and SLURM +CLUSTER_USER= +EXTENSION_NAME=ext_template + +CLUSTER_JOB_SCHEDULER=SLURM +# Docker cache dir for Isaac Sim (has to end on docker-isaac-sim) +# e.g. /cluster/scratch/$USER/docker-isaac-sim +CLUSTER_ISAAC_SIM_CACHE_DIR=/cluster/scratch/$CLUSTER_USER/docker-isaac-sim +# Main cluster directory for the extension +CLUSTER_ISAACLAB_DIR=/cluster/home/$CLUSTER_USER/$EXTENSION_NAME +# Cluster login +CLUSTER_LOGIN=$CLUSTER_USER@ +# Cluster scratch directory to store the SIF file +# e.g. /cluster/scratch/$USER +CLUSTER_SIF_PATH=/cluster/work/rsl/$CLUSTER_USER +# Remove the temporary isaaclab code copy after the job is done +REMOVE_CODE_COPY_AFTER_JOB=false +# Python executable within Isaac Lab directory to run with the submitted job +CLUSTER_PYTHON_EXECUTABLE=scripts/rsl_rl/train.py + +# External Codebase Mounting +# External mounts are now configured via the unified mount system. +# Before submitting cluster jobs, configure mounts locally: +# cd docker +# ./container.sh mount-setup +# The .mount.config file will be automatically synced to the cluster. + +# WANDB (optional) +WANDB_API_KEY="" +WANDB_USERNAME="" \ No newline at end of file diff --git a/docker/cluster/cluster_interface.sh b/docker/cluster/cluster_interface.sh new file mode 100755 index 00000000..cb677fc7 --- /dev/null +++ b/docker/cluster/cluster_interface.sh @@ -0,0 +1,320 @@ +#!/usr/bin/env bash + +#== +# Configurations +#== + +# Exits if error occurs +set -e + +# Set tab-spaces +tabs 4 + +# get script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# Load custom environment paths if provided, else use defaults +ENV_CLUSTER_PATH=${ENV_CLUSTER_PATH:-"$SCRIPT_DIR/.env.cluster"} +# Codebase to sync +CODEBASE_PATH=${CODEBASE_PATH:-"$SCRIPT_DIR/../.."} + +# Debug output (comment out for production) +# echo "[DEBUG] SCRIPT_DIR: $SCRIPT_DIR" +# echo "[DEBUG] ENV_CLUSTER_PATH: $ENV_CLUSTER_PATH" +# echo "[DEBUG] CODEBASE_PATH: $CODEBASE_PATH" + +# Source the environment file +if [ -f "$ENV_CLUSTER_PATH" ]; then + source "$ENV_CLUSTER_PATH" +else + echo "[Error] Environment file '$ENV_CLUSTER_PATH' does not exist!" >&2 + exit 1 +fi + +#== +# Functions +#== +# Function to display warnings in red +display_warning() { + echo -e "\033[31mWARNING: $1\033[0m" +} + +# Helper function to compare version numbers +version_gte() { + # Returns 0 if the first version is greater than or equal to the second, otherwise 1 + [ "$(printf '%s\n' "$1" "$2" | sort -V | head -n 1)" == "$2" ] +} + +# Function to check docker versions +check_docker_version() { + # check if docker is installed + if ! command -v docker &> /dev/null; then + echo "[Error] Docker is not installed! Please check the 'Docker Guide' for instruction." >&2; + exit 1 + fi + # Retrieve Docker version + docker_version=$(docker --version | awk '{ print $3 }') + apptainer_version=$(apptainer --version | awk '{ print $3 }') + + # Check if Docker version is exactly 24.0.7 or Apptainer version is exactly 1.2.5 + if [ "$docker_version" = "24.0.7" ] && [ "$apptainer_version" = "1.2.5" ]; then + echo "[INFO]: Docker version ${docker_version} and Apptainer version ${apptainer_version} are tested and compatible." + + # Check if Docker version is >= 27.0.0 and Apptainer version is >= 1.3.4 + elif version_gte "$docker_version" "27.0.0" && version_gte "$apptainer_version" "1.3.4"; then + echo "[INFO]: Docker version ${docker_version} and Apptainer version ${apptainer_version} are tested and compatible." + + # Else, display a warning for non-tested versions + else + display_warning "Docker version ${docker_version} and Apptainer version ${apptainer_version} are non-tested versions. There could be issues, please try to update them. More info: https://isaac-sim.github.io/IsaacLab/source/deployment/cluster.html" + fi +} + +# Checks if a docker image exists, otherwise prints warning and exists +check_image_exists() { + image_name="$1" + if ! docker image inspect $image_name &> /dev/null; then + echo "[Error] The '$image_name' image does not exist!" >&2; + echo "[Error] You might be able to build it with /IsaacLab/docker/container.py." >&2; + exit 1 + fi +} + +# Check if the singularity image exists on the remote host, otherwise print warning and exit +check_singularity_image_exists() { + image_name="$1" + if ! ssh "$CLUSTER_LOGIN" "[ -f $CLUSTER_SIF_PATH/$image_name.tar ]"; then + echo "[Error] The '$image_name' image does not exist on the remote host $CLUSTER_LOGIN!" >&2; + exit 1 + fi +} + +# Function to handle mount configuration sync +sync_mount_config() { + local mount_config_path="$CODEBASE_PATH/docker/.mount.config" + + if [ -f "$mount_config_path" ]; then + echo "[INFO] Found mount configuration file" + + # Check if we need to sync any external codebases + python3 - < file + profile_name=$(basename "$env_file" | sed 's/^\.env\.//') + # Skip .env.cluster file which is not a profile + if [ "$profile_name" != "cluster" ]; then + echo " - $profile_name" + fi + fi + done +} + +#== +# Main +#== + +#!/bin/bash + +help() { + echo -e "\nusage: $(basename "$0") [-h] [-c] [] [...] -- Utility for interfacing between IsaacLab extension and compute clusters." + echo -e "\noptions:" + echo -e " -h Display this help message." + echo -e " -c Check for large files in the synced directory on the cluster (job command only)." + echo -e "\ncommands:" + echo -e " push [] Push the docker image to the cluster." + echo -e " job [] [] Submit a job to the cluster." + echo -e " list-profiles List all available profiles." + echo -e "\nwhere:" + echo -e " is the optional container profile specification. Defaults to 'base'." + echo -e " are optional arguments specific to the job command." + echo -e "\nExternal mount configuration:" + echo -e " Configure mounts locally with: ./container.sh mount-setup" + echo -e " Mount config is automatically synced to cluster" + echo -e "\n" >&2 +} + +# Parse options +while getopts ":hc" opt; do + case ${opt} in + h ) + help + exit 0 + ;; + c ) + check_large_files_flag=true + ;; + \? ) + echo "Invalid option: -$OPTARG" >&2 + help + exit 1 + ;; + esac +done +shift $((OPTIND -1)) + +# Check for command +if [ $# -lt 1 ]; then + echo "Error: Command is required." >&2 + help + exit 1 +fi + +command=$1 +shift +profile="base" + +case $command in + push) + if [ $# -gt 1 ]; then + echo "Error: Too many arguments for push command." >&2 + help + exit 1 + fi + [ $# -eq 1 ] && profile=$1 + echo "Executing push command" + [ -n "$profile" ] && echo "Using profile: $profile" + if ! command -v apptainer &> /dev/null; then + echo "[INFO] Exiting because apptainer was not installed" + echo "[INFO] You may follow the installation procedure from here: https://apptainer.org/docs/admin/main/installation.html#install-ubuntu-packages" + exit + fi + # Check if Docker image exists + check_image_exists isaac-lab-$profile:latest + # Check docker and apptainer version + check_docker_version + # source env file to get cluster login and path information + source $ENV_CLUSTER_PATH + # make sure exports directory exists + mkdir -p /$SCRIPT_DIR/exports + # clear old exports for selected profile + rm -rf /$SCRIPT_DIR/exports/isaac-lab-$profile* + # create singularity image + # NOTE: we create the singularity image as non-root user to allow for more flexibility. If this causes + # issues, remove the --fakeroot flag and open an issue on the IsaacLab repository. + cd /$SCRIPT_DIR/exports + APPTAINER_NOHTTPS=1 apptainer build --sandbox --fakeroot isaac-lab-$profile.sif docker-daemon://isaac-lab-$profile:latest + # tar image (faster to send single file as opposed to directory with many files) + tar -cvf /$SCRIPT_DIR/exports/isaac-lab-$profile.tar isaac-lab-$profile.sif + # make sure target directory exists + ssh $CLUSTER_LOGIN "mkdir -p $CLUSTER_SIF_PATH" + # send image to cluster + scp $SCRIPT_DIR/exports/isaac-lab-$profile.tar $CLUSTER_LOGIN:$CLUSTER_SIF_PATH/isaac-lab-$profile.tar + ;; + list-profiles) + list_profiles + ;; + job) + if [ $# -ge 1 ] && [ -f "$CODEBASE_PATH/docker/.env.$1" ]; then + profile=$1 + shift + fi + job_args="$@" + echo "[INFO] Executing job command" + [ -n "$profile" ] && echo -e "\tUsing profile: $profile" + [ -n "$job_args" ] && echo -e "\tJob arguments: $job_args" + source $ENV_CLUSTER_PATH + # Get current date and time + current_datetime=$(date +"%Y%m%d_%H%M%S") + # Append current date and time to CLUSTER_ISAACLAB_DIR + CLUSTER_ISAACLAB_DIR="${CLUSTER_ISAACLAB_DIR}_${current_datetime}" + # Check if singularity image exists on the remote host + check_singularity_image_exists isaac-lab-$profile + # make sure target directory exists + ssh $CLUSTER_LOGIN "mkdir -p $CLUSTER_ISAACLAB_DIR" + + # Sync mount configuration if present + sync_mount_config + + # Sync extension code + echo "[INFO] Syncing extension codebase: $CODEBASE_PATH" + echo "[INFO] Preparing to sync files to cluster..." + + # Show estimated transfer size before starting + echo -n "[INFO] Calculating transfer size... " + transfer_size=$(rsync -avhnL --exclude="*.git*" --filter=':- .dockerignore' "$CODEBASE_PATH" "$CLUSTER_LOGIN:$CLUSTER_ISAACLAB_DIR" | tail -n 1 | awk '{print $4}') + echo "done" + echo "[INFO] Estimated transfer size: $transfer_size" + + # Sync with progress bar + echo "[INFO] Starting sync..." + rsync -avhL --progress --exclude="*.git*" --filter=':- .dockerignore' "$CODEBASE_PATH" "$CLUSTER_LOGIN:$CLUSTER_ISAACLAB_DIR" | \ + while IFS= read -r line; do + if [[ "$line" =~ ^[[:space:]]*[0-9,]+[[:space:]]+[0-9]+%[[:space:]]+[0-9.]+[A-Za-z]+/s[[:space:]]+[0-9:]+[[:space:]]*$ ]]; then + # This is a progress line, show it with nice formatting + echo -ne "\r[SYNC] $line" + elif [[ "$line" =~ sent.*received.*bytes ]]; then + # Final summary line + echo -e "\n[INFO] Transfer complete: $line" + fi + done + echo "" + echo "[INFO] βœ“ Codebase sync completed successfully" + # Report large files + if [ "$check_large_files_flag" = true ]; then + echo "[INFO] Checking for large files in synced directory on cluster..." + ssh $CLUSTER_LOGIN "echo 'Files larger than 50MB:'; find '$CLUSTER_ISAACLAB_DIR' -type f -size +10M -print0 | xargs -0 du -h | sort -rh" + echo "[INFO] If any of the above files are not needed, consider adding them to your .dockerignore file to speed up future syncs." + fi + # execute job script + echo "[INFO] Executing job script..." + # check whether the second argument is a profile or a job argument + submit_job $job_args + ;; + *) + echo "Error: Invalid command: $command" >&2 + help + exit 1 + ;; +esac \ No newline at end of file diff --git a/docker/cluster/run_singularity.sh b/docker/cluster/run_singularity.sh new file mode 100755 index 00000000..f638b82b --- /dev/null +++ b/docker/cluster/run_singularity.sh @@ -0,0 +1,338 @@ +#!/usr/bin/env bash + +echo "(run_singularity.sh): Called on compute node from current directory $1 with container profile $2 and arguments ${@:5}" +echo "[DEBUG] Working directory: $(pwd)" +echo "[DEBUG] All arguments: $*" +echo "[DEBUG] Directory path (arg1): $1" +echo "[DEBUG] Container profile (arg2): $2" +echo "[DEBUG] Cluster env (arg3): $3" +echo "[DEBUG] Base env (arg4): $4" +echo "[DEBUG] Script arguments (arg5+): ${@:5}" +echo "[DEBUG] Current user: $(whoami)" +echo "[DEBUG] TMPDIR: $TMPDIR" + +# Parse mount arguments +MOUNT_ISAACLAB_PATH="" +MOUNT_RSL_RL_PATH="" +SCRIPT_ARGS=() +ALL_ARGS=("${@:5}") # Get all arguments after the env files + +# Find the "--" delimiter to separate mount args from script args +delimiter_found=false +i=0 +while [ $i -lt ${#ALL_ARGS[@]} ]; do + if [ "${ALL_ARGS[$i]}" = "--" ]; then + delimiter_found=true + break + fi + i=$((i+1)) +done + +if [ "$delimiter_found" = true ]; then + # Extract mount args (before delimiter) and script args (after delimiter) + MOUNT_ARGS_TEMP=("${ALL_ARGS[@]:0:$i}") + SCRIPT_ARGS=("${ALL_ARGS[@]:$((i+1))}") + echo "[DEBUG] Found delimiter at position $i" + echo "[DEBUG] Mount args: ${MOUNT_ARGS_TEMP[*]}" + echo "[DEBUG] Script args: ${SCRIPT_ARGS[*]}" +else + # No delimiter, all args are script args (backward compatibility) + echo "[DEBUG] No delimiter found, treating all as script args" + SCRIPT_ARGS=("${ALL_ARGS[@]}") + MOUNT_ARGS_TEMP=() +fi + +# Parse mount arguments +i=0 +while [ $i -lt ${#MOUNT_ARGS_TEMP[@]} ]; do + arg_val="${MOUNT_ARGS_TEMP[$i]}" + echo "[DEBUG] Processing mount arg $i: $arg_val" + + case "$arg_val" in + --mount_isaaclab) + echo "[DEBUG] Found --mount_isaaclab at position $i" + i=$((i+1)) # Move to the path + if [ $i -lt ${#MOUNT_ARGS_TEMP[@]} ]; then + MOUNT_ISAACLAB_PATH="${MOUNT_ARGS_TEMP[$i]}" + echo "[DEBUG] run_singularity.sh: Captured MOUNT_ISAACLAB_PATH as: $MOUNT_ISAACLAB_PATH" + else + echo "[ERROR] run_singularity.sh: --mount_isaaclab requires a value." >&2; exit 1; + fi + ;; + --mount_rsl_rl) + echo "[DEBUG] Found --mount_rsl_rl at position $i" + i=$((i+1)) # Move to the path + if [ $i -lt ${#MOUNT_ARGS_TEMP[@]} ]; then + MOUNT_RSL_RL_PATH="${MOUNT_ARGS_TEMP[$i]}" + echo "[DEBUG] run_singularity.sh: Captured MOUNT_RSL_RL_PATH as: $MOUNT_RSL_RL_PATH" + else + echo "[ERROR] run_singularity.sh: --mount_rsl_rl requires a value." >&2; exit 1; + fi + ;; + *) + echo "[WARNING] Unknown mount argument: $arg_val" + ;; + esac + i=$((i+1)) +done + +echo "[DEBUG] run_singularity.sh: Final MOUNT_ISAACLAB_PATH: $MOUNT_ISAACLAB_PATH" +echo "[DEBUG] run_singularity.sh: Final MOUNT_RSL_RL_PATH: $MOUNT_RSL_RL_PATH" +echo "[DEBUG] run_singularity.sh: Final SCRIPT_ARGS: ${SCRIPT_ARGS[*]}" + +#== +# Helper functions +#== + +setup_directories() { + # Check and create directories + for dir in \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/cache/kit" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/cache/ov" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/cache/pip" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/cache/glcache" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/cache/computecache" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/logs" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/data" \ + "${CLUSTER_ISAAC_SIM_CACHE_DIR}/documents"; do + if [ ! -d "$dir" ]; then + mkdir -p "$dir" + echo "Created directory: $dir" + fi + done +} + +#== +# Main +#== + +# get script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# load variables to set the Isaac Lab path on the cluster +ENV_CLUSTER_PATH=${3:-"$SCRIPT_DIR/.env.cluster"} +ENV_BASE_PATH=${4:-"$SCRIPT_DIR/../.env.ext_template"} + +# load variables to set the Isaac Lab path on the cluster +echo "[DEBUG] Loading environment from: $ENV_CLUSTER_PATH" +echo "[DEBUG] Loading base environment from: $ENV_BASE_PATH" + +if [ ! -f "$ENV_CLUSTER_PATH" ]; then + echo "[ERROR] Cluster environment file not found: $ENV_CLUSTER_PATH" + exit 1 +fi + +if [ ! -f "$ENV_BASE_PATH" ]; then + echo "[ERROR] Base environment file not found: $ENV_BASE_PATH" + exit 1 +fi + +source "$ENV_CLUSTER_PATH" +source "$ENV_BASE_PATH" + +echo "[DEBUG] CLUSTER_SIF_PATH: $CLUSTER_SIF_PATH" +echo "[DEBUG] Container profile: $2" + +# make sure that all directories exists in cache directory +setup_directories +# copy all cache files +cp -r $CLUSTER_ISAAC_SIM_CACHE_DIR $TMPDIR + +# make sure logs directory exists (in the permanent isaaclab directory) +mkdir -p "$CLUSTER_ISAACLAB_DIR/logs" +touch "$CLUSTER_ISAACLAB_DIR/logs/.keep" + +# copy the temporary isaaclab directory with the latest changes to the compute node +cp -r $1 $TMPDIR +# Get the directory name +dir_name=$(basename "$1") + +# if defined, remove the temporary isaaclab directory pushed when the job was submitted +echo "[DEBUG] REMOVE_CODE_COPY_AFTER_JOB: $REMOVE_CODE_COPY_AFTER_JOB" +if [ "$REMOVE_CODE_COPY_AFTER_JOB" = "true" ]; then + echo "[DEBUG] Attempting to remove temporary directory early: $1" + # cd to a neutral directory before removing. Using a subshell with cd / + (cd / && rm -rf "$1") || echo "[WARNING] Failed to remove $1 early." +else + echo "[DEBUG] Keeping temporary directory as per REMOVE_CODE_COPY_AFTER_JOB=$REMOVE_CODE_COPY_AFTER_JOB: $1" +fi + +# copy container to the compute node +echo "[DEBUG] Extracting container: $CLUSTER_SIF_PATH/$2.tar" +if [ ! -f "$CLUSTER_SIF_PATH/$2.tar" ]; then + echo "[ERROR] Container file not found: $CLUSTER_SIF_PATH/$2.tar" + exit 1 +fi + +tar -xf $CLUSTER_SIF_PATH/$2.tar -C $TMPDIR +echo "[DEBUG] Container extracted successfully" + +# Determine binding strategy based on environment variables +SINGULARITY_BINDS="" + +# Always bind cache directories +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/cache/kit:${DOCKER_ISAACSIM_ROOT_PATH}/kit/cache:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/cache/ov:${DOCKER_USER_HOME}/.cache/ov:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/cache/pip:${DOCKER_USER_HOME}/.cache/pip:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/cache/glcache:${DOCKER_USER_HOME}/.cache/nvidia/GLCache:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/cache/computecache:${DOCKER_USER_HOME}/.nv/ComputeCache:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/logs:${DOCKER_USER_HOME}/.nvidia-omniverse/logs:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/data:${DOCKER_USER_HOME}/.local/share/ov/data:rw" +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $TMPDIR/docker-isaac-sim/documents:${DOCKER_USER_HOME}/Documents:rw" + +# Bind logs directory +SINGULARITY_BINDS="$SINGULARITY_BINDS -B $CLUSTER_ISAACLAB_DIR/logs:$DOCKER_ISAACLAB_PATH/logs:rw" + +# NEW: Check for unified mount config file +MOUNT_CONFIG_FILE="$TMPDIR/$dir_name/docker/.mount.config" +if [ -f "$MOUNT_CONFIG_FILE" ]; then + echo "[INFO] Found unified mount configuration file" + + # Use Python to parse mount config and generate binds + ADDITIONAL_BINDS=$(python3 - </dev/null || true + source /home/bash.bashrc 2>/dev/null || true + cd $DOCKER_EXT_PATH + export ISAACLAB_PATH=/workspace/isaaclab + $WANDB_ENV_VARS + + # Debug information + echo '[CONTAINER] Current directory:' \$(pwd) + echo '[CONTAINER] ISAACLAB_PATH:' \$ISAACLAB_PATH + echo '[CONTAINER] Checking if _isaac_sim exists:' \$(ls -la \$ISAACLAB_PATH/_isaac_sim 2>/dev/null | head -5 || echo '_isaac_sim directory not found') + echo '[CONTAINER] Checking python.sh:' \$(ls -la \$ISAACLAB_PATH/_isaac_sim/python.sh 2>/dev/null || echo 'python.sh not found') + echo '[CONTAINER] Python aliases:' \$(alias | grep python || echo 'No python aliases found') + echo '[CONTAINER] PATH:' \$PATH + echo '[CONTAINER] Running python from:' \$(which python 2>/dev/null || echo 'python not found in PATH') + + # Set up Python function instead of alias (aliases don't work in non-interactive bash) + if ! command -v python >/dev/null 2>&1; then + echo '[CONTAINER] Setting up Python function manually' + function python() { \$ISAACLAB_PATH/_isaac_sim/python.sh \"\$@\"; } + function python3() { \$ISAACLAB_PATH/_isaac_sim/python.sh \"\$@\"; } + export -f python python3 + fi + + echo '[CONTAINER] Python version:' \$(python --version 2>/dev/null || echo 'python command failed') + echo '[CONTAINER] Direct python.sh test:' \$(\$ISAACLAB_PATH/_isaac_sim/python.sh --version 2>/dev/null || echo 'direct python.sh failed') + echo '[CONTAINER] PYTHONPATH:' \$PYTHONPATH + echo '[CONTAINER] Listing /workspace:' \$(ls /workspace 2>/dev/null || echo '/workspace not found or empty') + echo '[CONTAINER] Listing /workspace/isaaclab:' \$(ls /workspace/isaaclab 2>/dev/null || echo '/workspace/isaaclab not found or empty') + echo '[CONTAINER] Listing $DOCKER_EXT_PATH:' \$(ls $DOCKER_EXT_PATH 2>/dev/null || echo '$DOCKER_EXT_PATH not found or empty') + echo '[CONTAINER] Python sys.path:' + python -c 'import sys; print(sys.path)' 2>/dev/null || echo 'Failed to get Python sys.path' + echo '[CONTAINER] Direct python.sh sys.path:' + \$ISAACLAB_PATH/_isaac_sim/python.sh -c 'import sys; print(sys.path)' 2>/dev/null || echo 'Failed to get direct python.sh sys.path' + echo '[CONTAINER] Attempting to import isaaclab.app:' + python -c 'from isaaclab.app import AppLauncher; print(\"AppLauncher imported successfully\")' 2>/dev/null || echo 'Failed to import isaaclab.app' + echo '[CONTAINER] Direct python.sh isaaclab.app import:' + \$ISAACLAB_PATH/_isaac_sim/python.sh -c 'from isaaclab.app import AppLauncher; print(\"AppLauncher imported successfully\")' 2>/dev/null || echo 'Failed to import isaaclab.app with direct python.sh' + echo '[CONTAINER] Checking RSL-RL installation:' + echo '[CONTAINER] RSL-RL site-packages location:' \$(ls -la \$ISAACLAB_PATH/_isaac_sim/kit/python/lib/python3.10/site-packages/ | grep rsl_rl || echo 'No rsl_rl found in site-packages') + echo '[CONTAINER] RSL-RL directory contents:' \$(ls -la \$ISAACLAB_PATH/_isaac_sim/kit/python/lib/python3.10/site-packages/rsl_rl/ 2>/dev/null | head -10 || echo 'RSL-RL directory not accessible') + echo '[CONTAINER] Testing RSL-RL import:' + python -c 'import rsl_rl; print(\"RSL-RL version:\", rsl_rl.__version__ if hasattr(rsl_rl, \"__version__\") else \"no version info\")' 2>/dev/null || echo 'Failed to import rsl_rl' + echo '[CONTAINER] Testing RSL-RL runners import:' + python -c 'from rsl_rl.runners import OnPolicyRunner; print(\"OnPolicyRunner imported successfully\")' 2>/dev/null || echo 'Failed to import rsl_rl.runners' + echo '[CONTAINER] Running main script: $CLUSTER_PYTHON_EXECUTABLE' + \$ISAACLAB_PATH/_isaac_sim/python.sh $CLUSTER_PYTHON_EXECUTABLE ${SCRIPT_ARGS[*]} + " + +# copy resulting cache files back to host +echo "[DEBUG] Copying cache files back to host" +rsync -azPv $TMPDIR/docker-isaac-sim $CLUSTER_ISAAC_SIM_CACHE_DIR/.. + +# if defined, remove the temporary isaaclab directory pushed when the job was submitted +echo "(run_singularity.sh): Return" \ No newline at end of file diff --git a/docker/cluster/submit_job_pbs.sh b/docker/cluster/submit_job_pbs.sh new file mode 100755 index 00000000..bd9ba374 --- /dev/null +++ b/docker/cluster/submit_job_pbs.sh @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# in the case you need to load specific modules on the cluster, add them here +# e.g., `module load eth_proxy` + +# create job script with compute demands +### MODIFY HERE FOR YOUR JOB ### +cat < job.sh +#!/bin/bash + +#PBS -l select=1:ncpus=8:mpiprocs=1:ngpus=1 +#PBS -l walltime=01:00:00 +#PBS -j oe +#PBS -q gpu +#PBS -N isaaclab-ext +#PBS -m bea -M "user@mail" + +# Variables passed from submit script +dir="$1" +profile="$2" +# Skip empty mount args and "--" delimiter +shift 4 +script_args="\$@" + +# Mount configuration is now handled by .mount.config file +bash "\$dir/docker/cluster/run_singularity.sh" "\$dir" "\$profile" "\$dir/docker/cluster/.env.cluster" "\$dir/docker/.env.ext_template" -- \$script_args +EOT + +qsub job.sh +rm job.sh \ No newline at end of file diff --git a/docker/cluster/submit_job_slurm.sh b/docker/cluster/submit_job_slurm.sh new file mode 100755 index 00000000..3248c792 --- /dev/null +++ b/docker/cluster/submit_job_slurm.sh @@ -0,0 +1,32 @@ +#!/usr/bin/env bash + +# in the case you need to load specific modules on the cluster, add them here +module load eth_proxy + +# create job script with compute demands +### MODIFY HERE FOR YOUR JOB ### +cat < job.sh +#!/bin/bash + +#SBATCH -n 1 +#SBATCH --cpus-per-task=4 +#SBATCH --gpus=rtx_3090:1 +#SBATCH --time=03:00:00 +#SBATCH --mem-per-cpu=4048 +#SBATCH --mail-type=END +#SBATCH --mail-user=name@mail +#SBATCH --job-name="isaaclab-ext-$(date +"%Y-%m-%dT%H:%M")" + +# Variables passed from submit script +dir="$1" +profile="$2" +# Skip empty mount args and "--" delimiter +shift 4 +script_args="\$@" + +# Mount configuration is now handled by .mount.config file +bash "\$dir/docker/cluster/run_singularity.sh" "\$dir" "\$profile" "\$dir/docker/cluster/.env.cluster" "\$dir/docker/.env.ext_template" -- \$script_args +EOT + +sbatch < job.sh +rm job.sh \ No newline at end of file diff --git a/docker/cluster/sync_mounts.py b/docker/cluster/sync_mounts.py new file mode 100755 index 00000000..62498c21 --- /dev/null +++ b/docker/cluster/sync_mounts.py @@ -0,0 +1,141 @@ +#!/usr/bin/env python3 +""" +Sync external codebases to cluster based on mount configuration. +""" + +import json +import os +import subprocess +import sys +from pathlib import Path + + +def sync_codebase(mount_name: str, local_path: str, cluster_path: str, cluster_login: str): + """Sync a codebase to the cluster.""" + local_path = Path(os.path.expanduser(local_path)) + + if not local_path.exists(): + print(f"[ERROR] Local path not found: {local_path}") + return False + + print(f"[INFO] Syncing {mount_name} from {local_path}") + + # Common excludes + excludes = [ + "--exclude=*.git*", + "--exclude=_build/", + "--exclude=logs/", + "--exclude=*.pyc", + "--exclude=__pycache__/", + "--exclude=*.egg-info/", + "--exclude=wandb/", + "--exclude=*.ckpt", + "--exclude=*.pth", + "--exclude=*.pt" + ] + + # Create remote directory + ssh_cmd = ["ssh", cluster_login, f"mkdir -p {cluster_path}"] + subprocess.run(ssh_cmd, check=True) + + # Calculate transfer size + print(f"[INFO] Calculating transfer size for {mount_name}...") + size_cmd = ["rsync", "-avhnL"] + excludes + [f"{local_path}/", f"{cluster_login}:{cluster_path}"] + result = subprocess.run(size_cmd, capture_output=True, text=True) + if result.returncode == 0: + lines = result.stdout.strip().split('\n') + if lines: + size_line = lines[-1] + print(f"[INFO] Estimated size: {size_line}") + + # Sync with progress + print(f"[INFO] Starting {mount_name} sync...") + sync_cmd = ["rsync", "-avhL", "--progress"] + excludes + [f"{local_path}/", f"{cluster_login}:{cluster_path}"] + + process = subprocess.Popen(sync_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, bufsize=1) + + for line in iter(process.stdout.readline, ''): + line = line.rstrip() + if line: + # Show progress lines with special formatting + if any(x in line for x in ['%', '/s', 'xfr#']): + print(f"\r[{mount_name.upper()} SYNC] {line}", end='', flush=True) + elif "sent" in line and "received" in line: + print(f"\n[INFO] {mount_name} transfer complete: {line}") + elif not line.startswith(' '): + # Show file names being transferred + print(f"\n[{mount_name.upper()}] {line}", end='', flush=True) + + process.wait() + + if process.returncode == 0: + print(f"\n[INFO] βœ“ {mount_name} sync completed successfully") + return True + else: + print(f"\n[ERROR] {mount_name} sync failed with code {process.returncode}") + return False + + +def main(): + if len(sys.argv) != 3: + print("Usage: sync_mounts.py ") + sys.exit(1) + + config_path = sys.argv[1] + cluster_login = sys.argv[2] + + # Load mount configuration + with open(config_path, 'r') as f: + config = json.load(f) + + # Get cluster username for path expansion + cluster_user = cluster_login.split('@')[0] + + # Sync enabled mounts + for mount_name, mount_config in config.get('mounts', {}).items(): + if not mount_config.get('enabled', False): + continue + + if not mount_config.get('sync_to_cluster', True): + # Mount-only mode + cluster_path = mount_config.get('cluster_path', '') + if cluster_path: + cluster_path = cluster_path.replace('$CLUSTER_USER', cluster_user) + print(f"[INFO] {mount_name} configured for mount-only from: {cluster_path}") + + # Verify it exists on cluster + check_cmd = ["ssh", cluster_login, f"[ -d {cluster_path} ] && echo 'EXISTS' || echo 'NOT_FOUND'"] + result = subprocess.run(check_cmd, capture_output=True, text=True) + if result.returncode == 0 and "EXISTS" in result.stdout: + print(f"[INFO] βœ“ {mount_name} directory verified on cluster") + else: + print(f"[WARNING] {mount_name} directory not found on cluster: {cluster_path}") + continue + + # Sync mode + local_path = mount_config.get('local_path', '') + if not local_path: + print(f"[WARNING] No local path set for {mount_name}, skipping") + continue + + # Determine cluster path + if mount_name == 'isaaclab': + cluster_path = f"/cluster/home/{cluster_user}/isaaclab" + elif mount_name == 'rsl_rl': + cluster_path = f"/cluster/home/{cluster_user}/rsl_rl" + else: + cluster_path = f"/cluster/home/{cluster_user}/{mount_name}" + + # Sync the codebase + sync_codebase(mount_name, local_path, cluster_path, cluster_login) + + # Update config with actual cluster path used + mount_config['cluster_path'] = cluster_path + + # Save updated config with cluster paths + with open(config_path, 'w') as f: + json.dump(config, f, indent=2) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docker/container.py b/docker/container.py new file mode 100755 index 00000000..add5c07d --- /dev/null +++ b/docker/container.py @@ -0,0 +1,127 @@ +#!/usr/bin/env python3 + +# Copyright (c) 2022-2024, The Isaac Lab Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +import argparse +import shutil +from pathlib import Path + +from utils import ContainerInterface, x11_utils + + +def parse_cli_args() -> argparse.Namespace: + """Parse command line arguments. + + This function creates a parser object and adds subparsers for each command. The function then parses the + command line arguments and returns the parsed arguments. + + Returns: + The parsed command line arguments. + """ + parser = argparse.ArgumentParser(description="Utility for using Docker with Isaac Lab.") + + # We have to create separate parent parsers for common options to our subparsers + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument( + "profile", nargs="?", default="base", help="Optional container profile specification. Example: 'base' or 'ros'." + ) + parent_parser.add_argument( + "--files", + nargs="*", + default=None, + help=( + "Allows additional '.yaml' files to be passed to the docker compose command. These files will be merged" + " with 'docker-compose.yaml' in their provided order." + ), + ) + parent_parser.add_argument( + "--env-files", + nargs="*", + default=None, + help=( + "Allows additional '.env' files to be passed to the docker compose command. These files will be merged with" + " '.env.base' in their provided order." + ), + ) + + # Actual command definition begins here + subparsers = parser.add_subparsers(dest="command", required=True) + subparsers.add_parser( + "start", + help="Build the docker image and create the container in detached mode.", + parents=[parent_parser], + ) + subparsers.add_parser( + "enter", help="Begin a new bash process within an existing Isaac Lab container.", parents=[parent_parser] + ) + config = subparsers.add_parser( + "config", + help=( + "Generate a docker-compose.yaml from the passed yamls, .envs, and either print to the terminal or create a" + " yaml at output_yaml" + ), + parents=[parent_parser], + ) + config.add_argument( + "--output-yaml", nargs="?", default=None, help="Yaml file to write config output to. Defaults to None." + ) + subparsers.add_parser( + "copy", help="Copy build and logs artifacts from the container to the host machine.", parents=[parent_parser] + ) + subparsers.add_parser("stop", help="Stop the docker container and remove it.", parents=[parent_parser]) + + # parse the arguments to determine the command + args = parser.parse_args() + + return args + + +def main(args: argparse.Namespace): + """Main function for the Docker utility.""" + # check if docker is installed + if not shutil.which("docker"): + raise RuntimeError( + "Docker is not installed! Please check the 'Docker Guide' for instruction: " + "https://isaac-sim.github.io/IsaacLab/source/deployment/docker.html" + ) + + # creating container interface + ci = ContainerInterface( + context_dir=Path(__file__).resolve().parent, profile=args.profile, yamls=args.files, envs=args.env_files + ) + + print(f"[INFO] Using container profile: {ci.profile}") + if args.command == "start": + # check if x11 forwarding is enabled + x11_outputs = x11_utils.x11_check(ci.statefile) + # if x11 forwarding is enabled, add the x11 yaml and environment variables + if x11_outputs is not None: + (x11_yaml, x11_envar) = x11_outputs + ci.add_yamls += x11_yaml + ci.environ.update(x11_envar) + # start the container + ci.start() + elif args.command == "enter": + # refresh the x11 forwarding + x11_utils.x11_refresh(ci.statefile) + # enter the container + ci.enter() + elif args.command == "config": + ci.config(args.output_yaml) + elif args.command == "copy": + ci.copy() + elif args.command == "stop": + # stop the container + ci.stop() + # cleanup the x11 forwarding + x11_utils.x11_cleanup(ci.statefile) + else: + raise RuntimeError(f"Invalid command provided: {args.command}. Please check the help message.") + + +if __name__ == "__main__": + args_cli = parse_cli_args() + main(args_cli) \ No newline at end of file diff --git a/docker/container.sh b/docker/container.sh new file mode 100755 index 00000000..76e781fc --- /dev/null +++ b/docker/container.sh @@ -0,0 +1,274 @@ +#!/usr/bin/env bash +# +# Container management script with unified mount configuration support +# This script wraps docker-compose commands and manages optional mounts +# + +set -e + +# Get script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +MOUNT_CONFIG="$SCRIPT_DIR/.mount.config" +DOCKER_COMPOSE_OVERRIDE="$SCRIPT_DIR/docker-compose.override.yaml" + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +NC='\033[0m' # No Color + +# Function to display help +show_help() { + echo "Container Management Script for IsaacLab Extension" + echo "" + echo "Usage: $0 [options] [args...]" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " -p, --profile PROFILE Container profile (ext, ext-dev, ext-dev-rootless)" + echo " -e, --env ENV_FILE Environment file to use (default: auto-detect based on profile)" + echo " -r, --regenerate Regenerate docker-compose.override.yaml from mount config" + echo "" + echo "Mount Commands:" + echo " mount-setup Interactive setup of mount configuration" + echo " mount-show Show current mount configuration" + echo " mount-validate Validate mount configuration" + echo " mount-enable NAME Enable a mount (isaaclab or rsl_rl)" + echo " mount-disable NAME Disable a mount (isaaclab or rsl_rl)" + echo " mount-set NAME PATH Set local mount path" + echo " mount-set-cluster NAME PATH Set cluster path for mount-only mode" + echo " mount-set-sync NAME on|off Enable/disable sync to cluster" + echo "" + echo "Container Commands:" + echo " build Build the container" + echo " run [ARGS] Run the container (passes args to container)" + echo " exec [CMD] Execute command in running container" + echo " attach Attach to running container" + echo " stop Stop the container" + echo " logs Show container logs" + echo " ps Show running containers" + echo "" + echo "Examples:" + echo " $0 mount-setup # Setup mounts interactively" + echo " $0 -p ext-dev run # Run development container" + echo " $0 -p ext-dev run python scripts/run.py # Run dev container with script" + echo " $0 mount-enable isaaclab # Enable IsaacLab mount" + echo " $0 mount-set isaaclab ~/isaaclab # Set IsaacLab path" +} + +# Function to detect environment file based on profile +detect_env_file() { + local profile=$1 + case $profile in + ext) + echo "$SCRIPT_DIR/.env.ext_template" + ;; + ext-dev) + echo "$SCRIPT_DIR/.env.ext_template-dev" + ;; + ext-dev-rootless) + echo "$SCRIPT_DIR/.env.ext_template-dev" + ;; + *) + echo "$SCRIPT_DIR/.env.ext_template" + ;; + esac +} + +# Function to get service name from profile +get_service_name() { + local profile=$1 + echo "isaac-lab-${profile//_/-}" +} + +# Function to check if mount config exists +check_mount_config() { + if [ ! -f "$MOUNT_CONFIG" ]; then + echo -e "${YELLOW}Warning: Mount configuration not found at $MOUNT_CONFIG${NC}" + echo -e "${YELLOW}Run '$0 mount-setup' to configure optional mounts.${NC}" + # Create empty override file if it doesn't exist + if [ ! -f "$DOCKER_COMPOSE_OVERRIDE" ]; then + echo "version: '3.8'" > "$DOCKER_COMPOSE_OVERRIDE" + echo "services: {}" >> "$DOCKER_COMPOSE_OVERRIDE" + fi + return 1 + fi + return 0 +} + +# Function to regenerate docker-compose override +regenerate_override() { + if check_mount_config; then + echo "Regenerating docker-compose.override.yaml..." + python3 "$SCRIPT_DIR/mount_config.py" generate + fi +} + +# Parse command line arguments +PROFILE="" +ENV_FILE="" +REGENERATE=false + +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + -p|--profile) + PROFILE="$2" + shift 2 + ;; + -e|--env) + ENV_FILE="$2" + shift 2 + ;; + -r|--regenerate) + REGENERATE=true + shift + ;; + *) + break + ;; + esac +done + +# Get command +COMMAND=${1:-help} +shift || true + +# Set default profile if not specified +if [ -z "$PROFILE" ]; then + PROFILE="ext-dev" + echo "Using default profile: $PROFILE" +fi + +# Auto-detect env file if not specified +if [ -z "$ENV_FILE" ]; then + ENV_FILE=$(detect_env_file "$PROFILE") +fi + +# Check if env file exists +if [ ! -f "$ENV_FILE" ]; then + echo -e "${RED}Error: Environment file not found: $ENV_FILE${NC}" + echo "Please copy the template and configure it:" + echo " cp ${ENV_FILE}.template $ENV_FILE" + exit 1 +fi + +# Get service name +SERVICE_NAME=$(get_service_name "$PROFILE") + +# Regenerate override if requested +if [ "$REGENERATE" = true ]; then + regenerate_override +fi + +# Handle commands +case $COMMAND in + # Mount management commands + mount-setup) + python3 "$SCRIPT_DIR/mount_config.py" setup + ;; + mount-show) + python3 "$SCRIPT_DIR/mount_config.py" show "$@" + ;; + mount-validate) + python3 "$SCRIPT_DIR/mount_config.py" validate + ;; + mount-enable) + if [ -z "$1" ]; then + echo -e "${RED}Error: Mount name required (isaaclab or rsl_rl)${NC}" + exit 1 + fi + python3 "$SCRIPT_DIR/mount_config.py" enable "$1" + ;; + mount-disable) + if [ -z "$1" ]; then + echo -e "${RED}Error: Mount name required (isaaclab or rsl_rl)${NC}" + exit 1 + fi + python3 "$SCRIPT_DIR/mount_config.py" disable "$1" + ;; + mount-set) + if [ -z "$1" ] || [ -z "$2" ]; then + echo -e "${RED}Error: Usage: mount-set NAME PATH${NC}" + exit 1 + fi + python3 "$SCRIPT_DIR/mount_config.py" set "$1" "$2" + ;; + mount-set-cluster) + if [ -z "$1" ] || [ -z "$2" ]; then + echo -e "${RED}Error: Usage: mount-set-cluster NAME PATH${NC}" + exit 1 + fi + python3 "$SCRIPT_DIR/mount_config.py" set-cluster "$1" "$2" + ;; + mount-set-sync) + if [ -z "$1" ] || [ -z "$2" ]; then + echo -e "${RED}Error: Usage: mount-set-sync NAME on|off${NC}" + exit 1 + fi + python3 "$SCRIPT_DIR/mount_config.py" set-sync "$1" "$2" + ;; + + # Container commands + build) + echo "Building container: $SERVICE_NAME" + check_mount_config + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + build "$SERVICE_NAME" "$@" + ;; + run) + echo "Running container: $SERVICE_NAME" + check_mount_config + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + run --rm "$SERVICE_NAME" "$@" + ;; + exec) + echo "Executing in container: $SERVICE_NAME" + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + exec "$SERVICE_NAME" "$@" + ;; + attach) + echo "Attaching to container: $SERVICE_NAME" + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + attach "$SERVICE_NAME" + ;; + stop) + echo "Stopping container: $SERVICE_NAME" + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + stop "$SERVICE_NAME" + ;; + logs) + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + logs "$SERVICE_NAME" "$@" + ;; + ps) + docker compose --env-file "$ENV_FILE" \ + --file "$SCRIPT_DIR/docker-compose.yaml" \ + --file "$DOCKER_COMPOSE_OVERRIDE" \ + ps + ;; + help) + show_help + ;; + *) + echo -e "${RED}Error: Unknown command: $COMMAND${NC}" + echo "" + show_help + exit 1 + ;; +esac \ No newline at end of file diff --git a/docker/docker-compose.override.yaml.template b/docker/docker-compose.override.yaml.template new file mode 100644 index 00000000..e12c58a0 --- /dev/null +++ b/docker/docker-compose.override.yaml.template @@ -0,0 +1,6 @@ +# This is a template file for docker-compose.override.yaml +# It will be automatically generated by the mount configuration system +# DO NOT EDIT THIS FILE DIRECTLY - use './container.sh mount-setup' instead + +version: '3.8' +services: {} \ No newline at end of file diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 501495f4..e2abab60 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -1,7 +1,46 @@ -x-default-isaac-lab-template-environment: &default-isaac-lab-template-environment - - OMNI_KIT_ALLOW_ROOT=1 +x-default-isaac-lab-volumes: &default-isaac-lab-volumes + - type: volume + source: isaac-cache-kit + target: ${DOCKER_ISAACSIM_ROOT_PATH}/kit/cache + - type: volume + source: isaac-cache-ov + target: ${DOCKER_USER_HOME}/.cache/ov + - type: volume + source: isaac-cache-pip + target: ${DOCKER_USER_HOME}/.cache/pip + - type: volume + source: isaac-cache-gl + target: ${DOCKER_USER_HOME}/.cache/nvidia/GLCache + - type: volume + source: isaac-cache-compute + target: ${DOCKER_USER_HOME}/.nv/ComputeCache + - type: volume + source: isaac-logs + target: ${DOCKER_USER_HOME}/.nvidia-omniverse/logs + - type: volume + source: isaac-carb-logs + target: ${DOCKER_ISAACSIM_ROOT_PATH}/kit/logs/Kit/Isaac-Sim + - type: volume + source: isaac-data + target: ${DOCKER_USER_HOME}/.local/share/ov/data + - type: volume + source: isaac-docs + target: ${DOCKER_USER_HOME}/Documents + - type: volume + source: isaac-lab-docs + target: ${DOCKER_ISAACLAB_PATH}/docs/_build + - type: volume + source: isaac-lab-logs + target: ${DOCKER_ISAACLAB_PATH}/logs + - type: volume + source: isaac-lab-data + target: ${DOCKER_ISAACLAB_PATH}/data_storage -x-default-isaac-lab-template-deploy: &default-isaac-lab-template-deploy +x-default-isaac-lab-environment: &default-isaac-lab-environment + ISAACSIM_PATH: /isaac-sim + OMNI_KIT_ALLOW_ROOT: 1 + +x-default-isaac-lab-deploy: &default-isaac-lab-deploy resources: reservations: devices: @@ -10,24 +49,144 @@ x-default-isaac-lab-template-deploy: &default-isaac-lab-template-deploy capabilities: [ gpu ] services: - isaac-lab-template: - env_file: .env.base + # Production container for cluster deployment + isaac-lab-ext: + env_file: + - .env.ext_template build: context: ../ - dockerfile: docker/Dockerfile + dockerfile: docker/Dockerfile.ext args: - - ISAACLAB_BASE_IMAGE_ARG=${ISAACLAB_BASE_IMAGE} - - DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH_ARG=${DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH} - image: isaac-lab-template - container_name: isaac-lab-template + EXTENSION_NAME_ARG: ${EXTENSION_NAME} + EXT_PATH_ARG: ${EXT_PATH} + DOCKER_EXT_PATH_ARG: ${DOCKER_EXT_PATH} + DOCKER_USER_NAME_ARG: ${DOCKER_USER_NAME} + DOCKER_USER_HOME_ARG: ${DOCKER_USER_HOME} + image: isaac-lab-${EXTENSION_NAME} + container_name: isaac-lab-${EXTENSION_NAME} volumes: + - <<: *default-isaac-lab-volumes - type: bind - source: ../ - target: ${DOCKER_ISAACLAB_EXTENSION_TEMPLATE_PATH} + source: ${EXT_PATH} + target: /workspace/${EXTENSION_NAME} + environment: + <<: *default-isaac-lab-environment + WANDB_API_KEY: ${WANDB_API_KEY} + WANDB_USERNAME: ${WANDB_USERNAME} network_mode: host - environment: *default-isaac-lab-template-environment - deploy: *default-isaac-lab-template-deploy - # This is the entrypoint for the container + deploy: *default-isaac-lab-deploy entrypoint: bash stdin_open: true tty: true + + # Unified development container with ROS2 and dual-mode support + isaac-lab-ext-dev: + env_file: + - .env.ext_template-dev + build: + context: ../ + dockerfile: docker/Dockerfile.ext-dev + args: + EXTENSION_NAME_ARG: ${EXTENSION_NAME} + EXT_PATH_ARG: ${EXT_PATH} + DOCKER_EXT_PATH_ARG: ${DOCKER_EXT_PATH} + DOCKER_USER_NAME_ARG: ${DOCKER_USER_NAME} + DOCKER_USER_HOME_ARG: ${DOCKER_USER_HOME} + image: isaac-lab-${EXTENSION_NAME}-dev + container_name: isaac-lab-${EXTENSION_NAME}-dev + volumes: + - <<: *default-isaac-lab-volumes + - type: bind + source: ${HOST_HOME} + target: ${DOCKER_USER_HOME} + - type: bind + source: ${EXT_PATH} + target: /workspace/${EXTENSION_NAME} + - /tmp/.X11-unix:/tmp/.X11-unix:rw + - /tmp/.docker.xauth:/tmp/.docker.xauth:rw + - ${SSH_AUTH_SOCK}:/ssh-agent + - /lib/modules:/lib/modules + - /etc/localtime:/etc/localtime:ro + - /dev/input:/dev/input + - /etc/passwd:/etc/passwd:ro + - /etc/shadow:/etc/shadow:ro + - /etc/group:/etc/group:ro + environment: + <<: *default-isaac-lab-environment + DISPLAY: ${DISPLAY} + QT_X11_NO_MITSHM: 1 + XAUTHORITY: /tmp/.docker.xauth + SSH_AUTH_SOCK: /ssh-agent + NVIDIA_DRIVER_CAPABILITIES: all + HOST_USERNAME: ${DOCKER_USER_NAME} + HOST_SHELL: ${SHELL} + DOCKER_USER_HOME: ${DOCKER_USER_HOME} + WANDB_API_KEY: ${WANDB_API_KEY} + WANDB_USERNAME: ${WANDB_USERNAME} + # Control rootless mode + DOCKER_ROOTLESS_MODE: ${DOCKER_ROOTLESS_MODE:-false} + # Local user ID/GID for proper permission mapping + LOCAL_UID: ${LOCAL_UID:-1000} + LOCAL_GID: ${LOCAL_GID:-1000} + # Enable permission fixing on exit + FIX_PERMISSIONS: ${FIX_PERMISSIONS:-true} + devices: + - /dev/dri:/dev/dri + privileged: true + network_mode: host + ipc: host + deploy: *default-isaac-lab-deploy + shm_size: '2gb' + cap_add: + - ALL + stdin_open: true + tty: true + + # Rootless variant of the development container + # Uses the same image but with different environment and volume configuration + isaac-lab-ext-dev-rootless: + extends: + service: isaac-lab-ext-dev + container_name: isaac-lab-${EXTENSION_NAME}-dev-rootless + volumes: + - <<: *default-isaac-lab-volumes + - type: bind + source: ${EXTENSION_FOLDER} + target: /root/project + - type: bind + source: ${EXT_PATH} + target: /workspace/${EXTENSION_NAME} + - /tmp/.X11-unix:/tmp/.X11-unix:rw + - /tmp/.docker.xauth:/tmp/.docker.xauth:rw + - ${SSH_AUTH_SOCK}:/ssh-agent + - /lib/modules:/lib/modules + - /etc/localtime:/etc/localtime:ro + # Reduced host system access for rootless mode + environment: + <<: *default-isaac-lab-environment + DISPLAY: ${DISPLAY} + QT_X11_NO_MITSHM: 1 + XAUTHORITY: /tmp/.docker.xauth + SSH_AUTH_SOCK: /ssh-agent + NVIDIA_DRIVER_CAPABILITIES: all + WANDB_API_KEY: ${WANDB_API_KEY} + WANDB_USERNAME: ${WANDB_USERNAME} + # Force rootless mode + DOCKER_ROOTLESS_MODE: "true" + # Override home to /root for rootless + DOCKER_USER_HOME: /root + HOME: /root + +volumes: + isaac-cache-kit: + isaac-cache-ov: + isaac-cache-pip: + isaac-cache-gl: + isaac-cache-compute: + isaac-logs: + isaac-carb-logs: + isaac-data: + isaac-docs: + isaac-lab-docs: + isaac-lab-logs: + isaac-lab-data: \ No newline at end of file diff --git a/docker/dynamic_entrypoint.sh b/docker/dynamic_entrypoint.sh new file mode 100755 index 00000000..f77b93ef --- /dev/null +++ b/docker/dynamic_entrypoint.sh @@ -0,0 +1,117 @@ +#!/bin/bash + +# =================================== +# Unified Dynamic Entrypoint for Docker Containers +# =================================== +# This script supports both root and rootless modes +# Mode is determined by DOCKER_ROOTLESS_MODE environment variable + +# Print welcome message +figlet Isaac Lab Extension + +# Always add root sudo permissions +echo "root ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +# Source ROS2 if available +if [ -f /opt/ros/humble/setup.bash ]; then + echo "source /opt/ros/humble/setup.bash" >> /home/bash.bashrc + echo "source /opt/ros/humble/setup.bash" >> /root/.bashrc +fi + +# Determine which mode to run in +if [ "${DOCKER_ROOTLESS_MODE}" = "true" ]; then + echo "=====================================" + echo "Running in ROOTLESS mode" + echo "All users run as root inside container" + echo "=====================================" + + # In rootless mode, everyone runs as root + export HOME=/root + export USER=root + + # Ensure root has bashrc + if [ ! -f /root/.bashrc ]; then + cp /home/bash.bashrc /root/.bashrc + fi + + # Make sure critical directories are accessible + chmod -R 777 /tmp /var/tmp 2>/dev/null || true + + # Fix permissions for isaac-sim kit directory + chmod -R 777 /isaac-sim/kit 2>/dev/null || true + + # Execute as root + exec bash --rcfile /root/.bashrc + +else + echo "=====================================" + echo "Running in ROOT mode with user switching" + echo "=====================================" + + # Get UID and GID from environment or use defaults + USER_ID=${LOCAL_UID:-1000} + GROUP_ID=${LOCAL_GID:-1000} + USER_NAME=${DOCKER_USER_NAME:-user} + USER_HOME=${DOCKER_USER_HOME:-/home/$USER_NAME} + + echo "Creating/updating user:" + echo " UID: $USER_ID" + echo " GID: $GROUP_ID" + echo " Username: $USER_NAME" + echo " Home: $USER_HOME" + + # Create user and group if they don't exist + groupadd -g $GROUP_ID -o $USER_NAME 2>/dev/null || true + useradd -m -u $USER_ID -g $GROUP_ID -o -s /bin/bash -d $USER_HOME $USER_NAME 2>/dev/null || true + + # Add user sudo permissions + echo "$USER_NAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + + # Ensure user home exists with correct permissions + if [ ! -d "$USER_HOME" ]; then + mkdir -p "$USER_HOME" + fi + chown $USER_NAME:$USER_NAME "$USER_HOME" + + # Set up user's bashrc + if [ ! -f "$USER_HOME/.bashrc" ]; then + cp /home/bash.bashrc "$USER_HOME/.bashrc" + chown $USER_NAME:$USER_NAME "$USER_HOME/.bashrc" + fi + + echo "Setting up permissions in the background..." + + # Run permission fixes in the background to not delay startup + ( + # Critical directories + chmod -R 777 /tmp /var/tmp 2>/dev/null || true + + # Fix isaac-sim permissions + nohup chown -R $USER_NAME:$USER_NAME /isaac-sim/kit 2>/dev/null & + + # Notify when complete + echo "Permission setup completed at $(date)" > $USER_HOME/.permissions_done + chown $USER_NAME:$USER_NAME $USER_HOME/.permissions_done + ) & + + # Handle permission fixing for mounted volumes on exit + if [ -n "${FIX_PERMISSIONS}" ] && [ "${FIX_PERMISSIONS}" = "true" ]; then + # Create permission fix script + cat > /usr/local/bin/fix-permissions << EOF +#!/bin/bash +# Fix permissions for files created in the container +if [ -d "/workspace/${EXTENSION_NAME}" ]; then + echo "Fixing permissions for mounted volume..." + find "/workspace/${EXTENSION_NAME}" -user root -exec chown ${USER_ID}:${GROUP_ID} {} \; 2>/dev/null || true + find "/workspace/${EXTENSION_NAME}" -user ${USER_NAME} -exec chown ${USER_ID}:${GROUP_ID} {} \; 2>/dev/null || true +fi +EOF + chmod +x /usr/local/bin/fix-permissions + + # Set up trap to fix permissions on exit + trap /usr/local/bin/fix-permissions EXIT + fi + + # Execute as the user + exec gosu $USER_NAME bash --rcfile $USER_HOME/.bashrc +fi \ No newline at end of file diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100755 index 00000000..ee22f0d6 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +figlet Isaac Lab Extension + +echo "root ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers + +exec gosu ${DOCKER_USER_NAME} bash --rcfile ${DOCKER_USER_HOME}/../bash.bashrc \ No newline at end of file diff --git a/docker/mount_config.py b/docker/mount_config.py new file mode 100755 index 00000000..410cd6b8 --- /dev/null +++ b/docker/mount_config.py @@ -0,0 +1,422 @@ +#!/usr/bin/env python3 +""" +Unified mount configuration system for Docker and Singularity containers. + +This script manages optional mounting of external codebases (IsaacLab and RSL-RL) +for both Docker and Singularity environments, providing a consistent interface +and proper validation. +""" + +import argparse +import json +import os +import sys +import yaml +from pathlib import Path +from typing import Dict, List, Optional, Tuple + + +class MountConfig: + """Manages mount configurations for Docker and Singularity containers.""" + + def __init__(self, config_file: str = ".mount.config"): + self.config_file = Path(config_file) + self.config = self._load_config() + self.docker_compose_template = Path("docker-compose.override.yaml.template") + self.docker_compose_override = Path("docker-compose.override.yaml") + + def _load_config(self) -> Dict: + """Load mount configuration from file or create default.""" + if self.config_file.exists(): + with open(self.config_file, 'r') as f: + return json.load(f) + else: + # Default configuration + return { + "mounts": { + "isaaclab": { + "enabled": False, + "local_path": "", + "cluster_path": "", # Optional: different path on cluster + "container_path": "/workspace/isaaclab", + "mount_type": "source", # "source" or "full" + "sync_to_cluster": True, # Whether to sync from local to cluster + "description": "External IsaacLab installation" + }, + "rsl_rl": { + "enabled": False, + "local_path": "", + "cluster_path": "", # Optional: different path on cluster + "container_path": "/workspace/isaaclab/_isaac_sim/kit/python/lib/python3.10/site-packages/rsl_rl", + "mount_type": "full", + "sync_to_cluster": True, # Whether to sync from local to cluster + "description": "External RSL-RL installation" + } + } + } + + def save_config(self): + """Save current configuration to file.""" + with open(self.config_file, 'w') as f: + json.dump(self.config, f, indent=2) + + def validate_mount(self, mount_name: str) -> Tuple[bool, str]: + """Validate a mount configuration.""" + if mount_name not in self.config["mounts"]: + return False, f"Unknown mount: {mount_name}" + + mount = self.config["mounts"][mount_name] + + if not mount["enabled"]: + return True, "Mount disabled" + + local_path = Path(os.path.expanduser(mount["local_path"])) + + if not local_path.exists(): + return False, f"Local path does not exist: {local_path}" + + if not local_path.is_dir(): + return False, f"Local path is not a directory: {local_path}" + + # Specific validations + if mount_name == "isaaclab" and mount["mount_type"] == "source": + source_path = local_path / "source" + if not source_path.exists(): + return False, f"IsaacLab source directory not found: {source_path}" + + if mount_name == "rsl_rl": + # Check if it's a Python package + init_file = local_path / "__init__.py" + rsl_rl_subdir = local_path / "rsl_rl" / "__init__.py" + + if not init_file.exists() and not rsl_rl_subdir.exists(): + return False, f"RSL-RL does not appear to be a valid Python package: {local_path}" + + return True, "Valid" + + def get_docker_mounts(self, profile: str) -> List[Dict]: + """Generate Docker mount configurations for docker-compose.""" + mounts = [] + + for mount_name, mount_config in self.config["mounts"].items(): + if not mount_config["enabled"]: + continue + + valid, msg = self.validate_mount(mount_name) + if not valid: + print(f"Warning: Skipping {mount_name}: {msg}", file=sys.stderr) + continue + + local_path = Path(os.path.expanduser(mount_config["local_path"])) + + # Handle special cases + if mount_name == "isaaclab" and mount_config["mount_type"] == "source": + # Mount only source directory for IsaacLab + mount_spec = { + "type": "bind", + "source": str(local_path / "source"), + "target": f"{mount_config['container_path']}/source", + "read_only": False + } + elif mount_name == "rsl_rl": + # Check if we need to use subdirectory + if (local_path / "rsl_rl" / "__init__.py").exists(): + mount_spec = { + "type": "bind", + "source": str(local_path / "rsl_rl"), + "target": mount_config["container_path"], + "read_only": False + } + else: + mount_spec = { + "type": "bind", + "source": str(local_path), + "target": mount_config["container_path"], + "read_only": False + } + else: + # Default mount + mount_spec = { + "type": "bind", + "source": str(local_path), + "target": mount_config["container_path"], + "read_only": False + } + + mounts.append(mount_spec) + + return mounts + + def get_singularity_binds(self) -> str: + """Generate Singularity bind mount string.""" + binds = [] + + for mount_name, mount_config in self.config["mounts"].items(): + if not mount_config["enabled"]: + continue + + valid, msg = self.validate_mount(mount_name) + if not valid: + print(f"Warning: Skipping {mount_name}: {msg}", file=sys.stderr) + continue + + local_path = Path(os.path.expanduser(mount_config["local_path"])) + + # Handle special cases + if mount_name == "isaaclab" and mount_config["mount_type"] == "source": + # Mount only source directory for IsaacLab + bind = f"{local_path}/source:{mount_config['container_path']}/source:rw" + elif mount_name == "rsl_rl": + # Check if we need to use subdirectory + if (local_path / "rsl_rl" / "__init__.py").exists(): + bind = f"{local_path}/rsl_rl:{mount_config['container_path']}:rw" + else: + bind = f"{local_path}:{mount_config['container_path']}:rw" + else: + # Default mount + bind = f"{local_path}:{mount_config['container_path']}:rw" + + binds.append(bind) + + return " ".join([f"-B {bind}" for bind in binds]) + + def generate_docker_compose_override(self, profiles: List[str] = None): + """Generate docker-compose.override.yaml file.""" + if profiles is None: + profiles = ["ext", "ext-dev", "ext-dev-rootless"] + + # Load template if exists, otherwise create base structure + if self.docker_compose_template.exists(): + with open(self.docker_compose_template, 'r') as f: + override_config = yaml.safe_load(f) or {} + else: + override_config = {"services": {}} + + # Add mounts to each profile + for profile in profiles: + service_name = f"isaac-lab-{profile.replace('_', '-')}" + + if service_name not in override_config["services"]: + override_config["services"][service_name] = {} + + service = override_config["services"][service_name] + + # Get mounts for this profile + mounts = self.get_docker_mounts(profile) + + if mounts: + # Merge with existing volumes if any + if "volumes" not in service: + service["volumes"] = [] + + # Add our mounts + for mount in mounts: + # Check if mount already exists + existing = False + for i, vol in enumerate(service["volumes"]): + if isinstance(vol, dict) and vol.get("target") == mount["target"]: + service["volumes"][i] = mount + existing = True + break + + if not existing: + service["volumes"].append(mount) + + # Write override file + with open(self.docker_compose_override, 'w') as f: + yaml.dump(override_config, f, default_flow_style=False, sort_keys=False) + + print(f"Generated {self.docker_compose_override}") + + def interactive_setup(self): + """Interactive setup for mount configuration.""" + print("Mount Configuration Setup") + print("=" * 50) + + for mount_name, mount_config in self.config["mounts"].items(): + print(f"\n{mount_config['description']} ({mount_name})") + print(f"Container path: {mount_config['container_path']}") + + enable = input(f"Enable {mount_name} mount? [y/N]: ").lower() == 'y' + mount_config["enabled"] = enable + + if enable: + # Local path configuration + current_path = mount_config["local_path"] + default_prompt = f" [{current_path}]" if current_path else "" + local_path = input(f"Local path{default_prompt}: ").strip() + + if local_path: + mount_config["local_path"] = local_path + elif not current_path: + print("Error: Local path is required when mount is enabled") + mount_config["enabled"] = False + continue + + # Cluster configuration + print("\nCluster Configuration:") + sync_choice = input("Sync from local to cluster? [Y/n]: ").lower() + mount_config["sync_to_cluster"] = sync_choice != 'n' + + if not mount_config["sync_to_cluster"]: + # Mount-only mode - need cluster path + cluster_path = mount_config.get("cluster_path", "") + default_cluster = f" [{cluster_path}]" if cluster_path else "" + cluster_input = input(f"Cluster path (for mount-only){default_cluster}: ").strip() + + if cluster_input: + mount_config["cluster_path"] = cluster_input + elif not cluster_path: + print("Error: Cluster path is required for mount-only mode") + mount_config["enabled"] = False + continue + else: + # Clear cluster_path if syncing + mount_config["cluster_path"] = "" + + # Validate + valid, msg = self.validate_mount(mount_name) + if not valid: + print(f"Error: {msg}") + mount_config["enabled"] = False + else: + if mount_config["sync_to_cluster"]: + print(f"βœ“ {mount_name} will be synced from local to cluster") + else: + print(f"βœ“ {mount_name} will be mounted from cluster path (no sync)") + + self.save_config() + print(f"\nConfiguration saved to {self.config_file}") + + +def main(): + parser = argparse.ArgumentParser(description="Manage container mount configurations") + subparsers = parser.add_subparsers(dest="command", help="Command to run") + + # Setup command + setup_parser = subparsers.add_parser("setup", help="Interactive setup of mount configuration") + + # Generate command + gen_parser = subparsers.add_parser("generate", help="Generate docker-compose.override.yaml") + gen_parser.add_argument("--profiles", nargs="+", help="Profiles to generate for") + + # Validate command + val_parser = subparsers.add_parser("validate", help="Validate current configuration") + + # Show command + show_parser = subparsers.add_parser("show", help="Show current configuration") + show_parser.add_argument("--format", choices=["json", "yaml", "docker", "singularity"], + default="json", help="Output format") + + # Enable/disable commands + for action in ["enable", "disable"]: + action_parser = subparsers.add_parser(action, help=f"{action.capitalize()} a mount") + action_parser.add_argument("mount", choices=["isaaclab", "rsl_rl"], help="Mount to modify") + + # Set path command + set_parser = subparsers.add_parser("set", help="Set mount path") + set_parser.add_argument("mount", choices=["isaaclab", "rsl_rl"], help="Mount to modify") + set_parser.add_argument("path", help="Local path to mount") + + # Set cluster path command + set_cluster_parser = subparsers.add_parser("set-cluster", help="Set cluster path for mount-only mode") + set_cluster_parser.add_argument("mount", choices=["isaaclab", "rsl_rl"], help="Mount to modify") + set_cluster_parser.add_argument("path", help="Cluster path to mount") + + # Sync mode commands + sync_parser = subparsers.add_parser("set-sync", help="Set sync mode for a mount") + sync_parser.add_argument("mount", choices=["isaaclab", "rsl_rl"], help="Mount to modify") + sync_parser.add_argument("mode", choices=["on", "off"], help="Enable or disable sync to cluster") + + args = parser.parse_args() + + # Initialize config + config = MountConfig() + + if args.command == "setup": + config.interactive_setup() + config.generate_docker_compose_override() + + elif args.command == "generate": + config.generate_docker_compose_override(args.profiles) + + elif args.command == "validate": + all_valid = True + for mount_name in config.config["mounts"]: + mount = config.config["mounts"][mount_name] + if mount["enabled"]: + valid, msg = config.validate_mount(mount_name) + status = "βœ“" if valid else "βœ—" + print(f"{status} {mount_name}: {msg}") + if not valid: + all_valid = False + + sys.exit(0 if all_valid else 1) + + elif args.command == "show": + if args.format == "json": + print(json.dumps(config.config, indent=2)) + elif args.format == "yaml": + print(yaml.dump(config.config, default_flow_style=False)) + elif args.format == "docker": + for profile in ["ext", "ext-dev", "ext-ros2"]: + mounts = config.get_docker_mounts(profile) + if mounts: + print(f"\n{profile}:") + for mount in mounts: + print(f" {mount['source']} -> {mount['target']}") + elif args.format == "singularity": + binds = config.get_singularity_binds() + if binds: + print("Singularity binds:") + print(binds) + + elif args.command == "enable": + config.config["mounts"][args.mount]["enabled"] = True + config.save_config() + print(f"Enabled {args.mount} mount") + config.generate_docker_compose_override() + + elif args.command == "disable": + config.config["mounts"][args.mount]["enabled"] = False + config.save_config() + print(f"Disabled {args.mount} mount") + config.generate_docker_compose_override() + + elif args.command == "set": + config.config["mounts"][args.mount]["local_path"] = args.path + config.save_config() + valid, msg = config.validate_mount(args.mount) + if valid: + print(f"Set {args.mount} path to: {args.path}") + config.generate_docker_compose_override() + else: + print(f"Error: {msg}") + sys.exit(1) + + elif args.command == "set-cluster": + config.config["mounts"][args.mount]["cluster_path"] = args.path + config.config["mounts"][args.mount]["sync_to_cluster"] = False + config.save_config() + print(f"Set {args.mount} cluster path to: {args.path}") + print(f"Sync disabled for {args.mount} (mount-only mode)") + config.generate_docker_compose_override() + + elif args.command == "set-sync": + sync_enabled = args.mode == "on" + config.config["mounts"][args.mount]["sync_to_cluster"] = sync_enabled + if sync_enabled: + # Clear cluster path when enabling sync + config.config["mounts"][args.mount]["cluster_path"] = "" + config.save_config() + print(f"Sync {'enabled' if sync_enabled else 'disabled'} for {args.mount}") + if not sync_enabled and not config.config["mounts"][args.mount]["cluster_path"]: + print(f"Warning: No cluster path set. Use 'set-cluster' to specify cluster path.") + config.generate_docker_compose_override() + + else: + parser.print_help() + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/docker/utils/__init__.py b/docker/utils/__init__.py new file mode 100644 index 00000000..ddbda704 --- /dev/null +++ b/docker/utils/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2022-2024, The Isaac Lab Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +from .container_interface import ContainerInterface + +__all__ = ["ContainerInterface"] diff --git a/docker/utils/container_interface.py b/docker/utils/container_interface.py new file mode 100644 index 00000000..83d8c412 --- /dev/null +++ b/docker/utils/container_interface.py @@ -0,0 +1,295 @@ +# Copyright (c) 2022-2025, The Isaac Lab Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import os +import shutil +import subprocess +from pathlib import Path +from typing import Any + +from .state_file import StateFile + + +class ContainerInterface: + """A helper class for managing Isaac Lab containers.""" + + def __init__( + self, + context_dir: Path, + profile: str = "base", + yamls: list[str] | None = None, + envs: list[str] | None = None, + statefile: StateFile | None = None, + ): + """Initialize the container interface with the given parameters. + + Args: + context_dir: The context directory for Docker operations. + profile: The profile name for the container. Defaults to "base". + yamls: A list of yaml files to extend ``docker-compose.yaml`` settings. These are extended in the order + they are provided. + envs: A list of environment variable files to extend the ``.env.base`` file. These are extended in the order + they are provided. + statefile: An instance of the :class:`Statefile` class to manage state variables. Defaults to None, in + which case a new configuration object is created by reading the configuration file at the path + ``context_dir/.container.cfg``. + """ + # set the context directory + self.context_dir = context_dir + + # create a state-file if not provided + # the state file is a manager of run-time state variables that are saved to a file + if statefile is None: + self.statefile = StateFile(path=self.context_dir / ".container.cfg") + else: + self.statefile = statefile + + # set the profile and container name + self.profile = profile + if self.profile == "isaaclab": + # Silently correct from isaaclab to base, because isaaclab is a commonly passed arg + # but not a real profile + self.profile = "base" + + self.container_name = f"isaac-lab-{self.profile}" + self.image_name = f"isaac-lab-{self.profile}:latest" + + # keep the environment variables from the current environment + self.environ = os.environ + + # resolve the image extension through the passed yamls and envs + self._resolve_image_extension(yamls, envs) + # load the environment variables from the .env files + self._parse_dot_vars() + + """ + Operations. + """ + + def is_container_running(self) -> bool: + """Check if the container is running. + + Returns: + True if the container is running, otherwise False. + """ + status = subprocess.run( + ["docker", "container", "inspect", "-f", "{{.State.Status}}", self.container_name], + capture_output=True, + text=True, + check=False, + ).stdout.strip() + return status == "running" + + def does_image_exist(self) -> bool: + """Check if the Docker image exists. + + Returns: + True if the image exists, otherwise False. + """ + result = subprocess.run(["docker", "image", "inspect", self.image_name], capture_output=True, text=True) + return result.returncode == 0 + + def start(self): + """Build and start the Docker container using the Docker compose command.""" + print( + f"[INFO] Building the docker image and starting the container '{self.container_name}' in the" + " background...\n" + ) + + # build the image for the base profile + subprocess.run( + [ + "docker", + "compose", + "--file", + "docker-compose.yaml", + "--env-file", + ".env.base", + "build", + "isaac-lab-base", + ], + check=False, + cwd=self.context_dir, + env=self.environ, + ) + + # build the image for the profile + subprocess.run( + ["docker", "compose"] + + self.add_yamls + + self.add_profiles + + self.add_env_files + + ["up", "--detach", "--build", "--remove-orphans"], + check=False, + cwd=self.context_dir, + env=self.environ, + ) + + def enter(self): + """Enter the running container by executing a bash shell. + + Raises: + RuntimeError: If the container is not running. + """ + if self.is_container_running(): + print(f"[INFO] Entering the existing '{self.container_name}' container in a bash session...\n") + subprocess.run([ + "docker", + "exec", + "--interactive", + "--tty", + *(["-e", f"DISPLAY={os.environ['DISPLAY']}"] if "DISPLAY" in os.environ else []), + f"{self.container_name}", + "bash", + ]) + else: + raise RuntimeError(f"The container '{self.container_name}' is not running.") + + def stop(self): + """Stop the running container using the Docker compose command. + + Raises: + RuntimeError: If the container is not running. + """ + if self.is_container_running(): + print(f"[INFO] Stopping the launched docker container '{self.container_name}'...\n") + subprocess.run( + ["docker", "compose"] + self.add_yamls + self.add_profiles + self.add_env_files + ["down"], + check=False, + cwd=self.context_dir, + env=self.environ, + ) + else: + raise RuntimeError(f"Can't stop container '{self.container_name}' as it is not running.") + + def copy(self, output_dir: Path | None = None): + """Copy artifacts from the running container to the host machine. + + Args: + output_dir: The directory to copy the artifacts to. Defaults to None, in which case + the context directory is used. + + Raises: + RuntimeError: If the container is not running. + """ + if self.is_container_running(): + print(f"[INFO] Copying artifacts from the '{self.container_name}' container...\n") + if output_dir is None: + output_dir = self.context_dir + + # create a directory to store the artifacts + output_dir = output_dir.joinpath("artifacts") + if not output_dir.is_dir(): + output_dir.mkdir() + + # define dictionary of mapping from docker container path to host machine path + docker_isaac_lab_path = Path(self.dot_vars["DOCKER_ISAACLAB_PATH"]) + artifacts = { + docker_isaac_lab_path.joinpath("logs"): output_dir.joinpath("logs"), + docker_isaac_lab_path.joinpath("docs/_build"): output_dir.joinpath("docs"), + docker_isaac_lab_path.joinpath("data_storage"): output_dir.joinpath("data_storage"), + } + # print the artifacts to be copied + for container_path, host_path in artifacts.items(): + print(f"\t -{container_path} -> {host_path}") + # remove the existing artifacts + for path in artifacts.values(): + shutil.rmtree(path, ignore_errors=True) + + # copy the artifacts + for container_path, host_path in artifacts.items(): + subprocess.run( + [ + "docker", + "cp", + f"isaac-lab-{self.profile}:{container_path}/", + f"{host_path}", + ], + check=False, + ) + print("\n[INFO] Finished copying the artifacts from the container.") + else: + raise RuntimeError(f"The container '{self.container_name}' is not running.") + + def config(self, output_yaml: Path | None = None): + """Process the Docker compose configuration based on the passed yamls and environment files. + + If the :attr:`output_yaml` is not None, the configuration is written to the file. Otherwise, it is printed to + the terminal. + + Args: + output_yaml: The path to the yaml file where the configuration is written to. Defaults + to None, in which case the configuration is printed to the terminal. + """ + print("[INFO] Configuring the passed options into a yaml...\n") + + # resolve the output argument + if output_yaml is not None: + output = ["--output", output_yaml] + else: + output = [] + + # run the docker compose config command to generate the configuration + subprocess.run( + ["docker", "compose"] + self.add_yamls + self.add_profiles + self.add_env_files + ["config"] + output, + check=False, + cwd=self.context_dir, + env=self.environ, + ) + + """ + Helper functions. + """ + + def _resolve_image_extension(self, yamls: list[str] | None = None, envs: list[str] | None = None): + """ + Resolve the image extension by setting up YAML files, profiles, and environment files for the Docker compose command. + + Args: + yamls: A list of yaml files to extend ``docker-compose.yaml`` settings. These are extended in the order + they are provided. + envs: A list of environment variable files to extend the ``.env.base`` file. These are extended in the order + they are provided. + """ + self.add_yamls = ["--file", "docker-compose.yaml"] + self.add_profiles = ["--profile", f"{self.profile}"] + self.add_env_files = ["--env-file", ".env.base"] + + # extend env file based on profile + if self.profile != "base": + self.add_env_files += ["--env-file", f".env.{self.profile}"] + + # extend the env file based on the passed envs + if envs is not None: + for env in envs: + self.add_env_files += ["--env-file", env] + + # extend the docker-compose.yaml based on the passed yamls + if yamls is not None: + for yaml in yamls: + self.add_yamls += ["--file", yaml] + + def _parse_dot_vars(self): + """Parse the environment variables from the .env files. + + Based on the passed ".env" files, this function reads the environment variables and stores them in a dictionary. + The environment variables are read in order and overwritten if there are name conflicts, mimicking the behavior + of Docker compose. + """ + self.dot_vars: dict[str, Any] = {} + + # check if the number of arguments is even for the env files + if len(self.add_env_files) % 2 != 0: + raise RuntimeError( + "The parameters for env files are configured incorrectly. There should be an even number of arguments." + f" Received: {self.add_env_files}." + ) + + # read the environment variables from the .env files + for i in range(1, len(self.add_env_files), 2): + with open(self.context_dir / self.add_env_files[i]) as f: + self.dot_vars.update(dict(line.strip().split("=", 1) for line in f if "=" in line)) diff --git a/docker/utils/state_file.py b/docker/utils/state_file.py new file mode 100644 index 00000000..dd19a72c --- /dev/null +++ b/docker/utils/state_file.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022-2025, The Isaac Lab Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +from __future__ import annotations + +import configparser +from configparser import ConfigParser +from pathlib import Path +from typing import Any + + +class StateFile: + """A class to manage state variables parsed from a configuration file. + + This class provides a simple interface to set, get, and delete variables from a configuration + object. It also provides the ability to save the configuration object to a file. + + It thinly wraps around the ConfigParser class from the configparser module. + """ + + def __init__(self, path: Path, namespace: str | None = None): + """Initialize the class instance and load the configuration file. + + Args: + path: The path to the configuration file. + namespace: The default namespace to use when setting and getting variables. + Namespace corresponds to a section in the configuration file. Defaults to None, + meaning all member functions will have to specify the section explicitly, + or :attr:`StateFile.namespace` must be set manually. + """ + self.path = path + self.namespace = namespace + + # load the configuration file + self.load() + + def __del__(self): + """ + Save the loaded configuration to the initial file path upon deconstruction. This helps + ensure that the configuration file is always up to date. + """ + # save the configuration file + self.save() + + """ + Operations. + """ + + def set_variable(self, key: str, value: Any, section: str | None = None): + """Set a variable into the configuration object. + + Note: + Since we use the ConfigParser class, the section names are case-sensitive but the keys are not. + + Args: + key: The key of the variable to be set. + value: The value of the variable to be set. + section: The section of the configuration object to set the variable in. + Defaults to None, in which case the default section is used. + + Raises: + configparser.Error: If no section is specified and the default section is None. + """ + # resolve the section + if section is None: + if self.namespace is None: + raise configparser.Error("No section specified. Please specify a section or set StateFile.namespace.") + section = self.namespace + + # create section if it does not exist + if section not in self.loaded_cfg.sections(): + self.loaded_cfg.add_section(section) + # set the variable + self.loaded_cfg.set(section, key, value) + + def get_variable(self, key: str, section: str | None = None) -> Any: + """Get a variable from the configuration object. + + Note: + Since we use the ConfigParser class, the section names are case-sensitive but the keys are not. + + Args: + key: The key of the variable to be loaded. + section: The section of the configuration object to read the variable from. + Defaults to None, in which case the default section is used. + + Returns: + The value of the variable. It is None if the key does not exist. + + Raises: + configparser.Error: If no section is specified and the default section is None. + """ + # resolve the section + if section is None: + if self.namespace is None: + raise configparser.Error("No section specified. Please specify a section or set StateFile.namespace.") + section = self.namespace + + return self.loaded_cfg.get(section, key, fallback=None) + + def delete_variable(self, key: str, section: str | None = None): + """Delete a variable from the configuration object. + + Note: + Since we use the ConfigParser class, the section names are case-sensitive but the keys are not. + + Args: + key: The key of the variable to be deleted. + section: The section of the configuration object to remove the variable from. + Defaults to None, in which case the default section is used. + + Raises: + configparser.Error: If no section is specified and the default section is None. + configparser.NoSectionError: If the section does not exist in the configuration object. + configparser.NoOptionError: If the key does not exist in the section. + """ + # resolve the section + if section is None: + if self.namespace is None: + raise configparser.Error("No section specified. Please specify a section or set StateFile.namespace.") + section = self.namespace + + # check if the section exists + if section not in self.loaded_cfg.sections(): + raise configparser.NoSectionError(f"Section '{section}' does not exist in the file: {self.path}") + + # check if the key exists + if self.loaded_cfg.has_option(section, key): + self.loaded_cfg.remove_option(section, key) + else: + raise configparser.NoOptionError(option=key, section=section) + + """ + Operations - File I/O. + """ + + def load(self): + """Load the configuration file into memory. + + This function reads the contents of the configuration file into memory. + If the file does not exist, it creates an empty file. + """ + self.loaded_cfg = ConfigParser() + self.loaded_cfg.read(self.path) + + def save(self): + """Save the configuration file to disk.""" + with open(self.path, "w+") as f: + self.loaded_cfg.write(f) \ No newline at end of file diff --git a/docker/utils/x11_utils.py b/docker/utils/x11_utils.py new file mode 100644 index 00000000..39d50534 --- /dev/null +++ b/docker/utils/x11_utils.py @@ -0,0 +1,227 @@ +# Copyright (c) 2022-2024, The Isaac Lab Project Developers. +# All rights reserved. +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Utility functions for managing X11 forwarding in the docker container.""" + +from __future__ import annotations + +import os +import shutil +import subprocess +import sys +from pathlib import Path + +from .state_file import StateFile + + +# This method of x11 enabling forwarding was inspired by osrf/rocker +# https://github.com/osrf/rocker +def configure_x11(statefile: StateFile) -> dict[str, str]: + """Configure X11 forwarding by creating and managing a temporary .xauth file. + + If xauth is not installed, the function prints an error message and exits. The message + instructs the user to install xauth with 'apt install xauth'. + + If the .xauth file does not exist, the function creates it and configures it with the necessary + xauth cookie. + + Args: + statefile: An instance of the configuration file class. + + Returns: + A dictionary with two key-value pairs: + + - "__ISAACLAB_TMP_XAUTH": The path to the temporary .xauth file. + - "__ISAACLAB_TMP_DIR": The path to the directory where the temporary .xauth file is stored. + + """ + # check if xauth is installed + if not shutil.which("xauth"): + print("[INFO] xauth is not installed.") + print("[INFO] Please install it with 'apt install xauth'") + exit(1) + + # set the namespace to X11 for the statefile + statefile.namespace = "X11" + # load the value of the temporary xauth file + tmp_xauth_value = statefile.get_variable("__ISAACLAB_TMP_XAUTH") + + if tmp_xauth_value is None or not Path(tmp_xauth_value).exists(): + # create a temporary directory to store the .xauth file + tmp_dir = subprocess.run(["mktemp", "-d"], capture_output=True, text=True, check=True).stdout.strip() + # create the .xauth file + tmp_xauth_value = create_x11_tmpfile(tmpdir=Path(tmp_dir)) + # set the statefile variable + statefile.set_variable("__ISAACLAB_TMP_XAUTH", str(tmp_xauth_value)) + else: + tmp_dir = Path(tmp_xauth_value).parent + + return {"__ISAACLAB_TMP_XAUTH": str(tmp_xauth_value), "__ISAACLAB_TMP_DIR": str(tmp_dir)} + + +def x11_check(statefile: StateFile) -> tuple[list[str], dict[str, str]] | None: + """Check and configure X11 forwarding based on user input and existing state. + + This function checks if X11 forwarding is enabled in the configuration file. If it is not configured, + the function prompts the user to enable or disable X11 forwarding. If X11 forwarding is enabled, the function + configures X11 forwarding by creating a temporary .xauth file. + + Args: + statefile: An instance of the configuration file class. + + Returns: + If X11 forwarding is enabled, the function returns a tuple containing the following: + + - A list containing the x11.yaml file configuration option for docker-compose. + - A dictionary containing the environment variables for the container. + + If X11 forwarding is disabled, the function returns None. + """ + # set the namespace to X11 for the statefile + statefile.namespace = "X11" + # check if X11 forwarding is enabled + is_x11_forwarding_enabled = statefile.get_variable("X11_FORWARDING_ENABLED") + + if is_x11_forwarding_enabled is None: + print("[INFO] X11 forwarding from the Isaac Lab container is disabled by default.") + print( + "[INFO] It will fail if there is no display, or this script is being run via ssh without proper" + " configuration." + ) + x11_answer = input("Would you like to enable it? (y/N) ") + + # parse the user's input + if x11_answer.lower() == "y": + is_x11_forwarding_enabled = "1" + print("[INFO] X11 forwarding is enabled from the container.") + else: + is_x11_forwarding_enabled = "0" + print("[INFO] X11 forwarding is disabled from the container.") + + # remember the user's choice and set the statefile variable + statefile.set_variable("X11_FORWARDING_ENABLED", is_x11_forwarding_enabled) + else: + # print the current configuration + print(f"[INFO] X11 Forwarding is configured as '{is_x11_forwarding_enabled}' in '.container.cfg'.") + + # print help message to enable/disable X11 forwarding + if is_x11_forwarding_enabled == "1": + print("\tTo disable X11 forwarding, set 'X11_FORWARDING_ENABLED=0' in '.container.cfg'.") + else: + print("\tTo enable X11 forwarding, set 'X11_FORWARDING_ENABLED=1' in '.container.cfg'.") + + if is_x11_forwarding_enabled == "1": + x11_envars = configure_x11(statefile) + # If X11 forwarding is enabled, return the proper args to + # compose the x11.yaml file. Else, return an empty string. + return ["--file", "x11.yaml"], x11_envars + + return None + + +def x11_cleanup(statefile: StateFile): + """Clean up the temporary .xauth file used for X11 forwarding. + + If the .xauth file exists, this function deletes it and remove the corresponding state variable. + + Args: + statefile: An instance of the configuration file class. + """ + # set the namespace to X11 for the statefile + statefile.namespace = "X11" + + # load the value of the temporary xauth file + tmp_xauth_value = statefile.get_variable("__ISAACLAB_TMP_XAUTH") + + # if the file exists, delete it and remove the state variable + if tmp_xauth_value is not None and Path(tmp_xauth_value).exists(): + print(f"[INFO] Removing temporary Isaac Lab '.xauth' file: {tmp_xauth_value}.") + Path(tmp_xauth_value).unlink() + statefile.delete_variable("__ISAACLAB_TMP_XAUTH") + + +def create_x11_tmpfile(tmpfile: Path | None = None, tmpdir: Path | None = None) -> Path: + """Creates an .xauth file with an MIT-MAGIC-COOKIE derived from the current ``DISPLAY`` environment variable. + + Args: + tmpfile: A Path to a file which will be filled with the correct .xauth info. + tmpdir: A Path to the directory where a random tmp file will be made. + This is used as an ``--tmpdir arg`` to ``mktemp`` bash command. + + Returns: + The Path to the .xauth file. + """ + if tmpfile is None: + if tmpdir is None: + add_tmpdir = "" + else: + add_tmpdir = f"--tmpdir={tmpdir}" + # Create .tmp file with .xauth suffix + tmp_xauth = Path( + subprocess.run( + ["mktemp", "--suffix=.xauth", f"{add_tmpdir}"], capture_output=True, text=True, check=True + ).stdout.strip() + ) + else: + tmpfile.touch() + tmp_xauth = tmpfile + + # Derive current MIT-MAGIC-COOKIE and make it universally addressable + xauth_cookie = subprocess.run( + ["xauth", "nlist", os.environ["DISPLAY"]], capture_output=True, text=True, check=True + ).stdout.replace("ffff", "") + + # Merge the new cookie into the create .tmp file + subprocess.run(["xauth", "-f", tmp_xauth, "nmerge", "-"], input=xauth_cookie, text=True, check=True) + + return tmp_xauth + + +def x11_refresh(statefile: StateFile): + """Refresh the temporary .xauth file used for X11 forwarding. + + If x11 is enabled, this function generates a new .xauth file with the current MIT-MAGIC-COOKIE-1. + The new file uses the same filename so that the bind-mount and ``XAUTHORITY`` var from build-time + still work. + + As the envar ``DISPLAY` informs the contents of the MIT-MAGIC-COOKIE-1, that value within the container + will also need to be updated to the current value on the host. Currently, this done automatically in + :meth:`ContainerInterface.enter` method. + + The function exits if X11 forwarding is enabled but the temporary .xauth file does not exist. In this case, + the user must rebuild the container. + + Args: + statefile: An instance of the configuration file class. + """ + # set the namespace to X11 for the statefile + statefile.namespace = "X11" + + # check if X11 forwarding is enabled + is_x11_forwarding_enabled = statefile.get_variable("X11_FORWARDING_ENABLED") + # load the value of the temporary xauth file + tmp_xauth_value = statefile.get_variable("__ISAACLAB_TMP_XAUTH") + + # print the current configuration + if is_x11_forwarding_enabled is not None: + status = "enabled" if is_x11_forwarding_enabled == "1" else "disabled" + print(f"[INFO] X11 Forwarding is {status} from the settings in '.container.cfg'") + + # if the file exists, delete it and create a new one + if tmp_xauth_value is not None and Path(tmp_xauth_value).exists(): + # remove the file and create a new one + Path(tmp_xauth_value).unlink() + create_x11_tmpfile(tmpfile=Path(tmp_xauth_value)) + # update the statefile with the new path + statefile.set_variable("__ISAACLAB_TMP_XAUTH", str(tmp_xauth_value)) + elif tmp_xauth_value is None: + if is_x11_forwarding_enabled is not None and is_x11_forwarding_enabled == "1": + print( + "[ERROR] X11 forwarding is enabled but the temporary .xauth file does not exist." + " Please rebuild the container by running: './docker/container.py start'" + ) + sys.exit(1) + else: + print("[INFO] X11 forwarding is disabled. No action taken.") \ No newline at end of file diff --git a/docker/x11.yaml b/docker/x11.yaml new file mode 100644 index 00000000..e67c4f47 --- /dev/null +++ b/docker/x11.yaml @@ -0,0 +1,54 @@ +services: + isaac-lab-ext: + environment: + - DISPLAY + - TERM + - QT_X11_NO_MITSHM=1 + - XAUTHORITY=${__ISAACLAB_TMP_XAUTH} + volumes: + - type: bind + source: ${__ISAACLAB_TMP_DIR} + target: ${__ISAACLAB_TMP_DIR} + - type: bind + source: /tmp/.X11-unix + target: /tmp/.X11-unix + - type: bind + source: /etc/localtime + target: /etc/localtime + read_only: true + + isaac-lab-ext-dev: + environment: + - DISPLAY + - TERM + - QT_X11_NO_MITSHM=1 + - XAUTHORITY=${__ISAACLAB_TMP_XAUTH} + volumes: + - type: bind + source: ${__ISAACLAB_TMP_DIR} + target: ${__ISAACLAB_TMP_DIR} + - type: bind + source: /tmp/.X11-unix + target: /tmp/.X11-unix + - type: bind + source: /etc/localtime + target: /etc/localtime + read_only: true + + isaac-lab-ext-dev-rootless: + environment: + - DISPLAY + - TERM + - QT_X11_NO_MITSHM=1 + - XAUTHORITY=${__ISAACLAB_TMP_XAUTH} + volumes: + - type: bind + source: ${__ISAACLAB_TMP_DIR} + target: ${__ISAACLAB_TMP_DIR} + - type: bind + source: /tmp/.X11-unix + target: /tmp/.X11-unix + - type: bind + source: /etc/localtime + target: /etc/localtime + read_only: true \ No newline at end of file From be95879eb579702b37974f88a223c993339f55ec Mon Sep 17 00:00:00 2001 From: Idate96 Date: Thu, 29 May 2025 19:24:24 +0200 Subject: [PATCH 02/10] refactor: consolidate Docker documentation into README files - Merged DOCKER_ARCHITECTURE.md and MOUNT_SYSTEM_GUIDE.md into docker/README.md - Created docker/cluster/README.md for cluster-specific operations - Added missing sync_experiments.sh script from moleworks_ext - Follows moleworks_ext documentation structure --- docker/DOCKER_ARCHITECTURE.md | 310 ----------------------- docker/MOUNT_SYSTEM_GUIDE.md | 389 ----------------------------- docker/README.md | 387 ++++++++++++++++++++++++++++ docker/cluster/README.md | 268 ++++++++++++++++++++ docker/cluster/sync_experiments.sh | 144 +++++++++++ 5 files changed, 799 insertions(+), 699 deletions(-) delete mode 100644 docker/DOCKER_ARCHITECTURE.md delete mode 100644 docker/MOUNT_SYSTEM_GUIDE.md create mode 100644 docker/README.md create mode 100644 docker/cluster/README.md create mode 100644 docker/cluster/sync_experiments.sh diff --git a/docker/DOCKER_ARCHITECTURE.md b/docker/DOCKER_ARCHITECTURE.md deleted file mode 100644 index 06095e28..00000000 --- a/docker/DOCKER_ARCHITECTURE.md +++ /dev/null @@ -1,310 +0,0 @@ -# Docker Container Architecture - -This document describes the simplified Docker container architecture for IsaacLab extensions. - -## Overview - -The Docker setup has been simplified from 4 containers to 2 containers: - -1. **Production Container** (`isaac-lab-ext`) - For cluster deployment and training -2. **Development Container** (`isaac-lab-ext-dev`) - Unified development container with ROS2 and dual-mode support - -## Container Details - -### 1. Production Container (Dockerfile.ext) - -- **Purpose**: Minimal container for cluster deployment and training -- **Base Image**: `isaac-lab-base` -- **Features**: - - Minimal package installation for reduced size - - Includes rsl_rl for reinforcement learning - - Optimized for training performance - - No development tools or ROS2 - -**Usage**: -```bash -./container.sh -p ext build -./container.sh -p ext run -``` - -### 2. Development Container (Dockerfile.ext-dev) - -- **Purpose**: Unified development environment with all features -- **Base Image**: `isaac-lab-ros2` -- **Features**: - - All ROS2 packages and dependencies - - Development tools (Claude Code, git-lfs, pytest, etc.) - - Pinocchio robotics library - - CUDA toolkit - - Dual-mode support (root/rootless) - -**Usage**: -```bash -# Standard mode (with user switching) -./container.sh -p ext-dev build -./container.sh -p ext-dev run - -# Rootless mode (everyone runs as root) -./container.sh -p ext-dev-rootless run -``` - -## Dual-Mode Operation - -The development container supports two modes of operation: - -### Root Mode (Default) -- Traditional Docker behavior with user switching -- Uses gosu to switch to the host user inside container -- Preserves host user permissions -- Full access to host filesystem via bind mounts - -### Rootless Mode -- Everyone runs as root inside the container -- Simplified permission model -- Reduced host system access -- Suitable for environments where Docker daemon doesn't have root access - -### Mode Selection - -The mode is controlled by the `DOCKER_ROOTLESS_MODE` environment variable: - -```bash -# Force rootless mode -export DOCKER_ROOTLESS_MODE=true -./container.sh -p ext-dev run - -# Or use the rootless service -./container.sh -p ext-dev-rootless run -``` - -## Flexible Permission System - -The development container implements a sophisticated permission management system that handles various deployment scenarios and permission requirements. - -### How It Works - -#### 1. Dynamic User Creation (Root Mode) -When running in root mode (default), the container dynamically creates a user matching your host system: - -```bash -# Automatically detects host user -USER_ID=${LOCAL_UID:-$(id -u)} # Your host UID -GROUP_ID=${LOCAL_GID:-$(id -g)} # Your host GID - -# Creates matching user inside container -groupadd -g $GROUP_ID -o $USER_NAME -useradd -m -u $USER_ID -g $GROUP_ID -o -s /bin/bash $USER_NAME - -# Switches to that user with gosu -exec gosu $USER_NAME bash -``` - -This ensures files created in the container have the same ownership as on your host system. - -#### 2. Rootless Mode Operation -In rootless mode, everyone runs as root inside the container: -- No user switching occurs -- Simplified permission model -- Ideal for Docker installations without root access -- All operations happen as UID 0 within container - -#### 3. Automatic Permission Fixing -The `FIX_PERMISSIONS` feature automatically corrects file ownership when the container exits: - -```bash -# Enable permission fixing -export FIX_PERMISSIONS=true -./docker/run_dev.sh - -# On container exit, automatically runs: -find "/workspace/${EXTENSION_NAME}" -user root -exec chown ${USER_ID}:${GROUP_ID} {} \; -``` - -#### 4. Background Permission Setup -Large directories are fixed in the background to avoid startup delays: -- `/isaac-sim/kit` permissions are corrected asynchronously -- Container is immediately usable while permissions are being fixed -- Check `~/.permissions_done` file to verify completion - -### Usage Scenarios - -#### Scenario 1: Personal Development Machine -```bash -# Standard mode - preserves your user permissions -./docker/run_dev.sh -# Files created as: youruser:yourgroup -``` - -#### Scenario 2: Student PC (No Root Docker) -```bash -# Rootless mode - everyone is root inside -./docker/run_dev.sh --rootless -# Files created as: root:root (inside container) -``` - -#### Scenario 3: Shared Development Server -```bash -# Custom UID/GID with permission fixing -./docker/run_dev.sh -u 2000 -g 2000 --fix-perms -# Files created as: uid=2000:gid=2000 -``` - -#### Scenario 4: CI/CD Pipeline -```bash -# Rootless with no permission concerns -export DOCKER_ROOTLESS_MODE=true -./docker/container.sh -p ext-dev run python scripts/test.py -``` - -### Environment Variables - -| Variable | Default | Description | -|----------|---------|-------------| -| `DOCKER_ROOTLESS_MODE` | `false` | Enable rootless mode (true/false) | -| `FIX_PERMISSIONS` | `false` | Auto-fix permissions on exit | -| `LOCAL_UID` | Current user | Override user ID | -| `LOCAL_GID` | Current group | Override group ID | -| `DOCKER_USER_NAME` | `user` | Username inside container | -| `DOCKER_USER_HOME` | `/home/$USER` | Home directory path | - -### Permission Decision Tree - -``` -Start Container - β”‚ - β”œβ”€ DOCKER_ROOTLESS_MODE=true? - β”‚ β”‚ - β”‚ └─ Yes β†’ Run as root (UID 0) - β”‚ No user switching - β”‚ Simplified permissions - β”‚ - └─ No β†’ Create user with LOCAL_UID/LOCAL_GID - β”‚ - β”œβ”€ Switch to user with gosu - β”‚ - └─ FIX_PERMISSIONS=true? - β”‚ - └─ Yes β†’ Install exit trap - Fix ownership on exit -``` - -## Migration Guide - -### From Old Setup - -If you were using an older multi-container setup: - -1. **ext** β†’ No changes needed -2. **ext-dev** β†’ Use new unified ext-dev -3. **ext-ros2** β†’ Use new unified ext-dev (includes ROS2) -4. **ext-dev-rootless** β†’ Use ext-dev-rootless service - -### Building Containers - -```bash -# Build production container -./container.sh -p ext build - -# Build development container -./container.sh -p ext-dev build -``` - -### Environment Files - -The following environment files are still used: -- `.env.ext_template` - Production container -- `.env.ext_template-dev` - Development container - -## Container Features Comparison - -| Feature | Production (ext) | Development (ext-dev) | -|---------|-----------------|---------------------| -| Base Image | isaac-lab-base | isaac-lab-ros2 | -| ROS2 | ❌ | βœ… | -| Claude Code | ❌ | βœ… | -| Development Tools | ❌ | βœ… | -| Pinocchio | ❌ | βœ… | -| CUDA Toolkit | ❌ | βœ… | -| Size | Minimal | Full | -| Dual-Mode | ❌ | βœ… | - -## Best Practices - -1. **Use Production Container for**: - - Cluster training jobs - - Performance testing - - Deployment scenarios - -2. **Use Development Container for**: - - Local development - - ROS2 integration work - - Testing and debugging - - Running with different permission models - -3. **Permission Management**: - - Enable `FIX_PERMISSIONS` when working with mounted volumes - - Use rootless mode on systems without root Docker access - - Check `.permissions_done` file to verify background tasks completed - -## Troubleshooting - -### Common Permission Issues and Solutions - -#### Issue 1: "Permission denied" errors -**Symptom**: Can't write to mounted directories -```bash -# Solution 1: Use rootless mode -./docker/run_dev.sh --rootless - -# Solution 2: Enable permission fixing -./docker/run_dev.sh --fix-perms - -# Solution 3: Manually fix permissions -sudo chown -R $(id -u):$(id -g) /workspace/ext_template -``` - -#### Issue 2: Files created as root on host -**Symptom**: After running container, files are owned by root -```bash -# Prevention: Always use FIX_PERMISSIONS -export FIX_PERMISSIONS=true -./docker/run_dev.sh - -# Fix existing files -sudo chown -R $(id -u):$(id -g) . -``` - -#### Issue 3: Container startup is slow -**Symptom**: Long wait before container is usable -```bash -# Check if permissions are still being fixed -./docker/container.sh -p ext-dev exec cat ~/.permissions_done -# If file exists, background setup is complete -``` - -#### Issue 4: Can't access GPU in rootless mode -**Symptom**: nvidia-smi fails in container -```bash -# Ensure NVIDIA_DRIVER_CAPABILITIES is set -export NVIDIA_DRIVER_CAPABILITIES=all -./docker/run_dev.sh --rootless -``` - -### Rootless Mode Issues -```bash -# Verify rootless mode is active -echo $DOCKER_ROOTLESS_MODE - -# Check container user -./container.sh -p ext-dev-rootless exec whoami -# Should output: root -``` - -### Build Issues -```bash -# Clean build with no cache -docker compose build --no-cache isaac-lab-ext-dev - -# Remove old images -docker image prune -f -``` \ No newline at end of file diff --git a/docker/MOUNT_SYSTEM_GUIDE.md b/docker/MOUNT_SYSTEM_GUIDE.md deleted file mode 100644 index fa67b7d6..00000000 --- a/docker/MOUNT_SYSTEM_GUIDE.md +++ /dev/null @@ -1,389 +0,0 @@ -# Unified Mount System Guide - -## Overview - -The unified mount system provides a consistent and flexible way to optionally mount external codebases (IsaacLab and RSL-RL) in both Docker and Singularity containers. This solves the challenge of making mounts optional while maintaining compatibility across container runtimes. - -## Key Features - -- **Unified Configuration**: Single configuration file (`.mount.config`) that works for both Docker and Singularity -- **Optional Mounting**: Easily enable/disable mounts without modifying docker-compose files -- **Validation**: Automatic validation of mount paths before container startup -- **Interactive Setup**: User-friendly setup process for configuring mounts -- **Backward Compatibility**: Legacy environment variables still work for cluster operations - -## Quick Start - -### 1. Initial Setup - -Run the interactive setup to configure your mounts: - -```bash -cd docker -./container.sh mount-setup -``` - -This will: -- Create a `.mount.config` file with your mount preferences -- Validate the paths you provide -- Generate `docker-compose.override.yaml` automatically - -### 2. Running Containers - -Use the new container management script: - -```bash -# Run development container -./container.sh -p ext-dev run - -# Run with a specific script -./container.sh -p ext-ros2 run python scripts/train.py - -# Build container -./container.sh -p ext build -``` - -## Configuration File - -The `.mount.config` file stores your mount preferences: - -```json -{ - "mounts": { - "isaaclab": { - "enabled": false, - "local_path": "/path/to/your/isaaclab", - "container_path": "/workspace/isaaclab", - "mount_type": "source", - "description": "External IsaacLab installation" - }, - "rsl_rl": { - "enabled": false, - "local_path": "/path/to/your/rsl_rl", - "container_path": "/workspace/isaaclab/_isaac_sim/kit/python/lib/python3.10/site-packages/rsl_rl", - "mount_type": "full", - "description": "External RSL-RL installation" - } - } -} -``` - -### Mount Types - -- **source**: For IsaacLab - mounts only the `source/` subdirectory to preserve the container's Python environment -- **full**: For RSL-RL - completely overrides the built-in version - -## Mount Management Commands - -### Setup and Configuration - -```bash -# Interactive setup -./container.sh mount-setup - -# Show current configuration -./container.sh mount-show -./container.sh mount-show --format yaml -./container.sh mount-show --format docker -./container.sh mount-show --format singularity - -# Validate configuration -./container.sh mount-validate -``` - -### Enable/Disable Mounts - -```bash -# Enable a mount -./container.sh mount-enable isaaclab -./container.sh mount-enable rsl_rl - -# Disable a mount -./container.sh mount-disable isaaclab -./container.sh mount-disable rsl_rl -``` - -### Set Mount Paths - -```bash -# Set mount path -./container.sh mount-set isaaclab ~/my-isaaclab -./container.sh mount-set rsl_rl ~/my-rsl-rl -``` - -## Docker Usage - -The system automatically generates `docker-compose.override.yaml` based on your mount configuration. This file is used alongside the main `docker-compose.yaml`. - -### Manual Override Regeneration - -If you modify `.mount.config` directly, regenerate the override: - -```bash -./container.sh -r -p ext-dev run -# or -python3 mount_config.py generate -``` - -## Cluster/Singularity Usage - -The system supports both syncing from local to cluster and mount-only modes. - -### Mount Modes - -1. **Sync Mode** (default): Syncs codebase from local to cluster, then mounts -2. **Mount-Only Mode**: Mounts existing codebase on cluster without syncing - -### Basic Cluster Workflow (Sync Mode) - -1. Configure mounts locally: - ```bash - cd docker - ./container.sh mount-setup - # Enable mounts and set local paths - ``` - -2. Push container to cluster: - ```bash - cd cluster - ./cluster_interface.sh push ext_template - ``` - -3. Submit job (codebases are synced and mounted automatically): - ```bash - ./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 - ``` - -### Mount-Only Mode (No Sync) - -Perfect for when codebases already exist on the cluster: - -```bash -# Configure mount-only for IsaacLab -./container.sh mount-enable isaaclab -./container.sh mount-set-sync isaaclab off -./container.sh mount-set-cluster isaaclab /cluster/home/$USER/isaaclab - -# Configure mount-only for RSL-RL -./container.sh mount-enable rsl_rl -./container.sh mount-set-sync rsl_rl off -./container.sh mount-set-cluster rsl_rl /cluster/home/$USER/rsl_rl - -# Submit job (no sync, just mount) -cd cluster -./cluster_interface.sh job ext_template --task YourTask -``` - -### Mixed Mode Example - -Sync IsaacLab but mount existing RSL-RL: - -```bash -# IsaacLab: sync from local -./container.sh mount-enable isaaclab -./container.sh mount-set isaaclab ~/my-isaaclab -./container.sh mount-set-sync isaaclab on - -# RSL-RL: mount existing on cluster -./container.sh mount-enable rsl_rl -./container.sh mount-set-sync rsl_rl off -./container.sh mount-set-cluster rsl_rl /cluster/scratch/$USER/rsl_rl -``` - -## Advanced Usage - -### Using Python API - -```python -from mount_config import MountConfig - -# Load configuration -config = MountConfig() - -# Enable a mount programmatically -config.config["mounts"]["isaaclab"]["enabled"] = True -config.config["mounts"]["isaaclab"]["local_path"] = "/path/to/isaaclab" -config.save_config() - -# Generate docker-compose override -config.generate_docker_compose_override() - -# Get Singularity bind string -binds = config.get_singularity_binds() -print(binds) # -B /path/to/isaaclab/source:/workspace/isaaclab/source:rw -``` - -### Custom Mount Profiles - -You can create different mount configurations for different scenarios: - -```bash -# Save current config -cp .mount.config .mount.config.backup - -# Create development config -./container.sh mount-setup -cp .mount.config .mount.config.dev - -# Create production config -./container.sh mount-setup -cp .mount.config .mount.config.prod - -# Switch between configs -cp .mount.config.dev .mount.config -./container.sh -r -p ext-dev run -``` - -## Migration from Old System - -### Docker - -Old docker-compose.yaml approach: -```yaml -- type: bind - source: ${EXTERNAL_ISAACLAB_PATH:-/dev/null}/source - target: /workspace/isaaclab/source -``` - -New approach: -1. Remove external mount lines from docker-compose.yaml -2. Run `./container.sh mount-setup` -3. Use `./container.sh` to manage containers - -### Environment Files - -Old `.env` approach: -```bash -EXTERNAL_ISAACLAB_PATH=/path/to/isaaclab -EXTERNAL_RSL_RL_PATH=/path/to/rsl_rl -``` - -New approach: -- These variables are no longer needed in `.env` files -- Configuration is stored in `.mount.config` -- Use `./container.sh mount-set` to update paths - -## Troubleshooting - -### Mount Not Working - -1. Check configuration: - ```bash - ./container.sh mount-validate - ``` - -2. Verify override file exists: - ```bash - ls -la docker-compose.override.yaml - ``` - -3. Regenerate override: - ```bash - ./container.sh -r -p ext-dev run - ``` - -### Path Validation Errors - -- **IsaacLab**: Ensure the path contains a `source/` subdirectory -- **RSL-RL**: Ensure the path is either: - - A Python package with `__init__.py` - - A repository with `rsl_rl/` subdirectory - -### Container Can't Find Mounted Code - -1. Check mount is enabled: - ```bash - ./container.sh mount-show --format docker - ``` - -2. Verify paths inside container: - ```bash - ./container.sh -p ext-dev exec ls -la /workspace/isaaclab/source - ``` - -## Best Practices - -1. **Always Use Interactive Setup**: The `mount-setup` command validates paths and prevents common errors - -2. **Test Locally First**: Verify mounts work in Docker before pushing to cluster - -3. **Keep Built-in Versions**: When possible, use the built-in IsaacLab and RSL-RL for stability - -4. **Document Custom Setups**: If using external codebases, document the specific versions/branches required - -5. **Version Control**: Don't commit `.mount.config` or `docker-compose.override.yaml` - they're user-specific - -## Technical Details - -### How It Works - -1. **Configuration**: User preferences stored in `.mount.config` (JSON format) -2. **Docker**: `mount_config.py` generates `docker-compose.override.yaml` with bind mounts -3. **Singularity**: `run_singularity.sh` reads `.mount.config` and adds `-B` bind flags -4. **Validation**: Paths are validated before container startup to prevent runtime errors - -### Docker Compose Override Mechanism - -The system uses Docker Compose's built-in override feature to cleanly manage user-specific mounts: - -#### How Override Files Work - -When you run any docker-compose command, Docker Compose automatically: -1. Reads `docker-compose.yaml` (base configuration) -2. Looks for `docker-compose.override.yaml` in the same directory -3. **Merges** the configurations, with override values taking precedence - -#### Example Merge Process - -**Base `docker-compose.yaml`:** -```yaml -services: - isaac-lab-ext-dev: - image: isaac-lab-ext_template-dev - volumes: - - type: bind - source: ${EXT_PATH} - target: /workspace/${EXTENSION_NAME} -``` - -**Generated `docker-compose.override.yaml`:** -```yaml -services: - isaac-lab-ext-dev: - volumes: - - type: bind - source: /home/user/isaaclab/source - target: /workspace/isaaclab/source -``` - -**Result:** The container gets ALL volumes from both files merged together. - -#### Why Use Override? - -1. **Separation of Concerns**: Base config (tracked) vs user mounts (ignored) -2. **No Manual Editing**: Users never modify docker-compose.yaml -3. **Clean Git History**: No merge conflicts from different mount preferences -4. **Easy Disable**: Just delete override file to remove all custom mounts -5. **Standard Feature**: Works with all docker-compose commands automatically - -### File Structure - -``` -docker/ -β”œβ”€β”€ mount_config.py # Core mount management script -β”œβ”€β”€ container.sh # User-friendly wrapper script -β”œβ”€β”€ .mount.config # User's mount configuration (git-ignored) -β”œβ”€β”€ .mount.config.template # Template for new users -β”œβ”€β”€ docker-compose.yaml # Clean compose file without external mounts -β”œβ”€β”€ docker-compose.override.yaml # Generated mount overrides (git-ignored) -└── cluster/ - β”œβ”€β”€ run_singularity.sh # Updated to support unified config - └── sync_mounts.py # Handles selective syncing to cluster -``` - -### Integration Points - -- **Docker Compose**: Uses override mechanism to add mounts without modifying base file -- **Singularity**: Parses config file and generates bind mount arguments -- **Cluster Sync**: `.mount.config` is automatically included when syncing to cluster -- **Selective Sync**: Only syncs codebases marked with `sync_to_cluster: true` \ No newline at end of file diff --git a/docker/README.md b/docker/README.md new file mode 100644 index 00000000..d3e64a23 --- /dev/null +++ b/docker/README.md @@ -0,0 +1,387 @@ +# Docker Setup for IsaacLab Extension Template + +This guide provides comprehensive documentation for building, running, and deploying IsaacLab extension containers for both local development and cluster deployment. + +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Quick Start](#quick-start) +- [Container Architecture](#container-architecture) + - [Container Types](#container-types) + - [Permission System](#permission-system) +- [Building Containers](#building-containers) +- [Running Containers](#running-containers) + - [Development Workflow](#development-workflow) + - [Production Usage](#production-usage) +- [Mount Configuration System](#mount-configuration-system) + - [Quick Setup](#quick-setup) + - [Mount Management](#mount-management) + - [Technical Details](#technical-details) +- [Cluster Deployment](#cluster-deployment) +- [Troubleshooting](#troubleshooting) +- [Migration Guide](#migration-guide) + +## Overview + +The Docker setup provides two main containers: + +1. **Production Container** (`isaac-lab-ext`) - Minimal container for training and cluster deployment +2. **Development Container** (`isaac-lab-ext-dev`) - Full-featured container with ROS2, development tools, and dual-mode support + +Both containers include: +- Isaac Lab and Isaac Sim integration +- RSL-RL for reinforcement learning +- Your extension package +- Optional mounting of external codebases + +## Prerequisites + +1. **Docker and Docker Compose** installed on your system +2. **NVIDIA GPU** with appropriate drivers +3. **Environment files** - Create by copying templates: + ```bash + cp docker/.env.ext_template.template docker/.env.ext_template + cp docker/.env.ext_template-dev.template docker/.env.ext_template-dev + ``` + Edit these files with your settings (they're git-ignored). +4. **IsaacLab base images** - Either pull from DockerHub or build locally (see [Building Containers](#building-containers)) + +## Quick Start + +### Using container.sh (Recommended) + +```bash +# Build and run development container +./docker/container.sh -p ext-dev build +./docker/container.sh -p ext-dev run + +# Run with a specific command +./docker/container.sh -p ext-dev run python scripts/rsl_rl/train.py --task YourTask + +# Run in rootless mode (for systems without root Docker) +./docker/container.sh -p ext-dev-rootless run +``` + +### Production Container + +```bash +# Build and run production container +./docker/container.sh -p ext build +./docker/container.sh -p ext run +``` + +## Container Architecture + +### Container Types + +#### 1. Production Container (`Dockerfile.ext`) +- **Base Image**: `isaac-lab-base` +- **Purpose**: Cluster deployment and training +- **Features**: + - Minimal footprint for performance + - Includes RSL-RL + - No development tools or ROS2 +- **Use Cases**: Training jobs, performance testing, deployment + +#### 2. Development Container (`Dockerfile.ext-dev`) +- **Base Image**: `isaac-lab-ros2` +- **Purpose**: Local development with all features +- **Features**: + - ROS2 packages and integration + - Development tools (Claude Code, pytest, ruff, etc.) + - Pinocchio robotics library + - CUDA toolkit + - Git LFS support + - Dual-mode operation (root/rootless) +- **Use Cases**: Development, debugging, ROS2 integration, testing + +### Permission System + +The development container supports flexible permission management through two modes: + +#### Root Mode (Default) +- Traditional Docker behavior with user switching +- Preserves host user permissions +- Files created match host user ownership +- Uses `gosu` for seamless user switching + +#### Rootless Mode +- Everyone runs as root inside container +- Simplified permission model +- Suitable for Docker installations without root access +- Ideal for student PCs or restricted environments + +#### Quick Permission Reference + +| Use Case | Command | Result | +|----------|---------|--------| +| Personal dev machine | `./docker/container.sh -p ext-dev run` | Files owned by your user | +| Student PC (no root) | `./docker/container.sh -p ext-dev-rootless run` | Run as root inside container | +| Shared server | `FIX_PERMISSIONS=true ./docker/container.sh -p ext-dev run` | Auto-fix ownership on exit | + +#### Environment Variables + +- `DOCKER_ROOTLESS_MODE`: Enable rootless mode (true/false) +- `FIX_PERMISSIONS`: Auto-fix file permissions on exit (true/false) +- `LOCAL_UID`: Override user ID (default: current user) +- `LOCAL_GID`: Override group ID (default: current group) + +## Building Containers + +### Step 1: Obtain Base Images + +```bash +# Option 1: Pull from DockerHub (recommended) +docker pull jmanan/isaac-lab-base:latest +docker tag jmanan/isaac-lab-base:latest isaac-lab-base + +docker pull jmanan/isaac-lab-ros2:latest +docker tag jmanan/isaac-lab-ros2:latest isaac-lab-ros2 + +# Option 2: Build locally (if you have IsaacLab source) +cd /path/to/isaaclab +./docker/container.py build isaac-lab-base +./docker/container.py build isaac-lab-ros2 +``` + +### Step 2: Build Extension Containers + +```bash +# Build production container +./docker/container.sh -p ext build + +# Build development container +./docker/container.sh -p ext-dev build +``` + +## Running Containers + +### Development Workflow + +```bash +# Standard development mode +./docker/container.sh -p ext-dev run + +# Rootless mode for restricted systems +./docker/container.sh -p ext-dev-rootless run + +# Run specific command +./docker/container.sh -p ext-dev run python scripts/rsl_rl/train.py --task YourTask + +# Attach to running container +./docker/container.sh -p ext-dev attach + +# Execute command in running container +./docker/container.sh -p ext-dev exec nvidia-smi +``` + +### Production Usage + +```bash +# Run production container +./docker/container.sh -p ext run + +# Training example +./docker/container.sh -p ext run python scripts/rsl_rl/train.py \ + --task YourTask --num_envs 1024 +``` + +### Claude Code AI Assistant + +The development container includes Claude Code for AI-assisted development: + +```bash +# Inside the container +claude +``` + +## Mount Configuration System + +The unified mount system allows optional mounting of external IsaacLab and RSL-RL codebases in both Docker and Singularity environments. + +### Quick Setup + +```bash +# Interactive setup +./docker/container.sh mount-setup + +# This will: +# 1. Create .mount.config with your preferences +# 2. Validate paths +# 3. Generate docker-compose.override.yaml +``` + +### Mount Management + +```bash +# Show current configuration +./docker/container.sh mount-show + +# Enable/disable mounts +./docker/container.sh mount-enable isaaclab +./docker/container.sh mount-disable rsl_rl + +# Set mount paths +./docker/container.sh mount-set isaaclab ~/my-isaaclab +./docker/container.sh mount-set rsl_rl ~/my-rsl-rl + +# Validate configuration +./docker/container.sh mount-validate +``` + +### Technical Details + +#### Configuration File (.mount.config) + +```json +{ + "mounts": { + "isaaclab": { + "enabled": false, + "local_path": "/path/to/isaaclab", + "container_path": "/workspace/isaaclab", + "mount_type": "source", // Mounts only source/ subdirectory + "description": "External IsaacLab installation" + }, + "rsl_rl": { + "enabled": false, + "local_path": "/path/to/rsl_rl", + "container_path": "/workspace/isaaclab/_isaac_sim/.../rsl_rl", + "mount_type": "full", // Completely overrides built-in + "description": "External RSL-RL installation" + } + } +} +``` + +#### How It Works + +1. **Configuration**: User preferences stored in `.mount.config` +2. **Docker**: Generates `docker-compose.override.yaml` with bind mounts +3. **Singularity**: Reads config and adds `-B` bind flags +4. **Validation**: Paths validated before container startup + +The system uses Docker Compose's override mechanism, automatically merging: +- Base configuration: `docker-compose.yaml` +- User mounts: `docker-compose.override.yaml` (auto-generated) + +## Cluster Deployment + +For detailed cluster operations, see [docker/cluster/README.md](cluster/README.md). + +### Quick Cluster Workflow + +```bash +# 1. Configure mounts (optional) +./docker/container.sh mount-setup + +# 2. Push container to cluster +cd docker/cluster +./cluster_interface.sh push ext_template + +# 3. Submit job +./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 + +# 4. Sync logs back +./sync_experiments.sh --remove ~/experiments/logs +``` + +## Troubleshooting + +### Permission Issues + +```bash +# Enable automatic permission fixing +export FIX_PERMISSIONS=true +./docker/container.sh -p ext-dev run + +# Use rootless mode +./docker/container.sh -p ext-dev-rootless run + +# Manually fix permissions +sudo chown -R $(id -u):$(id -g) /workspace/ext_template +``` + +### Build Issues + +```bash +# Clean rebuild +docker compose build --no-cache isaac-lab-ext-dev + +# Remove old images +docker image prune -f + +# Check logs +./docker/container.sh -p ext-dev logs +``` + +### Mount Issues + +```bash +# Validate mounts +./docker/container.sh mount-validate + +# Regenerate override file +./docker/container.sh -r -p ext-dev run + +# Check mounted paths inside container +./docker/container.sh -p ext-dev exec ls -la /workspace/isaaclab/source +``` + +### GPU Access + +```bash +# Verify host GPU +nvidia-smi + +# Check container GPU access +./docker/container.sh -p ext-dev exec nvidia-smi + +# For rootless mode +export NVIDIA_DRIVER_CAPABILITIES=all +./docker/container.sh -p ext-dev-rootless run +``` + +## Migration Guide + +### From Simple Docker Setup + +If using the previous simple Docker setup: +- The single Dockerfile has been replaced with two specialized containers +- Use `ext` for production/training +- Use `ext-dev` for development + +### From Environment Variables + +Old approach with environment variables: +```bash +EXTERNAL_ISAACLAB_PATH=/path/to/isaaclab # No longer used +EXTERNAL_RSL_RL_PATH=/path/to/rsl_rl # No longer used +``` + +New approach: +```bash +./docker/container.sh mount-setup # Interactive configuration +# or +./docker/container.sh mount-set isaaclab /path/to/isaaclab +``` + +## Best Practices + +1. **Container Selection**: + - Use production container for cluster training and deployment + - Use development container for local development and debugging + +2. **Permission Management**: + - Enable `FIX_PERMISSIONS` when working with mounted volumes + - Use rootless mode on systems without root Docker access + +3. **Mount Configuration**: + - Prefer built-in IsaacLab/RSL-RL for stability + - Test mounts locally before cluster deployment + - Don't commit `.mount.config` or `docker-compose.override.yaml` + +4. **Performance**: + - Use production container for training to minimize overhead + - Limit mounted volumes to necessary paths only \ No newline at end of file diff --git a/docker/cluster/README.md b/docker/cluster/README.md new file mode 100644 index 00000000..5251c2cd --- /dev/null +++ b/docker/cluster/README.md @@ -0,0 +1,268 @@ +# Cluster Operations Guide + +This guide focuses on cluster-specific operations for deploying IsaacLab extension training jobs. For general Docker setup and local development, see the main [Docker README](../README.md). + +## Prerequisites + +1. **SSH access** to your cluster +2. **Docker and Apptainer** installed locally +3. **Environment file**: Copy and configure the cluster environment: + ```bash + cp .env.cluster.template .env.cluster + ``` + Edit `.env.cluster` with your cluster-specific settings: + - `CLUSTER_USER`: Your cluster username + - `CLUSTER_LOGIN`: SSH login string (e.g., `username@euler.ethz.ch`) + - `CLUSTER_ISAACLAB_DIR`: Base directory for experiments on cluster + - `CLUSTER_SIF_PATH`: Directory for Singularity images + - `CLUSTER_JOB_SCHEDULER`: SLURM or PBS + - `CLUSTER_PYTHON_EXECUTABLE`: Script to run (e.g., `scripts/rsl_rl/train.py`) + +## Quick Start + +```bash +# 1. Build container locally (if not already done) +cd ../ +./container.sh -p ext build + +# 2. Push container to cluster +cd cluster +./cluster_interface.sh push ext_template + +# 3. Submit training job +./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 + +# 4. Sync logs back to local machine +./sync_experiments.sh --remove ~/experiments/logs +``` + +## Detailed Operations + +### Pushing Container to Cluster + +The push operation converts your Docker image to Singularity format and uploads it: + +```bash +./cluster_interface.sh push + +# Example +./cluster_interface.sh push ext_template +``` + +**Note**: The image must be named `isaac-lab-` (e.g., `isaac-lab-ext_template`). + +### Submitting Jobs + +Submit jobs with custom arguments: + +```bash +./cluster_interface.sh job [arguments] + +# Examples +./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 +./cluster_interface.sh job ext_template --task YourOtherTask --headless + +# Check for large files after sync (adds validation step) +./cluster_interface.sh -c job ext_template --task YourTask +``` + +### External Codebase Mounting + +The cluster system supports the unified mount configuration: + +```bash +# Configure mounts locally before pushing +cd ../ +./container.sh mount-setup + +# Mounts are automatically synced with the container +cd cluster +./cluster_interface.sh push ext_template +``` + +#### Mount Modes + +1. **Sync Mode** (default): Syncs codebase from local to cluster +2. **Mount-Only Mode**: Uses existing codebase on cluster without syncing + +```bash +# Configure mount-only mode +cd ../ +./container.sh mount-enable isaaclab +./container.sh mount-set-sync isaaclab off +./container.sh mount-set-cluster isaaclab /cluster/home/$USER/isaaclab +``` + +### Synchronizing Logs + +Sync experiment logs from cluster to local machine: + +```bash +# Basic sync +./sync_experiments.sh + +# Sync to specific folder +./sync_experiments.sh ~/my-experiments + +# Sync and remove remote logs +./sync_experiments.sh --remove ~/experiments/logs +``` + +## Job Management + +### Check Job Status + +```bash +# SLURM +ssh $CLUSTER_LOGIN "squeue -u $USER" + +# PBS +ssh $CLUSTER_LOGIN "qstat -u $USER" +``` + +### Cancel Jobs + +```bash +# SLURM +ssh $CLUSTER_LOGIN "scancel " + +# PBS +ssh $CLUSTER_LOGIN "qdel " +``` + +### View Job Output + +Job outputs are stored in the exports directory: +```bash +ssh $CLUSTER_LOGIN "ls -la $CLUSTER_ISAACLAB_DIR/exports/" +ssh $CLUSTER_LOGIN "tail -f $CLUSTER_ISAACLAB_DIR/exports/" +``` + +## Environment Variables + +Key variables in `.env.cluster`: + +| Variable | Description | Example | +|----------|-------------|---------| +| `CLUSTER_USER` | Your cluster username | `jsmith` | +| `CLUSTER_LOGIN` | SSH login string | `jsmith@euler.ethz.ch` | +| `CLUSTER_ISAACLAB_DIR` | Experiment directory | `/cluster/scratch/$USER/isaaclab` | +| `CLUSTER_SIF_PATH` | Singularity images | `/cluster/home/$USER/.singularity` | +| `CLUSTER_JOB_SCHEDULER` | Job system | `SLURM` or `PBS` | +| `CLUSTER_PYTHON_EXECUTABLE` | Script to run | `scripts/rsl_rl/train.py` | +| `CLUSTER_ISAAC_SIM_CACHE_DIR` | Isaac Sim cache | `/cluster/scratch/$USER/isaac-sim-cache` | +| `REMOVE_CODE_COPY_AFTER_JOB` | Cleanup after job | `true` or `false` | + +## Customizing Job Submission + +### Resource Requirements + +Edit `submit_job_slurm.sh` or `submit_job_pbs.sh` to modify: + +For SLURM: +```bash +#SBATCH -n 1 # Number of tasks +#SBATCH --cpus-per-task=4 # CPUs per task +#SBATCH --gpus=rtx_3090:1 # GPU type and count +#SBATCH --time=03:00:00 # Maximum runtime +#SBATCH --mem-per-cpu=4048 # Memory per CPU (MB) +``` + +For PBS: +```bash +#PBS -l select=1:ncpus=8:mpiprocs=1:ngpus=1 +#PBS -l walltime=01:00:00 +``` + +### Module Loading + +Add any required cluster modules in the submission scripts: +```bash +module load eth_proxy # Example for ETH clusters +module load cuda/11.8 # Load specific CUDA version +``` + +## Troubleshooting + +### Container Push Fails + +```bash +# Check Docker image exists +docker images | grep isaac-lab-ext_template + +# Verify SSH connection +ssh $CLUSTER_LOGIN "echo 'Connection successful'" + +# Check available space +ssh $CLUSTER_LOGIN "df -h $CLUSTER_SIF_PATH" +``` + +### Job Submission Issues + +```bash +# Verify Singularity image on cluster +ssh $CLUSTER_LOGIN "ls -la $CLUSTER_SIF_PATH/*.tar" + +# Check job script was created +ssh $CLUSTER_LOGIN "ls -la $CLUSTER_ISAACLAB_DIR/*.sh" + +# View error logs +ssh $CLUSTER_LOGIN "cat $CLUSTER_ISAACLAB_DIR/exports/*.err" +``` + +### Performance Tips + +1. **Use appropriate `--num_envs`**: Balance between GPU memory and parallelism +2. **Enable headless mode**: Add `--headless` for better performance +3. **Monitor GPU usage**: Check with `nvidia-smi` during training +4. **Use local scratch**: Configure `CLUSTER_ISAAC_SIM_CACHE_DIR` to use fast local storage + +## Advanced Usage + +### Custom Job Scripts + +For complex workflows, create custom submission scripts: + +```bash +# Copy and modify submission scripts +cp submit_job_slurm.sh submit_job_custom.sh +# Edit resource requirements, add pre/post processing, etc. +``` + +### Multi-GPU Training + +Configure multi-GPU jobs in the submission scripts: +- SLURM: Modify `#SBATCH --gres=gpu:X` +- PBS: Modify `#PBS -l select=1:ncpus=X:ngpus=Y` + +### Batch Job Submission + +Submit multiple experiments: + +```bash +for task in Task1 Task2 Task3; do + ./cluster_interface.sh job ext_template --task $task --num_envs 32000 +done +``` + +### Environment-Specific Settings + +Override cluster settings for specific runs: + +```bash +# Temporary override +CLUSTER_PYTHON_EXECUTABLE=scripts/custom_script.py \ + ./cluster_interface.sh job ext_template + +# Different cluster configuration +ENV_CLUSTER_PATH=.env.cluster.gpu2 \ + ./cluster_interface.sh push ext_template +``` + +## Best Practices + +1. **Test Locally First**: Verify your code works in the Docker container before cluster submission +2. **Start Small**: Test with fewer environments before scaling up +3. **Monitor Resources**: Check cluster quotas and job limits +4. **Use Checkpointing**: Save models periodically for long-running jobs +5. **Clean Up**: Remove old experiments to save cluster storage \ No newline at end of file diff --git a/docker/cluster/sync_experiments.sh b/docker/cluster/sync_experiments.sh new file mode 100644 index 00000000..5caab471 --- /dev/null +++ b/docker/cluster/sync_experiments.sh @@ -0,0 +1,144 @@ +#!/bin/bash +# sync_logs.sh: Synchronize logs from remote cluster experiments. +# Usage: ./sync_logs.sh [--remove] [local_log_folder] +# --remove Remove remote experiment directories after sync. +# local_log_folder Destination folder for logs (default: ./logs) + +set -e + +usage() { + echo "Usage: $0 [--remove] [local_log_folder]" + echo " --remove Remove remote experiment directories after sync." + echo " local_log_folder Destination folder for logs (default: ./logs)" + exit 1 +} + +# Process optional flags +REMOVE_REMOTE=false +while [[ $# -gt 0 ]]; do + case "$1" in + --remove) + REMOVE_REMOTE=true + shift + ;; + -h|--help) + usage + ;; + *) + break + ;; + esac +done + +# Local destination folder (default to ./logs if not provided) +LOCAL_DEST=${1:-./logs} +mkdir -p "$LOCAL_DEST" + +# Load .env.cluster (assumed to be in the same directory as this script) +SCRIPT_DIR=$(dirname "$0") +if [ -f "$SCRIPT_DIR/.env.cluster" ]; then + source "$SCRIPT_DIR/.env.cluster" +else + echo "Error: .env.cluster not found in $SCRIPT_DIR" + exit 1 +fi + +# Check required environment variables are set +if [ -z "$CLUSTER_ISAACLAB_DIR" ] || [ -z "$EXTENSION_NAME" ] || [ -z "$CLUSTER_LOGIN" ]; then + echo "Error: Missing required environment variables in .env.cluster." + exit 1 +fi + +# Determine the remote base directory. +# We assume that your timestamped experiment folders (e.g. ext_template_20250214_1500) +# are created in the parent directory of CLUSTER_ISAACLAB_DIR. +REMOTE_BASE=$(dirname "$CLUSTER_ISAACLAB_DIR") + +# Define search patterns for experiment directories +echo "Searching for remote experiment directories in ${REMOTE_BASE} matching ${EXTENSION_NAME} or ${EXTENSION_NAME}_* ..." + +# List matching directories on the remote cluster with logs folders in one go +# We search for both exact name (e.g., ext_template) and name with a suffix (e.g., ext_template_20240101_1200) +echo "Scanning for experiment directories with logs folders..." +REMOTE_DIRS_WITH_LOGS=$(ssh "$CLUSTER_LOGIN" " + for dir in ${REMOTE_BASE}/${EXTENSION_NAME} ${REMOTE_BASE}/${EXTENSION_NAME}_*; do + if [ -d \"\$dir\" ] && [ -d \"\$dir/logs\" ]; then + echo \"\$dir\" + fi + done +" 2>/dev/null || true) + +# Also get directories without logs for reporting +REMOTE_DIRS_NO_LOGS=$(ssh "$CLUSTER_LOGIN" " + for dir in ${REMOTE_BASE}/${EXTENSION_NAME} ${REMOTE_BASE}/${EXTENSION_NAME}_*; do + if [ -d \"\$dir\" ] && [ ! -d \"\$dir/logs\" ]; then + echo \"\$dir\" + fi + done +" 2>/dev/null || true) + +if [ -z "$REMOTE_DIRS_WITH_LOGS" ] && [ -z "$REMOTE_DIRS_NO_LOGS" ]; then + echo "No remote experiment directories found matching patterns ${EXTENSION_NAME} or ${EXTENSION_NAME}_* in ${REMOTE_BASE}" + exit 0 +fi + +# Report directories without logs upfront +if [ -n "$REMOTE_DIRS_NO_LOGS" ]; then + echo "Found experiment directories without logs folders:" + for dir in $REMOTE_DIRS_NO_LOGS; do + echo " - $(basename "$dir") (no logs folder)" + done +fi + +if [ -z "$REMOTE_DIRS_WITH_LOGS" ]; then + echo "No experiment directories with logs folders found." + exit 0 +fi + +echo "Found $(echo "$REMOTE_DIRS_WITH_LOGS" | wc -l) experiment directories with logs folders." + +# Loop over each directory with logs +for remote_dir in $REMOTE_DIRS_WITH_LOGS; do + BASENAME=$(basename "$remote_dir") + REMOTE_LOG_DIR="${remote_dir}/logs/" + LOCAL_SUBDIR="${LOCAL_DEST}/${BASENAME}" + + echo "" + echo "=== Processing experiment: ${BASENAME} ===" + + # Calculate transfer size + echo -n "Calculating transfer size... " + transfer_info=$(ssh "$CLUSTER_LOGIN" "du -sh ${REMOTE_LOG_DIR} 2>/dev/null" | awk '{print $1}' || echo "unknown") + echo "done (${transfer_info})" + + # Ensure local sub-directory exists + mkdir -p "$LOCAL_SUBDIR" + + # Synchronize the log folder with minimal output + echo "Syncing logs to ${LOCAL_SUBDIR}/" + rsync -avz --info=progress2 --no-inc-recursive "$CLUSTER_LOGIN:${REMOTE_LOG_DIR}" "$LOCAL_SUBDIR/" | \ + while IFS= read -r line; do + if [[ "$line" =~ ^[[:space:]]*[0-9,]+[[:space:]]+[0-9]+%[[:space:]]+[0-9.]+[A-Za-z]+/s[[:space:]]+[0-9:]+[[:space:]]*$ ]]; then + # Progress line - show with carriage return for real-time update + echo -ne "\r Progress: $line" + elif [[ "$line" =~ sent.*received.*bytes ]]; then + # Final summary + echo -e "\n Transfer complete: $line" + fi + done + RSYNC_STATUS=${PIPESTATUS[0]} + + if [ $RSYNC_STATUS -eq 0 ]; then + echo "βœ“ Successfully synced logs for ${BASENAME}" + if [ "$REMOVE_REMOTE" = true ]; then + echo " Removing remote directory: $remote_dir" + ssh "$CLUSTER_LOGIN" "rm -rf ${remote_dir}" + echo " βœ“ Remote directory removed" + fi + else + echo "βœ— Error during rsync for ${BASENAME}. Exit code: $RSYNC_STATUS" + echo " Logs for ${BASENAME} may be incomplete or missing." + fi +done + +echo "All experiments have been processed." \ No newline at end of file From 72c80036b4e12bf4cb36cdce67139a7bb2c5d5b0 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Mon, 2 Jun 2025 14:50:52 +0200 Subject: [PATCH 03/10] fix: Apply rootless Docker permission fixes from moleworks_ext - Remove extends directive from isaac-lab-ext-dev-rootless service - Add full service definition with privileged: false for rootless mode - Include all necessary service properties (network_mode, ipc, deploy, etc.) - This ensures proper permissions without requiring privileged access --- docker/docker-compose.yaml | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index e2abab60..ad15858e 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -143,10 +143,19 @@ services: tty: true # Rootless variant of the development container - # Uses the same image but with different environment and volume configuration isaac-lab-ext-dev-rootless: - extends: - service: isaac-lab-ext-dev + env_file: + - .env.ext_template-dev + build: + context: ../ + dockerfile: docker/Dockerfile.ext-dev + args: + EXTENSION_NAME_ARG: ${EXTENSION_NAME} + EXT_PATH_ARG: ${EXT_PATH} + DOCKER_EXT_PATH_ARG: ${DOCKER_EXT_PATH} + DOCKER_USER_NAME_ARG: ${DOCKER_USER_NAME} + DOCKER_USER_HOME_ARG: ${DOCKER_USER_HOME} + image: isaac-lab-${EXTENSION_NAME}-dev container_name: isaac-lab-${EXTENSION_NAME}-dev-rootless volumes: - <<: *default-isaac-lab-volumes @@ -161,7 +170,6 @@ services: - ${SSH_AUTH_SOCK}:/ssh-agent - /lib/modules:/lib/modules - /etc/localtime:/etc/localtime:ro - # Reduced host system access for rootless mode environment: <<: *default-isaac-lab-environment DISPLAY: ${DISPLAY} @@ -171,11 +179,16 @@ services: NVIDIA_DRIVER_CAPABILITIES: all WANDB_API_KEY: ${WANDB_API_KEY} WANDB_USERNAME: ${WANDB_USERNAME} - # Force rootless mode DOCKER_ROOTLESS_MODE: "true" - # Override home to /root for rootless DOCKER_USER_HOME: /root HOME: /root + privileged: false + network_mode: host + ipc: host + deploy: *default-isaac-lab-deploy + shm_size: '2gb' + stdin_open: true + tty: true volumes: isaac-cache-kit: From a5e3832b44a0e7682ce7ca9be73519b3cb952707 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Mon, 2 Jun 2025 17:15:34 +0200 Subject: [PATCH 04/10] Improve Docker container UX and Python environment setup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Update Python aliases to use isaaclab.sh -p wrapper for proper environment setup - Enhance container UX with 🐳 DOCKER prompt and welcome banner - Fix bashrc sourcing to ensure proper shell environment loading - Support Docker prompt indicator when home directory is mounted - Add explicit entrypoint declarations in docker-compose services --- docker/Dockerfile.ext | 23 ++----- docker/Dockerfile.ext-dev | 129 ++++------------------------------- docker/bashrc | 14 ++-- docker/docker-compose.yaml | 2 + docker/dynamic_entrypoint.sh | 17 +++-- 5 files changed, 42 insertions(+), 143 deletions(-) diff --git a/docker/Dockerfile.ext b/docker/Dockerfile.ext index a0233e49..9a298794 100644 --- a/docker/Dockerfile.ext +++ b/docker/Dockerfile.ext @@ -45,24 +45,11 @@ RUN git clone https://github.com/leggedrobotics/rsl_rl.git /tmp/rsl_rl && \ # (Do not remove the DOCKER_EXT_PATH folder; keep it for the bind mount) -# System packages installation +# Minimal system packages for container functionality RUN apt-get update && \ apt-get install -y --no-install-recommends \ - figlet \ - gosu \ - apt-utils \ - libeigen3-dev \ - locate \ - wget \ - pkg-config \ - dialog \ - tasksel \ - curl \ - python3-pip \ - rsync - -# Python packages installation -RUN ${ISAACLAB_PATH}/_isaac_sim/python.sh -m pip install warp-lang ruamel.yaml + gosu && \ + rm -rf /var/lib/apt/lists/* #== # Environment @@ -73,8 +60,8 @@ COPY docker/entrypoint.sh /entrypoint.sh RUN chmod +x /entrypoint.sh # Set up Python alias -RUN echo "alias python3='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc && \ - echo "alias python='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc +RUN echo "alias python3='${ISAACLAB_PATH}/isaaclab.sh -p'" >> /home/bash.bashrc && \ + echo "alias python='${ISAACLAB_PATH}/isaaclab.sh -p'" >> /home/bash.bashrc # Set working directory WORKDIR ${DOCKER_USER_HOME} diff --git a/docker/Dockerfile.ext-dev b/docker/Dockerfile.ext-dev index 96a929bd..d9d661a8 100644 --- a/docker/Dockerfile.ext-dev +++ b/docker/Dockerfile.ext-dev @@ -24,35 +24,15 @@ ENV EXT_PATH=${EXT_PATH_ARG} \ RUN useradd -d ${DOCKER_USER_HOME} -s /bin/bash ${DOCKER_USER_NAME} || true # ========================= -# Install System Dependencies +# Install Minimal System Dependencies # ========================= RUN apt-get update && \ apt-get install -y --no-install-recommends \ - figlet \ gosu \ - apt-utils \ - libeigen3-dev \ - locate \ - wget \ - pkg-config \ - dialog \ - tasksel \ - curl \ - python3-pip \ git \ git-lfs \ - gnupg2 \ - tmux \ - libopen3d-dev \ - software-properties-common \ - rsync \ - python3-colcon-common-extensions \ - python3-colcon-mixin && \ - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.0-1_all.deb && \ - dpkg -i cuda-keyring_1.0-1_all.deb && \ - rm -f cuda-keyring_1.0-1_all.deb && \ - apt-get update && \ - apt-get install -y --no-install-recommends cuda-toolkit-11-8 && \ + curl \ + python3-colcon-common-extensions && \ rm -rf /var/lib/apt/lists/* # ========================= @@ -60,33 +40,6 @@ RUN apt-get update && \ # ========================= RUN git lfs install -# ========================= -# Remove Conflicting Eigen Packages -# ========================= -RUN apt-get remove -y ros-humble-eigen3-cmake-module && \ - apt-get update && \ - apt-get install -y --no-install-recommends libeigen3-dev - -# ========================= -# Install Pinocchio via Robotpkg -# ========================= -RUN apt-get update && \ - apt-get install -y --no-install-recommends lsb-release curl && \ - mkdir -p /etc/apt/keyrings && \ - curl http://robotpkg.openrobots.org/packages/debian/robotpkg.asc \ - | tee /etc/apt/keyrings/robotpkg.asc && \ - echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/robotpkg.asc] http://robotpkg.openrobots.org/packages/debian/pub $(lsb_release -cs) robotpkg" \ - | tee /etc/apt/sources.list.d/robotpkg.list && \ - apt-get update && \ - apt-get install -y --no-install-recommends robotpkg-py3*-pinocchio -RUN rm -f /etc/apt/sources.list.d/robotpkg.list - -# ========================= -# Set CUDA Environment Variables -# ========================= -ENV PATH=/usr/local/cuda-11.8/bin${PATH:+:${PATH}} \ - LD_LIBRARY_PATH=/usr/local/cuda-11.8/lib64${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}} - # ========================= # Install Colcon Mixin # ========================= @@ -94,75 +47,23 @@ RUN colcon mixin add default https://raw.githubusercontent.com/colcon/colcon-mix colcon mixin update default # ========================= -# Install Python Dependencies +# Install Minimal Python Dependencies # ========================= -RUN python3 -m pip install --no-cache-dir \ - simple-parsing \ - cupy-cuda11x \ - scipy \ - shapely \ - ros2-numpy \ - panda3d_viewer \ - ruamel.yaml \ - --upgrade transforms3d \ - torch +# Users should add their own dependencies as needed # ========================= -# Install Development Tools +# Install Basic Development Tools # ========================= RUN ${ISAACLAB_PATH}/_isaac_sim/python.sh -m pip install --no-cache-dir \ - ruff \ - black \ - mypy \ - bandit \ - vulture \ - pre-commit \ pytest \ - pytest-cov \ - warp-lang + ruff -# ========================= -# Install Node.js and Claude Code -# ========================= -RUN curl -fsSL https://deb.nodesource.com/setup_20.x | bash - && \ - apt-get install -y nodejs && \ - npm config set registry https://registry.npmjs.org/ && \ - npm install -g @anthropic-ai/claude-code && \ - rm -rf /var/lib/apt/lists/* # ========================= -# Install ROS2 Packages -# ========================= -RUN rm -f /etc/apt/sources.list.d/deb.sury.org.list && \ - apt-get update && \ - apt-get install -y --allow-downgrades libbrotli1=1.0.9-2build6 && \ - apt-get install -y \ - libfontconfig1-dev libfreetype6-dev \ - ros-humble-xacro \ - ros-humble-vision-opencv \ - ros-humble-joint-state-publisher-gui \ - ros-humble-filters \ - ros-humble-nav2-msgs \ - ros-humble-tf-transformations \ - ros-humble-gazebo-ros-pkgs \ - ros-humble-gazebo-plugins \ - ros-humble-turtlebot3* \ - ros-humble-turtlebot3-simulations \ - ros-humble-octomap-msgs \ - ros-humble-octomap \ - ros-humble-octomap-rviz-plugins \ - ros-humble-octomap-server \ - ros-humble-tf2 \ - ros-humble-tf2-geometry-msgs \ - ros-humble-tf2-sensor-msgs \ - ros-humble-rqt-graph \ - ros-humble-pcl-ros \ - ros-humble-nav2-costmap-2d \ - ros-humble-robot-state-publisher \ - ros-humble-rviz-common \ - ros-humble-rviz2 \ - ros-humble-zenoh-cpp-vendor && \ - rm -rf /var/lib/apt/lists/* +# ROS2 Core Packages Only +# ========================= +# Users should install additional ROS2 packages as needed +# Example: apt-get install ros-humble-xacro ros-humble-tf2 # ========================= # Setup Extension Directories @@ -222,10 +123,10 @@ RUN chmod +x /entrypoint.sh # ========================= # Set Up Python Aliases # ========================= -RUN echo "alias python3='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc && \ - echo "alias python='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /home/bash.bashrc && \ - echo "alias python3='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /root/.bashrc && \ - echo "alias python='${ISAACLAB_PATH}/_isaac_sim/python.sh'" >> /root/.bashrc +RUN echo "alias python3='${ISAACLAB_PATH}/isaaclab.sh -p'" >> /home/bash.bashrc && \ + echo "alias python='${ISAACLAB_PATH}/isaaclab.sh -p'" >> /home/bash.bashrc && \ + echo "alias python3='${ISAACLAB_PATH}/isaaclab.sh -p'" >> /root/.bashrc && \ + echo "alias python='${ISAACLAB_PATH}/isaaclab.sh -p'" >> /root/.bashrc # ========================= # Final Configuration diff --git a/docker/bashrc b/docker/bashrc index e96ded13..35ff52a9 100644 --- a/docker/bashrc +++ b/docker/bashrc @@ -84,10 +84,16 @@ parse_git_branch() { } export PS1=$PS1"\[\e[91m\]\$(parse_git_branch)\[\e[00m\]$ " -export PS1="(D) "$PS1 - -# Claude Code available - use 'claude' command for AI coding assistance -echo "πŸ€– Claude Code is available! Type 'claude' to start AI coding assistance." +export PS1="\[\e[95m\]🐳 DOCKER\[\e[00m\] "$PS1 + +# Docker container welcome message +echo -e "\n\e[95m╔════════════════════════════════════════════════════════════╗\e[0m" +echo -e "\e[95mβ•‘ 🐳 You are now inside a Docker container! 🐳 β•‘\e[0m" +echo -e "\e[95mβ•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•\e[0m" +echo -e "\e[93mContainer: ext_template-dev\e[0m" +echo -e "\e[92mIsaacLab path: /workspace/isaaclab\e[0m" +echo -e "\e[92mExtension path: /workspace/ext_template\e[0m" +echo -e "\e[96mπŸ€– Claude Code is available! Type 'claude' for AI assistance.\e[0m\n" FILE_ROS2=/opt/ros/humble/setup.bash source $FILE_ROS2 diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index ad15858e..41b522c4 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -141,6 +141,7 @@ services: - ALL stdin_open: true tty: true + entrypoint: ["/entrypoint.sh"] # Rootless variant of the development container isaac-lab-ext-dev-rootless: @@ -189,6 +190,7 @@ services: shm_size: '2gb' stdin_open: true tty: true + entrypoint: ["/entrypoint.sh"] volumes: isaac-cache-kit: diff --git a/docker/dynamic_entrypoint.sh b/docker/dynamic_entrypoint.sh index f77b93ef..62422291 100755 --- a/docker/dynamic_entrypoint.sh +++ b/docker/dynamic_entrypoint.sh @@ -73,11 +73,10 @@ else fi chown $USER_NAME:$USER_NAME "$USER_HOME" - # Set up user's bashrc - if [ ! -f "$USER_HOME/.bashrc" ]; then - cp /home/bash.bashrc "$USER_HOME/.bashrc" - chown $USER_NAME:$USER_NAME "$USER_HOME/.bashrc" - fi + # Always copy the container's bashrc to ensure Docker prompt is available + # This is needed because the home directory might be mounted from host + cp /home/bash.bashrc "$USER_HOME/.bashrc_container" + chown $USER_NAME:$USER_NAME "$USER_HOME/.bashrc_container" echo "Setting up permissions in the background..." @@ -112,6 +111,10 @@ EOF trap /usr/local/bin/fix-permissions EXIT fi - # Execute as the user - exec gosu $USER_NAME bash --rcfile $USER_HOME/.bashrc + # Ensure PATH is set for the user + export PATH="/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin" + + # Execute as the user with the container's bashrc + # Use --rcfile to explicitly load the container's bashrc + exec gosu $USER_NAME /bin/bash --rcfile "$USER_HOME/.bashrc_container" fi \ No newline at end of file From 1c44bc81af7cbcbf99ebf3f72bb6436edb6dceaf Mon Sep 17 00:00:00 2001 From: Idate96 Date: Mon, 2 Jun 2025 17:19:35 +0200 Subject: [PATCH 05/10] Fix Docker entrypoint and remove figlet dependency - Use dynamic_entrypoint.sh for development containers - Remove figlet dependency that was causing startup errors - Ensure proper Docker visual indicators are displayed --- docker/docker-compose.yaml | 4 ++-- docker/dynamic_entrypoint.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index 41b522c4..a88e95a2 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -141,7 +141,7 @@ services: - ALL stdin_open: true tty: true - entrypoint: ["/entrypoint.sh"] + entrypoint: ["/dynamic_entrypoint.sh"] # Rootless variant of the development container isaac-lab-ext-dev-rootless: @@ -190,7 +190,7 @@ services: shm_size: '2gb' stdin_open: true tty: true - entrypoint: ["/entrypoint.sh"] + entrypoint: ["/dynamic_entrypoint.sh"] volumes: isaac-cache-kit: diff --git a/docker/dynamic_entrypoint.sh b/docker/dynamic_entrypoint.sh index 62422291..410cbac6 100755 --- a/docker/dynamic_entrypoint.sh +++ b/docker/dynamic_entrypoint.sh @@ -7,7 +7,7 @@ # Mode is determined by DOCKER_ROOTLESS_MODE environment variable # Print welcome message -figlet Isaac Lab Extension +echo "=== Isaac Lab Extension ===" # Always add root sudo permissions echo "root ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers From cd58fdd903b345d1cd97c00680e177f4444911e0 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Mon, 2 Jun 2025 17:24:56 +0200 Subject: [PATCH 06/10] Fix Docker entrypoint path in docker-compose.yaml - Correct entrypoint path from /dynamic_entrypoint.sh to /entrypoint.sh - Ensure containers start properly with Docker visual indicators --- docker/docker-compose.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/docker-compose.yaml b/docker/docker-compose.yaml index a88e95a2..41b522c4 100644 --- a/docker/docker-compose.yaml +++ b/docker/docker-compose.yaml @@ -141,7 +141,7 @@ services: - ALL stdin_open: true tty: true - entrypoint: ["/dynamic_entrypoint.sh"] + entrypoint: ["/entrypoint.sh"] # Rootless variant of the development container isaac-lab-ext-dev-rootless: @@ -190,7 +190,7 @@ services: shm_size: '2gb' stdin_open: true tty: true - entrypoint: ["/dynamic_entrypoint.sh"] + entrypoint: ["/entrypoint.sh"] volumes: isaac-cache-kit: From 4b9858d0b2c8039eb27634cd1fa0c872a9c3e531 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Tue, 3 Jun 2025 18:41:59 +0200 Subject: [PATCH 07/10] Apply cluster fixes from moleworks_ext - Fix cluster user variable expansion in mount paths - Fix script argument passing through SSH and job submission - Add proper error handling for mount configuration parsing - Add debug output for troubleshooting cluster jobs --- docker/cluster/cluster_interface.sh | 29 +++++++++++++++---- docker/cluster/run_singularity.sh | 43 ++++++++++++++++++++++++++++- docker/cluster/submit_job_pbs.sh | 40 +++++++++++++++++++-------- docker/cluster/submit_job_slurm.sh | 31 +++++++++++++++++---- 4 files changed, 119 insertions(+), 24 deletions(-) diff --git a/docker/cluster/cluster_interface.sh b/docker/cluster/cluster_interface.sh index cb677fc7..70531b81 100755 --- a/docker/cluster/cluster_interface.sh +++ b/docker/cluster/cluster_interface.sh @@ -96,13 +96,14 @@ sync_mount_config() { if [ -f "$mount_config_path" ]; then echo "[INFO] Found mount configuration file" - # Check if we need to sync any external codebases - python3 - < "$temp_script" << 'PYTHON_SCRIPT' import json import sys import os -config_path = "$mount_config_path" +config_path = sys.argv[1] with open(config_path, 'r') as f: config = json.load(f) @@ -114,12 +115,19 @@ for mount_name, mount_config in config.get('mounts', {}).items(): print(f"[INFO] Will sync {mount_name} from {local_path}") sys.exit(0 if need_sync else 1) -EOF +PYTHON_SCRIPT - if [ $? -eq 0 ]; then + # Execute the Python script + # Save the result before any other commands + set +e # Temporarily disable exit on error + python3 "$temp_script" "$mount_config_path" + sync_exit_code=$? + set -e # Re-enable exit on error + if [ $sync_exit_code -eq 0 ]; then # Sync external codebases python3 "$CODEBASE_PATH/docker/cluster/sync_mounts.py" "$mount_config_path" "$CLUSTER_LOGIN" fi + rm -f "$temp_script" else echo "[INFO] No mount configuration found. Run './container.sh mount-setup' to configure external mounts." fi @@ -143,7 +151,16 @@ submit_job() { esac # No mount args needed - mount config is handled by .mount.config file - ssh $CLUSTER_LOGIN "cd $CLUSTER_ISAACLAB_DIR && bash $CLUSTER_ISAACLAB_DIR/docker/cluster/$job_script_file \"$CLUSTER_ISAACLAB_DIR\" \"isaac-lab-$profile\" \"\" \"--\" ${@}" + # Build the command with properly quoted arguments + ssh_cmd="cd $CLUSTER_ISAACLAB_DIR && bash $CLUSTER_ISAACLAB_DIR/docker/cluster/$job_script_file \"$CLUSTER_ISAACLAB_DIR\" \"isaac-lab-$profile\" \"\" \"--\"" + + # Add each argument properly quoted + for arg in "$@"; do + ssh_cmd="$ssh_cmd \"$arg\"" + done + + # Execute the SSH command + ssh $CLUSTER_LOGIN "$ssh_cmd" } # Function to list all available profiles diff --git a/docker/cluster/run_singularity.sh b/docker/cluster/run_singularity.sh index f638b82b..9f57507c 100755 --- a/docker/cluster/run_singularity.sh +++ b/docker/cluster/run_singularity.sh @@ -79,6 +79,7 @@ done echo "[DEBUG] run_singularity.sh: Final MOUNT_ISAACLAB_PATH: $MOUNT_ISAACLAB_PATH" echo "[DEBUG] run_singularity.sh: Final MOUNT_RSL_RL_PATH: $MOUNT_RSL_RL_PATH" echo "[DEBUG] run_singularity.sh: Final SCRIPT_ARGS: ${SCRIPT_ARGS[*]}" +echo "[DEBUG] run_singularity.sh: Number of script args: ${#SCRIPT_ARGS[@]}" #== # Helper functions @@ -130,6 +131,7 @@ fi source "$ENV_CLUSTER_PATH" source "$ENV_BASE_PATH" +echo "[DEBUG] CLUSTER_USER: $CLUSTER_USER" echo "[DEBUG] CLUSTER_SIF_PATH: $CLUSTER_SIF_PATH" echo "[DEBUG] Container profile: $2" @@ -192,10 +194,20 @@ if [ -f "$MOUNT_CONFIG_FILE" ]; then ADDITIONAL_BINDS=$(python3 - < {mount_path}", file=sys.stderr) + container_path = mount_config['container_path'] # Handle special cases @@ -235,7 +255,15 @@ print(' '.join(binds)) EOF ) + # Check if Python script failed + if [ $? -ne 0 ]; then + echo "[ERROR] Failed to parse mount configuration" + exit 1 + fi + if [ -n "$ADDITIONAL_BINDS" ]; then + # Expand any remaining environment variables in the bind paths + ADDITIONAL_BINDS=$(echo "$ADDITIONAL_BINDS" | sed "s/\$CLUSTER_USER/$CLUSTER_USER/g") echo "[INFO] Adding mounts from unified config: $ADDITIONAL_BINDS" SINGULARITY_BINDS="$SINGULARITY_BINDS $ADDITIONAL_BINDS" fi @@ -275,11 +303,23 @@ fi # NOTE: ISAACLAB_PATH is normally set in `isaaclab.sh` but we directly call the isaac-sim python because we sync the entire # Isaac Lab directory to the compute node and remote the symbolic link to isaac-sim + +# Pass script args as environment variable to avoid expansion issues +SCRIPT_ARGS_STRING="${SCRIPT_ARGS[*]}" + +# Add debug output +echo "[DEBUG] SCRIPT_ARGS array has ${#SCRIPT_ARGS[@]} elements" +for i in "${!SCRIPT_ARGS[@]}"; do + echo "[DEBUG] SCRIPT_ARGS[$i] = '${SCRIPT_ARGS[$i]}'" +done +echo "[DEBUG] SCRIPT_ARGS_STRING = '$SCRIPT_ARGS_STRING'" + singularity exec \ $SINGULARITY_BINDS \ --nv --containall --writable-tmpfs \ --env "WANDB_MODE=${WANDB_MODE:-offline}" \ --env "ISAACLAB_PATH=/workspace/isaaclab" \ + --env "SCRIPT_ARGS_STRING=$SCRIPT_ARGS_STRING" \ $TMPDIR/$2.sif bash -c " # Source both bashrc files to get Python aliases source /etc/bash.bashrc 2>/dev/null || true @@ -327,7 +367,8 @@ singularity exec \ echo '[CONTAINER] Testing RSL-RL runners import:' python -c 'from rsl_rl.runners import OnPolicyRunner; print(\"OnPolicyRunner imported successfully\")' 2>/dev/null || echo 'Failed to import rsl_rl.runners' echo '[CONTAINER] Running main script: $CLUSTER_PYTHON_EXECUTABLE' - \$ISAACLAB_PATH/_isaac_sim/python.sh $CLUSTER_PYTHON_EXECUTABLE ${SCRIPT_ARGS[*]} + echo '[CONTAINER] Script arguments: \$SCRIPT_ARGS_STRING' + \$ISAACLAB_PATH/_isaac_sim/python.sh $CLUSTER_PYTHON_EXECUTABLE \$SCRIPT_ARGS_STRING " # copy resulting cache files back to host diff --git a/docker/cluster/submit_job_pbs.sh b/docker/cluster/submit_job_pbs.sh index bd9ba374..53e1b0b5 100755 --- a/docker/cluster/submit_job_pbs.sh +++ b/docker/cluster/submit_job_pbs.sh @@ -1,30 +1,48 @@ #!/usr/bin/env bash # in the case you need to load specific modules on the cluster, add them here -# e.g., `module load eth_proxy` +module load eth_proxy + +# Debug: Show what arguments this script received +echo "[submit_job_pbs.sh] Received arguments: $@" +echo "[submit_job_pbs.sh] Number of arguments: $#" + +# Parse arguments +dir="$1" +profile="$2" +# Skip args 3 and 4 (empty mount args and "--") +shift 4 +# Remaining args are the script arguments +script_args="$@" + +echo "[submit_job_pbs.sh] Directory: $dir" +echo "[submit_job_pbs.sh] Profile: $profile" +echo "[submit_job_pbs.sh] Script args: $script_args" # create job script with compute demands ### MODIFY HERE FOR YOUR JOB ### cat < job.sh #!/bin/bash -#PBS -l select=1:ncpus=8:mpiprocs=1:ngpus=1 -#PBS -l walltime=01:00:00 -#PBS -j oe +#PBS -l select=1:mem=100gb:ncpus=8:gpus=1 +#PBS -l walltime=08:00:00 #PBS -q gpu #PBS -N isaaclab-ext #PBS -m bea -M "user@mail" -# Variables passed from submit script -dir="$1" -profile="$2" -# Skip empty mount args and "--" delimiter -shift 4 -script_args="\$@" +# Variables embedded from submit script +dir="$dir" +profile="$profile" +script_args="$script_args" + +echo "[PBS JOB] Directory: \$dir" +echo "[PBS JOB] Profile: \$profile" +echo "[PBS JOB] Script arguments: \$script_args" # Mount configuration is now handled by .mount.config file bash "\$dir/docker/cluster/run_singularity.sh" "\$dir" "\$profile" "\$dir/docker/cluster/.env.cluster" "\$dir/docker/.env.ext_template" -- \$script_args EOT -qsub job.sh +# Submit the job +qsub < job.sh rm job.sh \ No newline at end of file diff --git a/docker/cluster/submit_job_slurm.sh b/docker/cluster/submit_job_slurm.sh index 3248c792..cc130a76 100755 --- a/docker/cluster/submit_job_slurm.sh +++ b/docker/cluster/submit_job_slurm.sh @@ -3,6 +3,22 @@ # in the case you need to load specific modules on the cluster, add them here module load eth_proxy +# Debug: Show what arguments this script received +echo "[submit_job_slurm.sh] Received arguments: $@" +echo "[submit_job_slurm.sh] Number of arguments: $#" + +# Parse arguments +dir="$1" +profile="$2" +# Skip args 3 and 4 (empty mount args and "--") +shift 4 +# Remaining args are the script arguments +script_args="$@" + +echo "[submit_job_slurm.sh] Directory: $dir" +echo "[submit_job_slurm.sh] Profile: $profile" +echo "[submit_job_slurm.sh] Script args: $script_args" + # create job script with compute demands ### MODIFY HERE FOR YOUR JOB ### cat < job.sh @@ -17,16 +33,19 @@ cat < job.sh #SBATCH --mail-user=name@mail #SBATCH --job-name="isaaclab-ext-$(date +"%Y-%m-%dT%H:%M")" -# Variables passed from submit script -dir="$1" -profile="$2" -# Skip empty mount args and "--" delimiter -shift 4 -script_args="\$@" +# Variables embedded from submit script +dir="$dir" +profile="$profile" +script_args="$script_args" + +echo "[SLURM JOB] Directory: \$dir" +echo "[SLURM JOB] Profile: \$profile" +echo "[SLURM JOB] Script arguments: \$script_args" # Mount configuration is now handled by .mount.config file bash "\$dir/docker/cluster/run_singularity.sh" "\$dir" "\$profile" "\$dir/docker/cluster/.env.cluster" "\$dir/docker/.env.ext_template" -- \$script_args EOT +# Submit the job sbatch < job.sh rm job.sh \ No newline at end of file From 856a65e3dcace937fa17e16b7b797d7ec836b945 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Tue, 3 Jun 2025 18:45:58 +0200 Subject: [PATCH 08/10] Improve Docker and cluster documentation - Add NVIDIA Container Toolkit prerequisite with installation link - Clarify timestamped directory creation for cluster jobs - Fix job output location references for SLURM/PBS - Add troubleshooting for mount path variable expansion - Add Docker/Apptainer version compatibility information - Improve GPU troubleshooting with toolkit verification step --- docker/README.md | 450 +++++++++++++++------------------------ docker/cluster/README.md | 30 ++- 2 files changed, 195 insertions(+), 285 deletions(-) diff --git a/docker/README.md b/docker/README.md index d3e64a23..16c38d1d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -4,384 +4,276 @@ This guide provides comprehensive documentation for building, running, and deplo ## Table of Contents -- [Overview](#overview) +- [Quick Start: Local Development](#quick-start-local-development) - [Prerequisites](#prerequisites) -- [Quick Start](#quick-start) -- [Container Architecture](#container-architecture) +- [Core Concepts](#core-concepts) - [Container Types](#container-types) - - [Permission System](#permission-system) -- [Building Containers](#building-containers) -- [Running Containers](#running-containers) - - [Development Workflow](#development-workflow) - - [Production Usage](#production-usage) -- [Mount Configuration System](#mount-configuration-system) - - [Quick Setup](#quick-setup) - - [Mount Management](#mount-management) - - [Technical Details](#technical-details) + - [User Permissions System](#user-permissions-system) +- [Usage Guide](#usage-guide) + - [The container.sh Script](#the-containersh-script) + - [Building the Containers](#building-the-containers) + - [Running Containers & Workflows](#running-containers--workflows) + - [Managing External Mounts](#managing-external-mounts) - [Cluster Deployment](#cluster-deployment) - [Troubleshooting](#troubleshooting) - [Migration Guide](#migration-guide) +- [Best Practices](#best-practices) -## Overview +## Quick Start: Local Development -The Docker setup provides two main containers: +This is the fastest way to get a development environment running. -1. **Production Container** (`isaac-lab-ext`) - Minimal container for training and cluster deployment -2. **Development Container** (`isaac-lab-ext-dev`) - Full-featured container with ROS2, development tools, and dual-mode support - -Both containers include: -- Isaac Lab and Isaac Sim integration -- RSL-RL for reinforcement learning -- Your extension package -- Optional mounting of external codebases - -## Prerequisites - -1. **Docker and Docker Compose** installed on your system -2. **NVIDIA GPU** with appropriate drivers -3. **Environment files** - Create by copying templates: +1. **Set up environment files:** ```bash + # Run this once to create your local .env files cp docker/.env.ext_template.template docker/.env.ext_template cp docker/.env.ext_template-dev.template docker/.env.ext_template-dev + # You can optionally edit these files with custom settings. ``` - Edit these files with your settings (they're git-ignored). -4. **IsaacLab base images** - Either pull from DockerHub or build locally (see [Building Containers](#building-containers)) -## Quick Start +2. **Run the development container:** + ```bash + # Start an interactive shell in the dev container + ./docker/run_dev.sh -### Using container.sh (Recommended) + # Or, run a specific command directly + ./docker/run_dev.sh python scripts/rsl_rl/train.py --task YourTask + ``` -```bash -# Build and run development container -./docker/container.sh -p ext-dev build -./docker/container.sh -p ext-dev run + > **Note:** For systems where you don't have root access for Docker (e.g., some university PCs), use the `--rootless` flag: + > ```bash + > ./docker/run_dev.sh --rootless + > ``` -# Run with a specific command -./docker/container.sh -p ext-dev run python scripts/rsl_rl/train.py --task YourTask +## Prerequisites -# Run in rootless mode (for systems without root Docker) -./docker/container.sh -p ext-dev-rootless run -``` +- [ ] NVIDIA GPU with current drivers installed +- [ ] Docker and Docker Compose installed +- [ ] NVIDIA Container Toolkit installed ([installation guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)) +- [ ] For local development, optionally symlink IsaacLab as `_isaaclab` in the repository root + +### Environment Files Setup -### Production Container +Before running any extension containers, you must create the environment files from their templates: ```bash -# Build and run production container -./docker/container.sh -p ext build -./docker/container.sh -p ext run +# Create the required .env files +cp docker/.env.ext_template.template docker/.env.ext_template +cp docker/.env.ext_template-dev.template docker/.env.ext_template-dev ``` -## Container Architecture +These files contain container-specific configuration and are required for the containers to start properly. You can edit them to customize settings if needed, but the defaults should work for most use cases. + +## Core Concepts ### Container Types -#### 1. Production Container (`Dockerfile.ext`) -- **Base Image**: `isaac-lab-base` -- **Purpose**: Cluster deployment and training -- **Features**: - - Minimal footprint for performance - - Includes RSL-RL - - No development tools or ROS2 -- **Use Cases**: Training jobs, performance testing, deployment - -#### 2. Development Container (`Dockerfile.ext-dev`) -- **Base Image**: `isaac-lab-ros2` -- **Purpose**: Local development with all features -- **Features**: - - ROS2 packages and integration - - Development tools (Claude Code, pytest, ruff, etc.) - - Pinocchio robotics library - - CUDA toolkit - - Git LFS support - - Dual-mode operation (root/rootless) -- **Use Cases**: Development, debugging, ROS2 integration, testing - -### Permission System - -The development container supports flexible permission management through two modes: - -#### Root Mode (Default) -- Traditional Docker behavior with user switching -- Preserves host user permissions -- Files created match host user ownership -- Uses `gosu` for seamless user switching - -#### Rootless Mode -- Everyone runs as root inside container -- Simplified permission model -- Suitable for Docker installations without root access -- Ideal for student PCs or restricted environments - -#### Quick Permission Reference - -| Use Case | Command | Result | -|----------|---------|--------| -| Personal dev machine | `./docker/container.sh -p ext-dev run` | Files owned by your user | -| Student PC (no root) | `./docker/container.sh -p ext-dev-rootless run` | Run as root inside container | -| Shared server | `FIX_PERMISSIONS=true ./docker/container.sh -p ext-dev run` | Auto-fix ownership on exit | - -#### Environment Variables - -- `DOCKER_ROOTLESS_MODE`: Enable rootless mode (true/false) -- `FIX_PERMISSIONS`: Auto-fix file permissions on exit (true/false) -- `LOCAL_UID`: Override user ID (default: current user) -- `LOCAL_GID`: Override group ID (default: current group) - -## Building Containers - -### Step 1: Obtain Base Images +Two primary containers are provided: -```bash -# Option 1: Pull from DockerHub (recommended) -docker pull jmanan/isaac-lab-base:latest -docker tag jmanan/isaac-lab-base:latest isaac-lab-base +- **Production (`isaac-lab-ext`)**: + - **Dockerfile:** `Dockerfile.ext` + - **Purpose:** A minimal, lightweight container designed for headless training and cluster deployment + - **Includes:** Isaac Lab, RSL-RL, and your extension + - **Excludes:** ROS2, GUI tools, and other development dependencies -docker pull jmanan/isaac-lab-ros2:latest -docker tag jmanan/isaac-lab-ros2:latest isaac-lab-ros2 +- **Development (`isaac-lab-ext-dev`)**: + - **Dockerfile:** `Dockerfile.ext-dev` + - **Purpose:** A full-featured environment for local development and debugging + - **Includes:** Everything in the production container, plus ROS2 Humble, development tools (like pytest, ruff), and GPU-accelerated libraries -# Option 2: Build locally (if you have IsaacLab source) -cd /path/to/isaaclab -./docker/container.py build isaac-lab-base -./docker/container.py build isaac-lab-ros2 -``` +### User Permissions System -### Step 2: Build Extension Containers +To prevent file ownership issues when mounting local directories, the development container synchronizes the container's user with your host's user ID (UID) and group ID (GID). -```bash -# Build production container -./docker/container.sh -p ext build +- **Default Mode (User-Preserving):** Files created inside the container are owned by you on the host. This is the recommended mode for personal machines. +- **Rootless Mode (`--rootless`):** Runs as root inside the container. This simplifies permissions and is ideal for environments where you cannot manage Docker users. -# Build development container -./docker/container.sh -p ext-dev build -``` +| Use Case | Command | Outcome | +|----------|---------|---------| +| Personal Dev Machine | `./docker/run_dev.sh` | Files created match your host user | +| Restricted PC (No Root) | `./docker/run_dev.sh --rootless` | Runs as root inside the container | +| Auto-Fix Permissions | `./docker/run_dev.sh --fix-perms` | chowns files to your user on exit | +| Shared Server | `./docker/run_dev.sh -u 2000 -g 2000` | Uses a custom UID/GID | -## Running Containers +## Usage Guide -### Development Workflow +### The container.sh Script -```bash -# Standard development mode -./docker/container.sh -p ext-dev run +The `./docker/container.sh` script is the main interface for managing containers. The `./docker/run_dev.sh` script is a convenient wrapper around it. -# Rootless mode for restricted systems -./docker/container.sh -p ext-dev-rootless run +**Basic Syntax:** `./docker/container.sh -p ` -# Run specific command -./docker/container.sh -p ext-dev run python scripts/rsl_rl/train.py --task YourTask +| Command | Description | Example | +|---------|-------------|---------| +| `build` | Builds the container image | `./docker/container.sh -p ext-dev build` | +| `run` | Runs the container | `./docker/container.sh -p ext-dev run` | +| `attach` | Attaches to a running container | `./docker/container.sh -p ext-dev attach` | +| `exec` | Executes a command in a running container | `./docker/container.sh -p ext-dev exec nvidia-smi` | +| `logs` | Shows container logs | `./docker/container.sh -p ext-dev logs` | +| `mount-setup` | Interactively configure mounts | `./docker/container.sh mount-setup` | -# Attach to running container -./docker/container.sh -p ext-dev attach +### Building the Containers -# Execute command in running container -./docker/container.sh -p ext-dev exec nvidia-smi -``` +First, ensure you have the base images from IsaacLab. If not, build them from the IsaacLab-Internal repository: -### Production Usage +#### Prerequisites: Building Base IsaacLab Containers -```bash -# Run production container -./docker/container.sh -p ext run +Before building extension containers, you need to build the base IsaacLab containers from the [IsaacLab-Internal repository](https://github.com/leggedrobotics/IsaacLab-Internal). -# Training example -./docker/container.sh -p ext run python scripts/rsl_rl/train.py \ - --task YourTask --num_envs 1024 -``` +1. **Clone and navigate to IsaacLab-Internal:** + ```bash + git clone https://github.com/leggedrobotics/IsaacLab-Internal + cd IsaacLab-Internal/docker + ``` -### Claude Code AI Assistant +2. **Build the base containers:** + ```bash + # Build the ROS2 base container + docker compose --env-file docker/.env.ros2 --file docker/docker-compose.yaml build isaac-lab-ros2 + + # Build and run the base container + docker compose --env-file docker/.env.base --file docker/docker-compose.yaml run isaac-lab-base + ``` + +#### Building Extension Containers -The development container includes Claude Code for AI-assisted development: +Once you have the base images, you can build the extension containers: ```bash -# Inside the container -claude +# Build the development container (most common) +./docker/container.sh -p ext-dev build + +# Build the production container +./docker/container.sh -p ext build ``` -## Mount Configuration System +### Running Containers & Workflows -The unified mount system allows optional mounting of external IsaacLab and RSL-RL codebases in both Docker and Singularity environments. +**Development:** +```bash +# Start an interactive shell +./docker/container.sh -p ext-dev run -### Quick Setup +# Run a specific training task +./docker/container.sh -p ext-dev run python scripts/rsl_rl/train.py --task=YourTask +``` +**Production:** ```bash -# Interactive setup -./docker/container.sh mount-setup - -# This will: -# 1. Create .mount.config with your preferences -# 2. Validate paths -# 3. Generate docker-compose.override.yaml +# Run a headless training job in the production container +./docker/container.sh -p ext run python scripts/rsl_rl/train.py \ + --task=YourTask --num_envs 1024 ``` -### Mount Management +### Managing External Mounts + +You can optionally mount external checkouts of isaaclab or rsl_rl to override the versions built into the container. This is useful for development. ```bash -# Show current configuration -./docker/container.sh mount-show +# Start the interactive setup wizard (recommended) +./docker/container.sh mount-setup -# Enable/disable mounts +# Manually enable/disable a mount ./docker/container.sh mount-enable isaaclab -./docker/container.sh mount-disable rsl_rl - -# Set mount paths -./docker/container.sh mount-set isaaclab ~/my-isaaclab -./docker/container.sh mount-set rsl_rl ~/my-rsl-rl +./docker/container.sh mount-disable isaaclab -# Validate configuration -./docker/container.sh mount-validate -``` +# Manually set a mount path +./docker/container.sh mount-set isaaclab ~/dev/my-isaaclab -### Technical Details - -#### Configuration File (.mount.config) - -```json -{ - "mounts": { - "isaaclab": { - "enabled": false, - "local_path": "/path/to/isaaclab", - "container_path": "/workspace/isaaclab", - "mount_type": "source", // Mounts only source/ subdirectory - "description": "External IsaacLab installation" - }, - "rsl_rl": { - "enabled": false, - "local_path": "/path/to/rsl_rl", - "container_path": "/workspace/isaaclab/_isaac_sim/.../rsl_rl", - "mount_type": "full", // Completely overrides built-in - "description": "External RSL-RL installation" - } - } -} +# Check your current mount configuration +./docker/container.sh mount-show ``` -#### How It Works - -1. **Configuration**: User preferences stored in `.mount.config` -2. **Docker**: Generates `docker-compose.override.yaml` with bind mounts -3. **Singularity**: Reads config and adds `-B` bind flags -4. **Validation**: Paths validated before container startup - -The system uses Docker Compose's override mechanism, automatically merging: -- Base configuration: `docker-compose.yaml` -- User mounts: `docker-compose.override.yaml` (auto-generated) +This system works by creating a `.mount.config` file and auto-generating a `docker-compose.override.yaml` which is git-ignored. ## Cluster Deployment -For detailed cluster operations, see [docker/cluster/README.md](cluster/README.md). - -### Quick Cluster Workflow +For detailed cluster operations, see `docker/cluster/README.md`. +**Quick Workflow:** ```bash -# 1. Configure mounts (optional) +# 1. (Optional) Configure mounts for cluster usage ./docker/container.sh mount-setup -# 2. Push container to cluster +# 2. Navigate to the cluster directory cd docker/cluster + +# 3. Push the container image to the cluster's registry ./cluster_interface.sh push ext_template -# 3. Submit job +# 4. Submit a job ./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 - -# 4. Sync logs back -./sync_experiments.sh --remove ~/experiments/logs ``` ## Troubleshooting -### Permission Issues +### Problem: Docker/Apptainer Version Compatibility -```bash -# Enable automatic permission fixing -export FIX_PERMISSIONS=true -./docker/container.sh -p ext-dev run +When pushing to cluster, you may see warnings about non-tested Docker/Apptainer versions. The tested combinations are: +- Docker 24.0.7 + Apptainer 1.2.5 +- Docker β‰₯ 27.0.0 + Apptainer β‰₯ 1.3.4 -# Use rootless mode -./docker/container.sh -p ext-dev-rootless run +If you encounter issues with other versions, consider updating to a tested combination. -# Manually fix permissions -sudo chown -R $(id -u):$(id -g) /workspace/ext_template -``` +### Problem: Permission Denied on Mounted Files -### Build Issues +**Cause:** Files created in the container are owned by root. +**Solution 1 (Auto-fix):** Use the `--fix-perms` flag to chown files on exit. ```bash -# Clean rebuild -docker compose build --no-cache isaac-lab-ext-dev - -# Remove old images -docker image prune -f - -# Check logs -./docker/container.sh -p ext-dev logs +./docker/run_dev.sh --fix-perms ``` -### Mount Issues - +**Solution 2 (Rootless):** Use rootless mode if you don't need user-preserving permissions. ```bash -# Validate mounts -./docker/container.sh mount-validate - -# Regenerate override file -./docker/container.sh -r -p ext-dev run - -# Check mounted paths inside container -./docker/container.sh -p ext-dev exec ls -la /workspace/isaaclab/source +./docker/run_dev.sh --rootless ``` -### GPU Access - +**Solution 3 (Manual Fix):** Manually change ownership on your host. ```bash -# Verify host GPU -nvidia-smi - -# Check container GPU access -./docker/container.sh -p ext-dev exec nvidia-smi - -# For rootless mode -export NVIDIA_DRIVER_CAPABILITIES=all -./docker/container.sh -p ext-dev-rootless run +sudo chown -R $(id -u):$(id -g) . ``` -## Migration Guide +### Problem: Build Fails or Old Layers Persist -### From Simple Docker Setup +**Solution:** Perform a clean rebuild. +```bash +./docker/container.sh -p ext-dev build --no-cache +``` -If using the previous simple Docker setup: -- The single Dockerfile has been replaced with two specialized containers -- Use `ext` for production/training -- Use `ext-dev` for development +### Problem: invalid spec: :/ssh-agent: empty section between colons warning -### From Environment Variables +**Cause:** The `SSH_AUTH_SOCK` or `DISPLAY` environment variables are not set on your host. Docker Compose shows a warning but continues. -Old approach with environment variables: +**Solution:** This is a harmless warning if you don't need SSH agent or GUI forwarding. To suppress it, you can set default values in your shell's startup file (e.g., `~/.bashrc` or `~/.zshrc`). ```bash -EXTERNAL_ISAACLAB_PATH=/path/to/isaaclab # No longer used -EXTERNAL_RSL_RL_PATH=/path/to/rsl_rl # No longer used +# Set default values for Docker environment variables +export SSH_AUTH_SOCK="${SSH_AUTH_SOCK:-/dev/null}" +export DISPLAY="${DISPLAY:-:0}" ``` -New approach: -```bash -./docker/container.sh mount-setup # Interactive configuration -# or -./docker/container.sh mount-set isaaclab /path/to/isaaclab -``` +### Problem: GPU Not Accessible Inside Container -## Best Practices +**Solution:** +1. Verify `nvidia-smi` works on the host +2. Verify NVIDIA Container Toolkit is installed: + ```bash + docker run --rm --gpus all nvidia/cuda:11.8.0-base-ubuntu20.04 nvidia-smi + ``` +3. Verify `nvidia-smi` works inside the container: + ```bash + ./docker/container.sh -p ext-dev exec nvidia-smi + ``` +4. Ensure your NVIDIA drivers are up to date -1. **Container Selection**: - - Use production container for cluster training and deployment - - Use development container for local development and debugging +## Migration Guide -2. **Permission Management**: - - Enable `FIX_PERMISSIONS` when working with mounted volumes - - Use rootless mode on systems without root Docker access +- **From 4-Container Setup:** The `ext-ros2` and `ext-dev-rootless` containers are now handled by the unified `ext-dev` container and its profile flags (`--rootless`) +- **From Old Mount System:** The old environment variables (`EXTERNAL_ISAACLAB_PATH`, etc.) are deprecated. Use the new mount management system: `./docker/container.sh mount-setup` -3. **Mount Configuration**: - - Prefer built-in IsaacLab/RSL-RL for stability - - Test mounts locally before cluster deployment - - Don't commit `.mount.config` or `docker-compose.override.yaml` +## Best Practices -4. **Performance**: - - Use production container for training to minimize overhead - - Limit mounted volumes to necessary paths only \ No newline at end of file +- **Development:** Use the `isaac-lab-ext-dev` container for all local development +- **Production:** Use the `isaac-lab-ext` container for cluster jobs and performance tests to minimize overhead +- **Mounts:** Prefer the built-in libraries for stability. Only use external mounts when actively developing on them +- **Permissions:** On your personal machine, use the default user-preserving mode. Use `--fix-perms` if you encounter ownership issues \ No newline at end of file diff --git a/docker/cluster/README.md b/docker/cluster/README.md index 5251c2cd..111776e7 100644 --- a/docker/cluster/README.md +++ b/docker/cluster/README.md @@ -66,6 +66,8 @@ Submit jobs with custom arguments: ./cluster_interface.sh -c job ext_template --task YourTask ``` +**Note**: Each job submission creates a timestamped directory on the cluster (e.g., `${CLUSTER_ISAACLAB_DIR}_20231215_143022`) to ensure multiple jobs don't interfere with each other. The script arguments are properly preserved and passed through to your training script. + ### External Codebase Mounting The cluster system supports the unified mount configuration: @@ -132,10 +134,16 @@ ssh $CLUSTER_LOGIN "qdel " ### View Job Output -Job outputs are stored in the exports directory: +Job outputs are written to the job scheduler's output location. The exact location depends on your cluster configuration: ```bash -ssh $CLUSTER_LOGIN "ls -la $CLUSTER_ISAACLAB_DIR/exports/" -ssh $CLUSTER_LOGIN "tail -f $CLUSTER_ISAACLAB_DIR/exports/" +# For SLURM (usually in the submission directory) +ssh $CLUSTER_LOGIN "ls -la slurm-*.out" + +# For PBS (check your cluster's default output location) +ssh $CLUSTER_LOGIN "ls -la *.o*" + +# View recent job output +ssh $CLUSTER_LOGIN "tail -f " ``` ## Environment Variables @@ -204,12 +212,22 @@ ssh $CLUSTER_LOGIN "df -h $CLUSTER_SIF_PATH" ssh $CLUSTER_LOGIN "ls -la $CLUSTER_SIF_PATH/*.tar" # Check job script was created -ssh $CLUSTER_LOGIN "ls -la $CLUSTER_ISAACLAB_DIR/*.sh" +ssh $CLUSTER_LOGIN "ls -la $CLUSTER_ISAACLAB_DIR_*/*.sh" -# View error logs -ssh $CLUSTER_LOGIN "cat $CLUSTER_ISAACLAB_DIR/exports/*.err" +# View error logs from job scheduler +# For SLURM +ssh $CLUSTER_LOGIN "cat slurm-*.out" +# For PBS +ssh $CLUSTER_LOGIN "cat *.e*" ``` +### Mount Path Issues + +If you see errors about `$CLUSTER_USER` not being expanded: +1. Ensure `CLUSTER_USER` is set in your `.env.cluster` file +2. Check that the environment file is being sourced correctly +3. Verify mount paths don't contain literal `$CLUSTER_USER` after expansion + ### Performance Tips 1. **Use appropriate `--num_envs`**: Balance between GPU memory and parallelism From 090d70b99bf6a9b4e4124f28a3c43c46c8788ef9 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Tue, 3 Jun 2025 19:45:07 +0200 Subject: [PATCH 09/10] Remove RSL-RL installation and RTX sensor symlinks from Dockerfiles - Remove automatic RSL-RL installation from both production and dev containers - Remove RTX sensor symlink creation (no longer needed) - Update documentation to reflect RSL-RL is not included by default - Remove migration guide section from Docker README --- docker/Dockerfile.ext | 9 +-------- docker/Dockerfile.ext-dev | 12 ------------ docker/README.md | 7 +------ 3 files changed, 2 insertions(+), 26 deletions(-) diff --git a/docker/Dockerfile.ext b/docker/Dockerfile.ext index 9a298794..3bdba9ff 100644 --- a/docker/Dockerfile.ext +++ b/docker/Dockerfile.ext @@ -34,14 +34,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ cd ${DOCKER_EXT_PATH} && \ /workspace/isaaclab/isaaclab.sh -p -m pip install -e source/${EXTENSION_NAME} --no-build-isolation -# Create required symlinks for sensors -RUN mkdir -p /workspace/isaaclab/source/exts/ && \ - ln -s /workspace/isaaclab/_isaac_sim/exts/isaacsim.sensors.rtx /workspace/isaaclab/source/exts/isaacsim.sensors.rtx - -# Clone and Install rsl_rl -RUN git clone https://github.com/leggedrobotics/rsl_rl.git /tmp/rsl_rl && \ - cd /tmp/rsl_rl && \ - ${ISAACLAB_PATH}/isaaclab.sh -p -m pip install . + # (Do not remove the DOCKER_EXT_PATH folder; keep it for the bind mount) diff --git a/docker/Dockerfile.ext-dev b/docker/Dockerfile.ext-dev index d9d661a8..ade6cd9f 100644 --- a/docker/Dockerfile.ext-dev +++ b/docker/Dockerfile.ext-dev @@ -84,19 +84,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ # ========================= RUN rm -rf ${DOCKER_EXT_PATH}/source/${EXTENSION_NAME} -# ========================= -# Create Symlinks for Sensors -# ========================= -RUN mkdir -p /workspace/isaaclab/source/exts/ && \ - ln -s /workspace/isaaclab/_isaac_sim/exts/isaacsim.sensors.rtx /workspace/isaaclab/source/exts/isaacsim.sensors.rtx -# ========================= -# Clone and Install rsl_rl -# ========================= -RUN git clone https://github.com/leggedrobotics/rsl_rl.git /tmp/rsl_rl && \ - cd /tmp/rsl_rl && \ - ${ISAACLAB_PATH}/isaaclab.sh -p -m pip install . && \ - rm -rf /tmp/rsl_rl # ========================= # Set Ownership and Permissions diff --git a/docker/README.md b/docker/README.md index 16c38d1d..73cd5260 100644 --- a/docker/README.md +++ b/docker/README.md @@ -73,7 +73,7 @@ Two primary containers are provided: - **Production (`isaac-lab-ext`)**: - **Dockerfile:** `Dockerfile.ext` - **Purpose:** A minimal, lightweight container designed for headless training and cluster deployment - - **Includes:** Isaac Lab, RSL-RL, and your extension + - **Includes:** Isaac Lab (with RSL-RL) and your extension - **Excludes:** ROS2, GUI tools, and other development dependencies - **Development (`isaac-lab-ext-dev`)**: @@ -266,11 +266,6 @@ export DISPLAY="${DISPLAY:-:0}" ``` 4. Ensure your NVIDIA drivers are up to date -## Migration Guide - -- **From 4-Container Setup:** The `ext-ros2` and `ext-dev-rootless` containers are now handled by the unified `ext-dev` container and its profile flags (`--rootless`) -- **From Old Mount System:** The old environment variables (`EXTERNAL_ISAACLAB_PATH`, etc.) are deprecated. Use the new mount management system: `./docker/container.sh mount-setup` - ## Best Practices - **Development:** Use the `isaac-lab-ext-dev` container for all local development From f6c0e6e3c0e1e8cf33571b9e3a8383939163e653 Mon Sep 17 00:00:00 2001 From: Idate96 Date: Tue, 2 Sep 2025 10:30:14 +0200 Subject: [PATCH 10/10] Enhance Docker system with comprehensive documentation and convenience script ## Key Improvements ### New Features - Add missing run_dev.sh convenience script with user-friendly mode switching - Add comprehensive environment file setup guide with examples - Add system validation commands and performance expectations - Add complete cluster workflow examples with timeline estimates ### Documentation Enhancements - Add step-by-step environment configuration with generic placeholder examples - Add performance benchmarks and system requirements based on testing - Add troubleshooting for common setup issues (environment files, mount config, builds) - Add job monitoring examples and cluster management workflows - Add ETH Euler and generic university cluster configuration examples ### Validation & Testing - Successfully tested complete Docker workflow from setup to cluster deployment - Validated container building, job submission, and experiment sync - Tested cluster integration with live job submission and monitoring - All documentation examples verified through end-to-end testing ### User Experience Improvements - Clear distinction between mandatory and optional configuration steps - Quick start validation commands to verify setup before use - Generic placeholders throughout (no personal details exposed) - Enhanced troubleshooting based on real setup friction points discovered The Docker system now provides a seamless workflow from local development to cluster deployment with comprehensive, tested documentation. --- docker/README.md | 199 ++++++++++++++++++++++++++++++++++++--- docker/cluster/README.md | 114 ++++++++++++++++++---- docker/run_dev.sh | 117 +++++++++++++++++++++++ 3 files changed, 399 insertions(+), 31 deletions(-) create mode 100755 docker/run_dev.sh diff --git a/docker/README.md b/docker/README.md index 73cd5260..3970ea5e 100644 --- a/docker/README.md +++ b/docker/README.md @@ -31,7 +31,19 @@ This is the fastest way to get a development environment running. # You can optionally edit these files with custom settings. ``` -2. **Run the development container:** +2. **Validate your setup:** + ```bash + # Test basic Docker functionality + docker --version + + # Verify environment files are configured + ./docker/run_dev.sh --help + + # Test mount configuration system + ./docker/container.sh mount-show + ``` + +3. **Run the development container:** ```bash # Start an interactive shell in the dev container ./docker/run_dev.sh @@ -54,15 +66,50 @@ This is the fastest way to get a development environment running. ### Environment Files Setup -Before running any extension containers, you must create the environment files from their templates: +Before running any extension containers, you must create and configure the environment files from their templates: +#### Step 1: Copy Templates ```bash # Create the required .env files cp docker/.env.ext_template.template docker/.env.ext_template cp docker/.env.ext_template-dev.template docker/.env.ext_template-dev ``` -These files contain container-specific configuration and are required for the containers to start properly. You can edit them to customize settings if needed, but the defaults should work for most use cases. +#### Step 2: Configure Environment Files + +Edit `docker/.env.ext_template-dev` with your specific paths: + +```bash +# ========================= +# Extension Configuration +# ========================= +EXTENSION_NAME=ext_template +EXTENSION_FOLDER=/path/to/your/project/folder # ← CHANGE THIS +EXT_PATH=$EXTENSION_FOLDER +DOCKER_EXT_PATH=/workspace/$EXTENSION_NAME + +# ========================= +# Docker User Configuration +# ========================= +HOST_HOME=/home/your_username # ← CHANGE THIS +DOCKER_USER_NAME=your_username # ← CHANGE THIS +DOCKER_USER_HOME=/home/your_username # ← CHANGE THIS +``` + +**Example Configuration:** +```bash +EXTENSION_FOLDER=/home/alice/projects/IsaacLabExtensionTemplate +HOST_HOME=/home/alice +DOCKER_USER_NAME=alice +DOCKER_USER_HOME=/home/alice +``` + +#### Step 3: Validate Configuration +```bash +# Verify environment files are correctly configured +./docker/run_dev.sh --help +# Should show help without errors about missing environment files +``` ## Core Concepts @@ -95,11 +142,32 @@ To prevent file ownership issues when mounting local directories, the developmen | Auto-Fix Permissions | `./docker/run_dev.sh --fix-perms` | chowns files to your user on exit | | Shared Server | `./docker/run_dev.sh -u 2000 -g 2000` | Uses a custom UID/GID | +### Quick Start with run_dev.sh + +The `run_dev.sh` script provides the easiest way to get started: + +```bash +# Basic usage - start interactive shell +./docker/run_dev.sh + +# Run a command +./docker/run_dev.sh python scripts/rsl_rl/train.py --task=YourTask + +# Rootless mode (for restricted environments) +./docker/run_dev.sh --rootless + +# With permission fixing +./docker/run_dev.sh --fix-perms + +# Custom user/group +./docker/run_dev.sh --uid 1001 --gid 1001 +``` + ## Usage Guide ### The container.sh Script -The `./docker/container.sh` script is the main interface for managing containers. The `./docker/run_dev.sh` script is a convenient wrapper around it. +The `./docker/container.sh` script is the main interface for managing containers. The `./docker/run_dev.sh` script is a convenient wrapper that provides easy mode switching and user-friendly options. **Basic Syntax:** `./docker/container.sh -p ` @@ -151,10 +219,14 @@ Once you have the base images, you can build the extension containers: **Development:** ```bash -# Start an interactive shell -./docker/container.sh -p ext-dev run +# Start an interactive shell (recommended - use run_dev.sh) +./docker/run_dev.sh + +# Run a specific training task (recommended - use run_dev.sh) +./docker/run_dev.sh python scripts/rsl_rl/train.py --task=YourTask -# Run a specific training task +# Alternative: Using container.sh directly +./docker/container.sh -p ext-dev run ./docker/container.sh -p ext-dev run python scripts/rsl_rl/train.py --task=YourTask ``` @@ -190,21 +262,56 @@ This system works by creating a `.mount.config` file and auto-generating a `dock For detailed cluster operations, see `docker/cluster/README.md`. -**Quick Workflow:** +**Quick Cluster Workflow:** ```bash # 1. (Optional) Configure mounts for cluster usage ./docker/container.sh mount-setup -# 2. Navigate to the cluster directory +# 2. Set up cluster environment (first time only) +cp docker/cluster/.env.cluster.template docker/cluster/.env.cluster +# Edit .env.cluster with your cluster-specific settings + +# 3. Navigate to the cluster directory cd docker/cluster -# 3. Push the container image to the cluster's registry +# 4. Push the container image to the cluster's registry ./cluster_interface.sh push ext_template -# 4. Submit a job +# 5. Submit a job ./cluster_interface.sh job ext_template --task YourTask --num_envs 64000 + +# 6. Sync results back to local machine +./sync_experiments.sh --remove ~/experiments/logs ``` +## System Requirements & Performance Expectations + +### Docker Version Compatibility +**Tested Combinations:** +- Docker 24.0.7 + Apptainer 1.2.5 +- Docker β‰₯ 27.0.0 + Apptainer β‰₯ 1.3.4 + +**System Requirements:** +- NVIDIA GPU with current drivers +- 20GB+ free disk space for containers +- 8GB+ RAM recommended for development +- Fast internet connection for initial builds + +### Expected Performance +**Container Build Times:** +- Production container (`isaac-lab-ext`): ~5-10 minutes +- Development container (`isaac-lab-ext-dev`): ~10-15 minutes +- Subsequent builds with cache: ~2-5 minutes + +**Container Sizes:** +- Production: ~26-27GB +- Development: ~50-52GB + +**Cluster Operations:** +- Container push to cluster: ~5-15 minutes (depending on network) +- Code sync: ~30 seconds for typical extension +- Job submission: ~1-5 seconds + ## Troubleshooting ### Problem: Docker/Apptainer Version Compatibility @@ -219,7 +326,7 @@ If you encounter issues with other versions, consider updating to a tested combi **Cause:** Files created in the container are owned by root. -**Solution 1 (Auto-fix):** Use the `--fix-perms` flag to chown files on exit. +**Solution 1 (Recommended - Auto-fix):** Use the `--fix-perms` flag to automatically fix permissions on exit. ```bash ./docker/run_dev.sh --fix-perms ``` @@ -252,6 +359,42 @@ export SSH_AUTH_SOCK="${SSH_AUTH_SOCK:-/dev/null}" export DISPLAY="${DISPLAY:-:0}" ``` +### Problem: Environment File Not Found or Configured + +**Cause:** Environment files not copied or paths not configured. + +**Solution:** +1. Copy templates and configure paths: + ```bash + cp docker/.env.ext_template-dev.template docker/.env.ext_template-dev + # Edit the file with your actual paths (see Environment Files Setup section) + ``` +2. Validate configuration: + ```bash + ./docker/run_dev.sh --help # Should not show missing file errors + ``` + +### Problem: Mount Configuration Missing + +**Cause:** Docker compose override file not generated. + +**Solution:** +```bash +# Generate mount configuration (even if not using external mounts) +cd docker +python3 mount_config.py generate +``` + +### Problem: Container Build Timeout or Failure + +**Cause:** Network issues, insufficient disk space, or base image missing. + +**Solution:** +1. Check available disk space: `df -h` +2. Ensure base images exist: `docker images | grep isaac-lab` +3. Clean Docker cache if needed: `docker system prune` +4. Try building with no cache: `./docker/container.sh -p ext-dev build --no-cache` + ### Problem: GPU Not Accessible Inside Container **Solution:** @@ -268,7 +411,35 @@ export DISPLAY="${DISPLAY:-:0}" ## Best Practices -- **Development:** Use the `isaac-lab-ext-dev` container for all local development +- **Development:** Use `./docker/run_dev.sh` for all local development - it provides the easiest interface - **Production:** Use the `isaac-lab-ext` container for cluster jobs and performance tests to minimize overhead - **Mounts:** Prefer the built-in libraries for stability. Only use external mounts when actively developing on them -- **Permissions:** On your personal machine, use the default user-preserving mode. Use `--fix-perms` if you encounter ownership issues \ No newline at end of file +- **Permissions:** On your personal machine, use the default user-preserving mode. Use `--fix-perms` if you encounter ownership issues +- **Environment Setup:** Always copy and configure the environment templates before first use +- **Cluster Operations:** Set up the cluster environment file before pushing containers to cluster + +### File Structure Overview + +``` +docker/ +β”œβ”€β”€ README.md # This documentation +β”œβ”€β”€ run_dev.sh # Convenience script for development +β”œβ”€β”€ container.sh # Main container management script +β”œβ”€β”€ mount_config.py # Mount configuration management +β”œβ”€β”€ Dockerfile.ext # Production container +β”œβ”€β”€ Dockerfile.ext-dev # Development container +β”œβ”€β”€ docker-compose.yaml # Container definitions +β”œβ”€β”€ docker-compose.override.yaml.template # Template for mount overrides +β”œβ”€β”€ .env.ext_template.template # Production environment template +β”œβ”€β”€ .env.ext_template-dev.template # Development environment template +β”œβ”€β”€ entrypoint.sh # Container entrypoint +β”œβ”€β”€ dynamic_entrypoint.sh # Multi-mode entrypoint +└── cluster/ # Cluster deployment scripts + β”œβ”€β”€ README.md # Cluster-specific documentation + β”œβ”€β”€ cluster_interface.sh # Main cluster interface + β”œβ”€β”€ .env.cluster.template # Cluster environment template + β”œβ”€β”€ submit_job_slurm.sh # SLURM job submission + β”œβ”€β”€ submit_job_pbs.sh # PBS job submission + β”œβ”€β”€ sync_experiments.sh # Experiment syncing + └── sync_mounts.py # Mount syncing for cluster +``` \ No newline at end of file diff --git a/docker/cluster/README.md b/docker/cluster/README.md index 111776e7..29b81173 100644 --- a/docker/cluster/README.md +++ b/docker/cluster/README.md @@ -8,21 +8,45 @@ This guide focuses on cluster-specific operations for deploying IsaacLab extensi 2. **Docker and Apptainer** installed locally 3. **Environment file**: Copy and configure the cluster environment: ```bash - cp .env.cluster.template .env.cluster + cp docker/cluster/.env.cluster.template docker/cluster/.env.cluster ``` - Edit `.env.cluster` with your cluster-specific settings: - - `CLUSTER_USER`: Your cluster username - - `CLUSTER_LOGIN`: SSH login string (e.g., `username@euler.ethz.ch`) - - `CLUSTER_ISAACLAB_DIR`: Base directory for experiments on cluster - - `CLUSTER_SIF_PATH`: Directory for Singularity images - - `CLUSTER_JOB_SCHEDULER`: SLURM or PBS - - `CLUSTER_PYTHON_EXECUTABLE`: Script to run (e.g., `scripts/rsl_rl/train.py`) + +#### Cluster Configuration Examples + +Edit `docker/cluster/.env.cluster` with your cluster-specific settings: + +**Generic University Cluster:** +```bash +CLUSTER_USER=your_username # ← CHANGE THIS +CLUSTER_LOGIN=$CLUSTER_USER@cluster.university.edu # ← CHANGE THIS +CLUSTER_ISAACLAB_DIR=/cluster/home/$CLUSTER_USER/$EXTENSION_NAME +CLUSTER_SIF_PATH=/cluster/scratch/$CLUSTER_USER +CLUSTER_JOB_SCHEDULER=SLURM # or PBS +``` + +**ETH Euler Example:** +```bash +CLUSTER_USER=your_nethz_id +CLUSTER_LOGIN=$CLUSTER_USER@euler.ethz.ch +CLUSTER_ISAACLAB_DIR=/cluster/home/$CLUSTER_USER/$EXTENSION_NAME +CLUSTER_SIF_PATH=/cluster/work/rsl/$CLUSTER_USER +CLUSTER_JOB_SCHEDULER=SLURM +``` + +#### Validate Cluster Setup +```bash +# Test SSH connection +ssh $CLUSTER_USER@your_cluster_address "echo 'Connection successful'" + +# Verify directories exist +ssh $CLUSTER_USER@your_cluster_address "mkdir -p /path/to/cluster/directories" +``` ## Quick Start ```bash # 1. Build container locally (if not already done) -cd ../ +cd docker ./container.sh -p ext build # 2. Push container to cluster @@ -74,7 +98,7 @@ The cluster system supports the unified mount configuration: ```bash # Configure mounts locally before pushing -cd ../ +cd docker ./container.sh mount-setup # Mounts are automatically synced with the container @@ -89,7 +113,7 @@ cd cluster ```bash # Configure mount-only mode -cd ../ +cd docker ./container.sh mount-enable isaaclab ./container.sh mount-set-sync isaaclab off ./container.sh mount-set-cluster isaaclab /cluster/home/$USER/isaaclab @@ -115,11 +139,24 @@ Sync experiment logs from cluster to local machine: ### Check Job Status ```bash -# SLURM -ssh $CLUSTER_LOGIN "squeue -u $USER" +# SLURM - List your jobs +ssh $CLUSTER_LOGIN "squeue -u \$USER" -# PBS -ssh $CLUSTER_LOGIN "qstat -u $USER" +# SLURM - Detailed job information +ssh $CLUSTER_LOGIN "scontrol show job " + +# PBS - List your jobs +ssh $CLUSTER_LOGIN "qstat -u \$USER" + +# PBS - Detailed job information +ssh $CLUSTER_LOGIN "qstat -f " +``` + +**Example Output:** +```bash +$ ssh your_user@cluster.edu "squeue -u \$USER" + JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON) + 12345678 gpuhe.4h isaaclab your_user R 2:15 1 gpu-node-001 ``` ### Cancel Jobs @@ -277,10 +314,53 @@ ENV_CLUSTER_PATH=.env.cluster.gpu2 \ ./cluster_interface.sh push ext_template ``` +## Complete Workflow Example + +Here's a complete example of training a quadruped robot on a cluster: + +```bash +# 1. Setup (one-time) +cp docker/cluster/.env.cluster.template docker/cluster/.env.cluster +# Edit .env.cluster with your cluster credentials + +# 2. Test locally first +./docker/run_dev.sh python scripts/rsl_rl/train.py --task Isaac-Velocity-Flat-Anymal-D-v0 --num_envs 64 --headless + +# 3. Build and push to cluster +cd docker +./container.sh -p ext build +cd cluster +./cluster_interface.sh push ext_template + +# 4. Submit training job +./cluster_interface.sh job ext_template --task Isaac-Velocity-Flat-Anymal-D-v0 --num_envs 4096 --headless + +# 5. Monitor job +ssh your_user@cluster.edu "squeue -u \$USER" + +# 6. Check logs during training +ssh your_user@cluster.edu "tail -f /path/to/job/output.log" + +# 7. Sync results back when complete +./sync_experiments.sh ~/experiments/logs + +# 8. Clean up (optional) +./sync_experiments.sh --remove ~/experiments/logs +``` + +**Expected Timeline:** +- Local test: ~2-5 minutes +- Container push: ~10-15 minutes +- Job submission: ~5 seconds +- Training time: varies (30 minutes to several hours) +- Results sync: ~1-2 minutes + ## Best Practices 1. **Test Locally First**: Verify your code works in the Docker container before cluster submission 2. **Start Small**: Test with fewer environments before scaling up -3. **Monitor Resources**: Check cluster quotas and job limits +3. **Monitor Resources**: Check cluster quotas and job limits 4. **Use Checkpointing**: Save models periodically for long-running jobs -5. **Clean Up**: Remove old experiments to save cluster storage \ No newline at end of file +5. **Clean Up**: Remove old experiments to save cluster storage +6. **Resource Planning**: Check queue wait times during peak hours +7. **Validate Setup**: Use the validation commands before first cluster use \ No newline at end of file diff --git a/docker/run_dev.sh b/docker/run_dev.sh new file mode 100755 index 00000000..5b4ffa02 --- /dev/null +++ b/docker/run_dev.sh @@ -0,0 +1,117 @@ +#!/usr/bin/env bash +# +# Convenience script to run the development container with easy mode switching +# + +set -e + +# Colors for output +RED='\033[0;31m' +GREEN='\033[0;32m' +YELLOW='\033[1;33m' +BLUE='\033[0;34m' +NC='\033[0m' # No Color + +# Default values +MODE="root" +COMMAND="" + +# Function to display help +show_help() { + echo "Development Container Runner" + echo "" + echo "Usage: $0 [options] [command]" + echo "" + echo "Options:" + echo " -h, --help Show this help message" + echo " -r, --rootless Run in rootless mode (everyone as root inside container)" + echo " -f, --fix-perms Enable automatic permission fixing on exit" + echo " -u, --uid UID Set local UID (default: current user)" + echo " -g, --gid GID Set local GID (default: current group)" + echo "" + echo "Examples:" + echo " $0 # Run in default mode" + echo " $0 --rootless # Run in rootless mode" + echo " $0 python scripts/rsl_rl/train.py # Run with command" + echo " $0 --rootless --fix-perms # Rootless with permission fixing" + echo "" + echo "Environment Variables:" + echo " DOCKER_ROOTLESS_MODE Set to 'true' for rootless mode" + echo " FIX_PERMISSIONS Set to 'true' to fix permissions on exit" + echo " LOCAL_UID Override user ID" + echo " LOCAL_GID Override group ID" +} + +# Parse command line arguments +while [[ $# -gt 0 ]]; do + case $1 in + -h|--help) + show_help + exit 0 + ;; + -r|--rootless) + MODE="rootless" + shift + ;; + -f|--fix-perms) + export FIX_PERMISSIONS="true" + shift + ;; + -u|--uid) + export LOCAL_UID="$2" + shift 2 + ;; + -g|--gid) + export LOCAL_GID="$2" + shift 2 + ;; + *) + # Rest are commands to pass to container + COMMAND="$@" + break + ;; + esac +done + +# Get script directory +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" + +# Check if .env file exists +ENV_FILE="$SCRIPT_DIR/.env.ext_template-dev" +if [ ! -f "$ENV_FILE" ]; then + echo -e "${RED}Error: Environment file not found: $ENV_FILE${NC}" + echo -e "${YELLOW}Please copy and configure the environment file:${NC}" + echo -e " ${GREEN}cp $ENV_FILE.template $ENV_FILE${NC}" + echo "" + echo "Then edit the file to set:" + echo " - EXTENSION_FOLDER: Path to your project folder" + echo " - HOST_HOME: Your home directory" + echo " - DOCKER_USER_NAME: Your username" + echo " - Optionally: WANDB_API_KEY and WANDB_USERNAME" + echo "" + exit 1 +fi + +# Set defaults +export LOCAL_UID=${LOCAL_UID:-$(id -u)} +export LOCAL_GID=${LOCAL_GID:-$(id -g)} + +# Display configuration +echo -e "${BLUE}=== Development Container Configuration ===${NC}" +echo -e "Mode: ${GREEN}$MODE${NC}" +echo -e "UID/GID: ${GREEN}$LOCAL_UID/$LOCAL_GID${NC}" +echo -e "Fix Permissions: ${GREEN}${FIX_PERMISSIONS:-false}${NC}" +if [ -n "$COMMAND" ]; then + echo -e "Command: ${GREEN}$COMMAND${NC}" +fi +echo -e "${BLUE}========================================${NC}" +echo "" + +# Run container based on mode +if [ "$MODE" = "rootless" ]; then + echo -e "${YELLOW}Starting in ROOTLESS mode...${NC}" + exec "$SCRIPT_DIR/container.sh" -p ext-dev-rootless run $COMMAND +else + echo -e "${GREEN}Starting in ROOT mode with user switching...${NC}" + exec "$SCRIPT_DIR/container.sh" -p ext-dev run $COMMAND +fi \ No newline at end of file