48 changes: 34 additions & 14 deletions .github/unittest/linux/scripts/run_all.sh
@@ -258,6 +258,8 @@ fi
# ==================================================================================== #
# ================================ Run tests ========================================= #

TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed

export PYTORCH_TEST_WITH_SLOW='1'
python -m torch.utils.collect_env

@@ -269,27 +271,45 @@ pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_contro
# Track if any tests fail
EXIT_STATUS=0

# Run distributed tests first (GPU only) to surface errors early
if [ "${CU_VERSION:-}" != cpu ] ; then
run_distributed_tests() {
# Distributed tests are GPU-only in our CI.
if [ "${CU_VERSION:-}" == cpu ] ; then
echo "TORCHRL_TEST_SUITE=${TORCHRL_TEST_SUITE}: distributed tests require GPU (CU_VERSION != cpu)."
return 1
fi
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py \
--instafail --durations 200 -vv --capture no \
--timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
fi
--timeout=120 --mp_fork_if_no_cuda
}

# Run remaining tests (always run even if distributed tests failed)
if [ "${CU_VERSION:-}" != cpu ] ; then
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
--instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
--ignore test/test_distributed.py \
--ignore test/llm \
--timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
else
run_non_distributed_tests() {
# Note: we always ignore distributed tests here (they can be run in a separate job).
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
--instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
--ignore test/test_distributed.py \
--ignore test/llm \
--timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
fi
--timeout=120 --mp_fork_if_no_cuda
}

case "${TORCHRL_TEST_SUITE}" in
all)
# Run distributed tests first (GPU only) to surface errors early, then the rest.
if [ "${CU_VERSION:-}" != cpu ] ; then
run_distributed_tests || EXIT_STATUS=$?
fi
run_non_distributed_tests || EXIT_STATUS=$?
;;
distributed)
run_distributed_tests || EXIT_STATUS=$?
;;
nondistributed)
run_non_distributed_tests || EXIT_STATUS=$?
;;
*)
echo "Unknown TORCHRL_TEST_SUITE='${TORCHRL_TEST_SUITE}'. Expected: all|distributed|nondistributed."
exit 2
;;
esac

# Fail the workflow if any tests failed
if [ $EXIT_STATUS -ne 0 ]; then
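
A minimal usage sketch of the new suite selector in run_all.sh (not part of the diff; the invocations and the CU_VERSION values are illustrative):

    # GPU job: run only the distributed tests.
    TORCHRL_TEST_SUITE=distributed CU_VERSION=cu130 bash .github/unittest/linux/scripts/run_all.sh

    # CPU job: run_distributed_tests is skipped even under the default "all".
    TORCHRL_TEST_SUITE=all CU_VERSION=cpu bash .github/unittest/linux/scripts/run_all.sh

    # Any other value hits the catch-all branch and exits with status 2.
    TORCHRL_TEST_SUITE=foo bash .github/unittest/linux/scripts/run_all.sh
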
82 changes: 82 additions & 0 deletions .github/workflows/test-linux.yml
@@ -82,6 +82,9 @@ jobs:
fi
export TD_GET_DEFAULTS_TO_NONE=1

# Run everything except distributed tests; those run in parallel in tests-gpu-distributed.
export TORCHRL_TEST_SUITE=nondistributed

# Remove the following line when the GPU tests are working inside docker, and uncomment the above lines
#export CU_VERSION="cpu"

@@ -91,6 +94,44 @@ jobs:
## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh

tests-gpu-distributed:
strategy:
matrix:
python_version: ["3.12"]
cuda_arch_version: ["13.0"]
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
repository: pytorch/rl
docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04"
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda_arch_version }}
timeout: 120
script: |
# Set env vars from matrix
export PYTHON_VERSION=${{ matrix.python_version }}
# Commenting these out for now because the GPU tests are not working inside docker
export CUDA_ARCH_VERSION=${{ matrix.cuda_arch_version }}
export CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"
if [[ "${{ github.ref }}" =~ release/* ]]; then
export RELEASE=1
export TORCH_VERSION=stable
else
export RELEASE=0
export TORCH_VERSION=nightly
fi
export TD_GET_DEFAULTS_TO_NONE=1

# Only distributed tests (runs in parallel with tests-gpu).
export TORCHRL_TEST_SUITE=distributed

echo "PYTHON_VERSION: $PYTHON_VERSION"
echo "CU_VERSION: $CU_VERSION"

## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh

tests-olddeps:
strategy:
matrix:
@@ -196,5 +237,46 @@ jobs:
echo "CU_VERSION: $CU_VERSION"
export TD_GET_DEFAULTS_TO_NONE=1

# Run everything except distributed tests; those run in parallel in tests-stable-gpu-distributed.
export TORCHRL_TEST_SUITE=nondistributed

## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh

tests-stable-gpu-distributed:
strategy:
matrix:
python_version: ["3.12"] # "3.9", "3.10", "3.11"
cuda_arch_version: ["13.0"] # "11.6", "11.7"
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
repository: pytorch/rl
docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04"
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda_arch_version }}
timeout: 120
script: |
# Set env vars from matrix
export PYTHON_VERSION=${{ matrix.python_version }}
# Commenting these out for now because the GPU tests are not working inside docker
export CUDA_ARCH_VERSION=${{ matrix.cuda_arch_version }}
export CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"

if [[ "${{ github.ref }}" =~ release/* ]]; then
export RELEASE=1
export TORCH_VERSION=stable
else
export RELEASE=0
export TORCH_VERSION=nightly
fi

export TD_GET_DEFAULTS_TO_NONE=1
export TORCHRL_TEST_SUITE=distributed

echo "PYTHON_VERSION: $PYTHON_VERSION"
echo "CU_VERSION: $CU_VERSION"

## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh
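
For reference, the CU_VERSION derivation used in both new GPU jobs maps the matrix value to a CUDA tag via bash substring expansion; a quick sketch with an illustrative value, run outside CI:

    CUDA_ARCH_VERSION="13.0"
    # ${VAR:0:2} -> "13" (major version), ${VAR:3:1} -> "0" (first digit after the dot)
    CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"
    echo "$CU_VERSION"   # prints cu130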