diff --git a/.github/unittest/linux/scripts/run_all.sh b/.github/unittest/linux/scripts/run_all.sh
index eb491684112..5ead4cea70b 100755
--- a/.github/unittest/linux/scripts/run_all.sh
+++ b/.github/unittest/linux/scripts/run_all.sh
@@ -258,6 +258,8 @@ fi
 # ==================================================================================== #
 # ================================ Run tests ========================================= #
 
+TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed
+
 export PYTORCH_TEST_WITH_SLOW='1'
 
 python -m torch.utils.collect_env
@@ -269,27 +271,45 @@ pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_contro
 # Track if any tests fail
 EXIT_STATUS=0
 
-# Run distributed tests first (GPU only) to surface errors early
-if [ "${CU_VERSION:-}" != cpu ] ; then
+run_distributed_tests() {
+  # Distributed tests are GPU-only in our CI.
+  if [ "${CU_VERSION:-}" == cpu ] ; then
+    echo "TORCHRL_TEST_SUITE=${TORCHRL_TEST_SUITE}: distributed tests require GPU (CU_VERSION != cpu)."
+    return 1
+  fi
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py \
     --instafail --durations 200 -vv --capture no \
-    --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
-fi
+    --timeout=120 --mp_fork_if_no_cuda
+}
 
-# Run remaining tests (always run even if distributed tests failed)
-if [ "${CU_VERSION:-}" != cpu ] ; then
-  python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
-    --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
-    --ignore test/test_distributed.py \
-    --ignore test/llm \
-    --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
-else
+run_non_distributed_tests() {
+  # Note: we always ignore distributed tests here (they can be run in a separate job).
   python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
     --instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
     --ignore test/test_distributed.py \
     --ignore test/llm \
-    --timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
-fi
+    --timeout=120 --mp_fork_if_no_cuda
+}
+
+case "${TORCHRL_TEST_SUITE}" in
+  all)
+    # Run distributed tests first (GPU only) to surface errors early, then the rest.
+    if [ "${CU_VERSION:-}" != cpu ] ; then
+      run_distributed_tests || EXIT_STATUS=$?
+    fi
+    run_non_distributed_tests || EXIT_STATUS=$?
+    ;;
+  distributed)
+    run_distributed_tests || EXIT_STATUS=$?
+    ;;
+  nondistributed)
+    run_non_distributed_tests || EXIT_STATUS=$?
+    ;;
+  *)
+    echo "Unknown TORCHRL_TEST_SUITE='${TORCHRL_TEST_SUITE}'. Expected: all|distributed|nondistributed."
+    exit 2
+    ;;
+esac
 
 # Fail the workflow if any tests failed
 if [ $EXIT_STATUS -ne 0 ]; then
diff --git a/.github/workflows/test-linux.yml b/.github/workflows/test-linux.yml
index 9bfec8b4455..14aaadb580f 100644
--- a/.github/workflows/test-linux.yml
+++ b/.github/workflows/test-linux.yml
@@ -82,6 +82,9 @@ jobs:
         fi
         export TD_GET_DEFAULTS_TO_NONE=1
 
+        # Run everything except distributed tests; those run in parallel in tests-gpu-distributed.
+        export TORCHRL_TEST_SUITE=nondistributed
+
         # Remove the following line when the GPU tests are working inside docker, and uncomment the above lines
         #export CU_VERSION="cpu"
 
@@ -91,6 +94,44 @@ jobs:
         ## setup_env.sh
         bash .github/unittest/linux/scripts/run_all.sh
 
+  tests-gpu-distributed:
+    strategy:
+      matrix:
+        python_version: ["3.12"]
+        cuda_arch_version: ["13.0"]
+      fail-fast: false
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      repository: pytorch/rl
+      docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04"
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda_arch_version }}
+      timeout: 120
+      script: |
+        # Set env vars from matrix
+        export PYTHON_VERSION=${{ matrix.python_version }}
+        # Commenting these out for now because the GPU tests are not working inside docker
+        export CUDA_ARCH_VERSION=${{ matrix.cuda_arch_version }}
+        export CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"
+        if [[ "${{ github.ref }}" =~ release/* ]]; then
+          export RELEASE=1
+          export TORCH_VERSION=stable
+        else
+          export RELEASE=0
+          export TORCH_VERSION=nightly
+        fi
+        export TD_GET_DEFAULTS_TO_NONE=1
+
+        # Only distributed tests (runs in parallel with tests-gpu).
+        export TORCHRL_TEST_SUITE=distributed
+
+        echo "PYTHON_VERSION: $PYTHON_VERSION"
+        echo "CU_VERSION: $CU_VERSION"
+
+        ## setup_env.sh
+        bash .github/unittest/linux/scripts/run_all.sh
+
   tests-olddeps:
     strategy:
       matrix:
@@ -196,5 +237,46 @@ jobs:
         echo "CU_VERSION: $CU_VERSION"
 
         export TD_GET_DEFAULTS_TO_NONE=1
+        # Run everything except distributed tests; those run in parallel in tests-stable-gpu-distributed.
+        export TORCHRL_TEST_SUITE=nondistributed
+
+        ## setup_env.sh
+        bash .github/unittest/linux/scripts/run_all.sh
+
+  tests-stable-gpu-distributed:
+    strategy:
+      matrix:
+        python_version: ["3.12"] # "3.9", "3.10", "3.11"
+        cuda_arch_version: ["13.0"] # "11.6", "11.7"
+      fail-fast: false
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.g5.4xlarge.nvidia.gpu
+      repository: pytorch/rl
+      docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04"
+      gpu-arch-type: cuda
+      gpu-arch-version: ${{ matrix.cuda_arch_version }}
+      timeout: 120
+      script: |
+        # Set env vars from matrix
+        export PYTHON_VERSION=${{ matrix.python_version }}
+        # Commenting these out for now because the GPU tests are not working inside docker
+        export CUDA_ARCH_VERSION=${{ matrix.cuda_arch_version }}
+        export CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"
+
+        if [[ "${{ github.ref }}" =~ release/* ]]; then
+          export RELEASE=1
+          export TORCH_VERSION=stable
+        else
+          export RELEASE=0
+          export TORCH_VERSION=nightly
+        fi
+
+        export TD_GET_DEFAULTS_TO_NONE=1
+        export TORCHRL_TEST_SUITE=distributed
+
+        echo "PYTHON_VERSION: $PYTHON_VERSION"
+        echo "CU_VERSION: $CU_VERSION"
+
         ## setup_env.sh
         bash .github/unittest/linux/scripts/run_all.sh