48 changes: 34 additions & 14 deletions .github/unittest/linux/scripts/run_all.sh
@@ -258,6 +258,8 @@ fi
# ==================================================================================== #
# ================================ Run tests ========================================= #

TORCHRL_TEST_SUITE="${TORCHRL_TEST_SUITE:-all}" # all|distributed|nondistributed

export PYTORCH_TEST_WITH_SLOW='1'
python -m torch.utils.collect_env

@@ -269,27 +271,45 @@ pytest test/smoke_test_deps.py -v --durations 200 -k 'test_gym or test_dm_contro
# Track if any tests fail
EXIT_STATUS=0

# Run distributed tests first (GPU only) to surface errors early
if [ "${CU_VERSION:-}" != cpu ] ; then
run_distributed_tests() {
# Distributed tests are GPU-only in our CI.
if [ "${CU_VERSION:-}" == cpu ] ; then
echo "TORCHRL_TEST_SUITE=${TORCHRL_TEST_SUITE}: distributed tests require GPU (CU_VERSION != cpu)."
return 1
fi
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test/test_distributed.py \
--instafail --durations 200 -vv --capture no \
--timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
fi
--timeout=120 --mp_fork_if_no_cuda
}

# Run remaining tests (always run even if distributed tests failed)
if [ "${CU_VERSION:-}" != cpu ] ; then
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
--instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
--ignore test/test_distributed.py \
--ignore test/llm \
--timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
else
run_non_distributed_tests() {
# Note: we always ignore distributed tests here (they can be run in a separate job).
python .github/unittest/helpers/coverage_run_parallel.py -m pytest test \
--instafail --durations 200 -vv --capture no --ignore test/test_rlhf.py \
--ignore test/test_distributed.py \
--ignore test/llm \
--timeout=120 --mp_fork_if_no_cuda || EXIT_STATUS=$?
fi
--timeout=120 --mp_fork_if_no_cuda
}

case "${TORCHRL_TEST_SUITE}" in
all)
# Run distributed tests first (GPU only) to surface errors early, then the rest.
if [ "${CU_VERSION:-}" != cpu ] ; then
run_distributed_tests || EXIT_STATUS=$?
fi
run_non_distributed_tests || EXIT_STATUS=$?
;;
distributed)
run_distributed_tests || EXIT_STATUS=$?
;;
nondistributed)
run_non_distributed_tests || EXIT_STATUS=$?
;;
*)
echo "Unknown TORCHRL_TEST_SUITE='${TORCHRL_TEST_SUITE}'. Expected: all|distributed|nondistributed."
exit 2
;;
esac

# Fail the workflow if any tests failed
if [ $EXIT_STATUS -ne 0 ]; then
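
A minimal usage sketch of the new suite selector in run_all.sh (not part of the diff; the invocations and the CU_VERSION values are illustrative):

    # GPU job: run only the distributed tests.
    TORCHRL_TEST_SUITE=distributed CU_VERSION=cu130 bash .github/unittest/linux/scripts/run_all.sh

    # CPU job: run_distributed_tests is skipped even under the default "all".
    TORCHRL_TEST_SUITE=all CU_VERSION=cpu bash .github/unittest/linux/scripts/run_all.sh

    # Any other value hits the catch-all branch and exits with status 2.
    TORCHRL_TEST_SUITE=foo bash .github/unittest/linux/scripts/run_all.sh
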
82 changes: 82 additions & 0 deletions .github/workflows/test-linux.yml
@@ -82,6 +82,9 @@ jobs:
fi
export TD_GET_DEFAULTS_TO_NONE=1

# Run everything except distributed tests; those run in parallel in tests-gpu-distributed.
export TORCHRL_TEST_SUITE=nondistributed

# Remove the following line when the GPU tests are working inside docker, and uncomment the above lines
#export CU_VERSION="cpu"

@@ -91,6 +94,44 @@ jobs:
## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh

tests-gpu-distributed:
strategy:
matrix:
python_version: ["3.12"]
cuda_arch_version: ["13.0"]
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
repository: pytorch/rl
docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04"
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda_arch_version }}
timeout: 120
script: |
# Set env vars from matrix
export PYTHON_VERSION=${{ matrix.python_version }}
# Commenting these out for now because the GPU tests are not working inside docker
export CUDA_ARCH_VERSION=${{ matrix.cuda_arch_version }}
export CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"
if [[ "${{ github.ref }}" =~ release/* ]]; then
export RELEASE=1
export TORCH_VERSION=stable
else
export RELEASE=0
export TORCH_VERSION=nightly
fi
export TD_GET_DEFAULTS_TO_NONE=1

# Only distributed tests (runs in parallel with tests-gpu).
export TORCHRL_TEST_SUITE=distributed

echo "PYTHON_VERSION: $PYTHON_VERSION"
echo "CU_VERSION: $CU_VERSION"

## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh

tests-olddeps:
strategy:
matrix:
@@ -196,5 +237,46 @@ jobs:
echo "CU_VERSION: $CU_VERSION"
export TD_GET_DEFAULTS_TO_NONE=1

# Run everything except distributed tests; those run in parallel in tests-stable-gpu-distributed.
export TORCHRL_TEST_SUITE=nondistributed

## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh

tests-stable-gpu-distributed:
strategy:
matrix:
python_version: ["3.12"] # "3.9", "3.10", "3.11"
cuda_arch_version: ["13.0"] # "11.6", "11.7"
fail-fast: false
uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
with:
runner: linux.g5.4xlarge.nvidia.gpu
repository: pytorch/rl
docker-image: "nvidia/cuda:13.0.2-cudnn-devel-ubuntu24.04"
gpu-arch-type: cuda
gpu-arch-version: ${{ matrix.cuda_arch_version }}
timeout: 120
script: |
# Set env vars from matrix
export PYTHON_VERSION=${{ matrix.python_version }}
# Commenting these out for now because the GPU tests are not working inside docker
export CUDA_ARCH_VERSION=${{ matrix.cuda_arch_version }}
export CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"

if [[ "${{ github.ref }}" =~ release/* ]]; then
export RELEASE=1
export TORCH_VERSION=stable
else
export RELEASE=0
export TORCH_VERSION=nightly
fi

export TD_GET_DEFAULTS_TO_NONE=1
export TORCHRL_TEST_SUITE=distributed

echo "PYTHON_VERSION: $PYTHON_VERSION"
echo "CU_VERSION: $CU_VERSION"

## setup_env.sh
bash .github/unittest/linux/scripts/run_all.sh
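
For reference, the CU_VERSION derivation used in both new GPU jobs maps the matrix value to a CUDA tag via bash substring expansion; a quick sketch with an illustrative value, run outside CI:

    CUDA_ARCH_VERSION="13.0"
    # ${VAR:0:2} -> "13" (major version), ${VAR:3:1} -> "0" (first digit after the dot)
    CU_VERSION="cu${CUDA_ARCH_VERSION:0:2}${CUDA_ARCH_VERSION:3:1}"
    echo "$CU_VERSION"   # prints cu130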