Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
271 changes: 99 additions & 172 deletions .buildkite/rllib.rayci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,75 +24,67 @@ steps:
tags: cibase

# tests
- label: ":brain: rllib: algorithm, model and others"
- label: ":brain: rllib: component testing"
tags: rllib_directly
parallelism: 4
instance_type: large
commands:
# All tests to with tags for components without gpu or multi_gpu tags
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--except-tags learning_tests,memory_leak_tests,examples,tests_dir,documentation,multi_gpu,no_cpu,torch_2.x_only_benchmark,manual
--only-tags env,evaluation,models,offline,policy,utils,algorithms,callbacks,core
--except-tags gpu,multi_gpu,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
depends_on: rllibbuild

- label: ":brain: rllib: learning tests pytorch"
tags: rllib
parallelism: 5
instance_type: large
commands:
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--only-tags fake_gpus,learning_tests_discrete,crashing_cartpole,stateless_cartpole,learning_tests_continuous
--except-tags tf_only,tf2_only,gpu,multi_gpu,learning_tests_pytorch_use_all_core
--test-arg --framework=torch
--build-name rllibbuild-py3.10
--python-version 3.10
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
--only-tags learning_tests_pytorch_use_all_core
--except-tags tf_only,tf2_only,gpu,multi_gpu
--test-arg --framework=torch
--skip-ray-installation
--build-name rllibbuild-py3.10
--python-version 3.10
depends_on: rllibbuild

- label: ":brain: rllib: examples"
tags: rllib
parallelism: 6
instance_type: large
commands:
# Tests all examples without gpu, multi_gpu or examples_use_all_cores tag
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 2
--only-tags examples
--except-tags multi_gpu,gpu,examples_use_all_core
--except-tags gpu,multi_gpu,manual,examples_use_all_core
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
# Tests all examples without gpu or multi_gpu
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
--only-tags examples_use_all_core
--skip-ray-installation
--except-tags multi_gpu,gpu
--except-tags gpu,multi_gpu,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
depends_on: rllibbuild

- label: ":brain: rllib: tests dir"
tags: rllib_directly
parallelism: 2
- label: ":brain: rllib: learning tests"
tags: rllib
parallelism: 5
instance_type: large
commands:
# learning tests without a gpu
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}" --parallelism-per-worker 3
--only-tags tests_dir
--except-tags multi_gpu,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--only-tags learning_tests
--except-tags gpu,multi_gpu,learning_tests_use_all_core,manual
--test-arg --framework=torch
--build-name rllibbuild-py3.10
--python-version 3.10
# learning tests without a gpu but use all cores
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
--only-tags learning_tests_use_all_core
--except-tags gpu,multi_gpu,manual
--test-arg --framework=torch
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
depends_on: rllibbuild

- label: ":brain: rllib: gpu tests"
Expand All @@ -103,60 +95,35 @@ steps:
parallelism: 5
instance_type: gpu
commands:
# All gpu tagged tests
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
--only-tags gpu
--except-tags multi_gpu,manual
--test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--test-env=RLLIB_NUM_GPUS=1
--build-name rllibgpubuild-py3.10
--python-version 3.10
depends_on: rllibgpubuild

- label: ":brain: rllib: data tests"
- label: ":brain: rllib: multi-gpu tests"
tags:
- data
- rllib
- disabled # Tests of this tag do not exist any more.
instance_type: large
- rllib_gpu
- gpu
- skip-on-microcheck
parallelism: 5
instance_type: gpu-large
commands:
# learning tests pytorch
# All multi-gpu tagged tests
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--parallelism-per-worker 3
--only-tags learning_tests_with_ray_data
--except-tags multi_gpu,gpu,tf_only,tf2_only
--test-arg --framework=torch
--build-name rllibbuild-py3.10
--python-version 3.10

# rllib unittests
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--parallelism-per-worker 3
--only-tags ray_data
--except-tags learning_tests_with_ray_data,multi_gpu,gpu
--skip-ray-installation
--build-name rllibbuild-py3.10
--python-version 3.10
# reuse the same docker image as the previous run
depends_on: rllibbuild

- label: ":brain: rllib: benchmarks"
tags: rllib
instance_type: medium
commands:
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --only-tags torch_2.x_only_benchmark
--build-name rllibbuild-py3.10
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
--parallelism-per-worker 2
--gpus 4
--only-tags multi_gpu
--except-tags manual
--build-name rllibgpubuild-py3.10
--python-version 3.10
depends_on: rllibbuild

# - label: ":brain: rllib: memory leak pytorch tests"
# tags: rllib
# instance_type: medium
# commands:
# - bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
# --only-tags memory_leak_tests
# --except-tags flaky
# --test-arg --framework=torch
# depends_on: rllibbuild
depends_on: rllibgpubuild

- label: ":brain: rllib: doc tests"
tags:
Expand All @@ -167,59 +134,79 @@ steps:
commands:
# doc tests
- bazel run //ci/ray_ci:test_in_docker -- python/ray/... //doc/... rllib
--except-tags gpu
--only-tags doctest
--except-tags gpu,manual
--parallelism-per-worker 2
--build-name rllibbuild-py3.10
--python-version 3.10
# doc examples
- bazel run //ci/ray_ci:test_in_docker -- //doc/... rllib
--except-tags gpu,post_wheel_build,timeseries_libs,doctest
--parallelism-per-worker 2
--skip-ray-installation
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
# documentation test
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--only-tags documentation
--parallelism-per-worker 2
--skip-ray-installation
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
depends_on: rllibbuild

- label: ":brain: rllib: multi-gpu tests"
- label: ":brain: rllib: flaky component & examples tests"
key: rllib_flaky_tests_02
tags:
- rllib_gpu
- gpu
- skip-on-microcheck
parallelism: 5
instance_type: gpu-large
- rllib
- rllib_flaky
- skip-on-premerge
instance_type: large
commands:
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib
--workers "$${BUILDKITE_PARALLEL_JOB_COUNT}" --worker-id "$${BUILDKITE_PARALLEL_JOB}"
--parallelism-per-worker 2
--gpus 4
--only-tags multi_gpu
--build-name rllibgpubuild-py3.10
# flaky components
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--only-tags env,evaluation,models,offline,policy,utils,algorithms,callbacks,core
--except-tags learning_tests,examples,documentation,gpu,multi_gpu,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
depends_on: rllibgpubuild

- label: ":brain: rllib: flaky multi-gpu tests"
key: rllib_flaky_multi_gpu_tests
# flaky examples
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--only-tags examples
--except-tags multi_gpu,gpu,manual,examples_use_all_core
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run

# flaky examples use all core
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--only-tags examples_use_all_core
--except-tags gpu,multi_gpu,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
depends_on: rllibbuild
soft_fail: true

- label: ":brain: rllib: flaky learning tests"
key: rllib_flaky_tests_01
tags:
- rllib_gpu
- gpu
- rllib
- rllib_flaky
- skip-on-premerge
instance_type: gpu-large
instance_type: large
commands:
# torch
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
--parallelism-per-worker 2
--gpus 4
--only-tags multi_gpu
--build-name rllibgpubuild-py3.10
--only-tags learning_tests
--except-tags gpu,multi_gpu,manual
--test-arg --framework=torch
--build-name rllibbuild-py3.10
--python-version 3.10
depends_on: rllibgpubuild
depends_on: rllibbuild
soft_fail: true

- label: ":brain: rllib: flaky gpu tests"
Expand All @@ -233,89 +220,29 @@ steps:
commands:
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
--only-tags gpu
--except-tags multi_gpu,manual
--test-env=RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--test-env=RLLIB_NUM_GPUS=1
--build-name rllibgpubuild-py3.10
--python-version 3.10
depends_on: rllibgpubuild
soft_fail: true

- label: ":brain: rllib: flaky tests (learning tests)"
key: rllib_flaky_tests_01
- label: ":brain: rllib: flaky multi-gpu tests"
key: rllib_flaky_multi_gpu_tests
tags:
- rllib
- rllib_gpu
- gpu
- rllib_flaky
- skip-on-premerge
instance_type: large
instance_type: gpu-large
commands:
# torch
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
--only-tags fake_gpus,learning_tests_discrete,learning_tests_with_ray_data,crashing_cartpole,stateless_cartpole,learning_tests_continuous
--except-tags tf_only,tf2_only,multi_gpu,gpu
--test-arg --framework=torch
--build-name rllibbuild-py3.10
--python-version 3.10

# tf2-static-graph
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
--only-tags tf_only
--except-tags torch_only,tf2_only,no_tf_static_graph,multi_gpu,gpu
--test-arg --framework=tf
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
# tf2-eager-tracing
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests
--only-tags tf2_only
--except-tags fake_gpus,torch_only,multi_gpu,no_tf_eager_tracing,gpu
--test-arg --framework=tf2
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
depends_on: rllibbuild
soft_fail: true

- label: ":brain: rllib: flaky tests (examples/rlmodule/models/tests_dir)"
key: rllib_flaky_tests_02
tags:
- rllib
- rllib_flaky
- skip-on-premerge
instance_type: large
commands:
# examples
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--only-tags examples
--except-tags multi_gpu,gpu
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10

# rlmodule tests
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--only-tags rlm
--except-tags multi_gpu,gpu
--test-env RLLIB_ENABLE_RL_MODULE=1
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run

# algorithm, models
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--except-tags learning_tests,memory_leak_tests,examples,tests_dir,documentation,multi_gpu,gpu,no_cpu,torch_2.x_only_benchmark,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run

# tests/ dir
- bazel run //ci/ray_ci:test_in_docker -- //rllib/... rllib --run-flaky-tests --parallelism-per-worker 3
--only-tags tests_dir
--except-tags multi_gpu,gpu,manual
--test-env RAY_USE_MULTIPROCESSING_CPU_COUNT=1
--build-name rllibbuild-py3.10
--parallelism-per-worker 2
--gpus 4
--only-tags multi_gpu
--except-tags manual
--build-name rllibgpubuild-py3.10
--python-version 3.10
--skip-ray-installation # reuse the same docker image as the previous run
depends_on: rllibbuild
depends_on: rllibgpubuild
soft_fail: true
Loading