
Commit 0abd3f6

clee2000 authored and pytorchmergebot committed
[CI] Reduce CI_SERIAL_LIST list (#124085)
- Add a serial marker for individual tests so the test file can be removed from the CI serial list
- Run serial-marked tests first, in serial
- Run all other tests afterwards, in parallel
- Slowly reduce the list and mark individual tests as serial instead
- Hope the number of serial tests stays small so sharding evenness doesn't get too messed up
- Hopefully can do 3 procs for sm86 and cpu?
- serial no longer looks like a real word to me

Pull Request resolved: pytorch/pytorch#124085
Approved by: https://github.com/seemethere, https://github.com/malfet
1 parent 946b50c commit 0abd3f6

File tree

    pytest.ini
    test/inductor/test_torchinductor.py
    test/run_test.py
    torch/testing/_internal/common_utils.py

4 files changed: +42 -10 lines

pytest.ini

+3
@@ -19,3 +19,6 @@ filterwarnings =
     ignore:Module already imported so cannot be rewritten.*hypothesis:pytest.PytestAssertRewriteWarning
 
 xfail_strict = True
+
+markers =
+    serial: marks tests as needs to be run serially (deselect with '-m "not serial"')
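
For reference, a minimal sketch (hypothetical file and test names, not part of this commit) of how the registered serial mark is applied and selected:

    # test_example.py -- hypothetical module, only to illustrate the registered mark
    import pytest

    @pytest.mark.serial          # a known mark thanks to the pytest.ini registration above
    def test_memory_hungry():
        ...

    def test_small():
        ...

    # Equivalent of the command-line selection, driven from Python:
    #   pytest.main(["test_example.py", "-m", "serial"])       # only test_memory_hungry
    #   pytest.main(["test_example.py", "-m", "not serial"])   # only test_small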

test/inductor/test_torchinductor.py

+2
@@ -70,6 +70,7 @@
     IS_WINDOWS,
     IS_X86,
     parametrize,
+    serialTest,
     skipIfRocm,
     subtest,
     TEST_WITH_ASAN,
@@ -9278,6 +9279,7 @@ def fn(tensor, index, source):
     @config.patch(
         "triton.autotune_pointwise", True
     )  # needed to introduce config that exceed max shared memory usage
+    @serialTest()
     def test_large_block_sizes(self):
         """
         Inductor will try triton configs like x = 64 and y = 1024 which will

test/run_test.py

+23 -10

@@ -246,9 +246,6 @@ def __contains__(self, item):
     "test_module_hooks",  # OOM
     "inductor/test_max_autotune",
     "inductor/test_cutlass_backend",  # slow due to many nvcc compilation steps
-    "inductor/test_torchinductor",  # OOM on test_large_block_sizes
-    "inductor/test_torchinductor_dynamic_shapes",  # OOM on test_large_block_sizes
-    "inductor/test_torchinductor_codegen_dynamic_shapes",  # OOM on test_large_block_sizes
     "test_profiler",  # test_source_multithreaded is probably not compatible with parallelism
 ]
 # A subset of onnx tests that cannot run in parallel due to high memory usage.
@@ -1591,6 +1588,11 @@ def parallel_test_completion_callback(failure):
         ):
             pool.terminate()
 
+    keep_going_message = (
+        "\n\nTip: You can keep running tests even on failure by passing --keep-going to run_test.py.\n"
+        "If running on CI, add the 'keep-going' label to your PR and rerun your jobs."
+    )
+
     try:
         for test in selected_tests_serial:
             options_clone = copy.deepcopy(options)
@@ -1603,19 +1605,29 @@ def parallel_test_completion_callback(failure):
                 and not options.continue_through_error
                 and not RERUN_DISABLED_TESTS
             ):
-                raise RuntimeError(
-                    failure.message
-                    + "\n\nTip: You can keep running tests even on failure by "
-                    "passing --keep-going to run_test.py.\n"
-                    "If running on CI, add the 'keep-going' label to "
-                    "your PR and rerun your jobs."
-                )
+                raise RuntimeError(failure.message + keep_going_message)
+
+        # Run tests marked as serial first
+        for test in selected_tests_parallel:
+            options_clone = copy.deepcopy(options)
+            if can_run_in_pytest(test):
+                options_clone.pytest = True
+            options_clone.additional_unittest_args.extend(["-m", "serial"])
+            failure = run_test_module(test, test_directory, options_clone)
+            test_failed = handle_error_messages(failure)
+            if (
+                test_failed
+                and not options.continue_through_error
+                and not RERUN_DISABLED_TESTS
+            ):
+                raise RuntimeError(failure.message + keep_going_message)
 
         os.environ["NUM_PARALLEL_PROCS"] = str(NUM_PROCS)
         for test in selected_tests_parallel:
            options_clone = copy.deepcopy(options)
            if can_run_in_pytest(test):
                options_clone.pytest = True
+            options_clone.additional_unittest_args.extend(["-m", "not serial"])
            pool.apply_async(
                run_test_module,
                args=(test, test_directory, options_clone),
@@ -1718,6 +1730,7 @@ def __str__(self):
     if IS_CI:
         gen_ci_artifact([x.to_json() for x in include], [x.to_json() for x in exclude])
 
+    print_to_stderr(f"Running parallel tests on {NUM_PROCS} processes")
     print_to_stderr(test_batch)
     print_to_stderr(test_batch_exclude)

torch/testing/_internal/common_utils.py

+14
@@ -97,6 +97,11 @@
 import torch.utils._pytree as pytree
 
 from .composite_compliance import no_dispatch
+try:
+    import pytest
+    has_pytest = True
+except ImportError:
+    has_pytest = False
 
 
 # Class to keep track of test flags configurable by environment variables.
@@ -1384,6 +1389,15 @@ def wrapper(*args, **kwargs):
 
     return decorator
 
+def serialTest(condition=True):
+    """
+    Decorator for running tests serially. Requires pytest
+    """
+    def decorator(fn):
+        if has_pytest and condition:
+            return pytest.mark.serial(fn)
+        return fn
+    return decorator
 
 def unMarkDynamoStrictTest(cls=None):
     def decorator(cls):
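
The diff above only uses the default form, @serialTest(); a hypothetical sketch of the condition argument (assuming the existing TEST_WITH_ROCM flag in common_utils, which this commit does not touch):

    # Hypothetical usage of serialTest's condition argument; not part of this commit.
    from torch.testing._internal.common_utils import TestCase, TEST_WITH_ROCM, serialTest

    class ExampleTests(TestCase):
        @serialTest()                # always carries the serial mark (the form used above)
        def test_always_serial(self):
            ...

        @serialTest(TEST_WITH_ROCM)  # marked serial only when the condition is truthy
        def test_serial_on_rocm_only(self):
            ...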
