25 commits
c18a2ba tests: fix skipif condition for `deepspeed` (Borda, Sep 10, 2025)
4c4ec25 split test_trainer_compiled_model (Borda, Sep 10, 2025)
7268bbc [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Sep 10, 2025)
8c9d7f5 test_trainer_compiled_model_deepspeed (Borda, Sep 10, 2025)
c87cb86 Merge branch 'fix/skipif-deepspeed' of https://github.com/Lightning-A… (Borda, Sep 10, 2025)
13383e0 [pre-commit.ci] auto fixes from pre-commit.com hooks (pre-commit-ci[bot], Sep 10, 2025)
898195c cuda-toolkit (Borda, Sep 10, 2025)
f4fab99 Merge branch 'fix/skipif-deepspeed' of https://github.com/Lightning-A… (Borda, Sep 10, 2025)
5979850 update (Borda, Sep 10, 2025)
c32023c --fix-missing (Borda, Sep 10, 2025)
d8ce97d devel (Borda, Sep 10, 2025)
52bb67d 15 (Borda, Sep 10, 2025)
fa3e058 Apply suggestion from @bhimrazy (SkafteNicki, Sep 11, 2025)
35de488 typo (Borda, Sep 11, 2025)
b0139a6 Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 11, 2025)
4de2b73 Merge branch 'master' into fix/skipif-deepspeed (ethanwharris, Sep 12, 2025)
5397750 Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 12, 2025)
9f18bde Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 15, 2025)
d79740a Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 15, 2025)
70084d7 Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 15, 2025)
fd23a0b Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 16, 2025)
902b04f Empty-Commit (Borda, Sep 17, 2025)
e29fb7a update torch.load to include weights_only parameter in deepspeed utility (deependujha, Sep 22, 2025)
c0cb287 Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 25, 2025)
fb27f6a Merge branch 'master' into fix/skipif-deepspeed (Borda, Sep 26, 2025)
9 changes: 7 additions & 2 deletions .azure/gpu-tests-fabric.yml
@@ -85,6 +85,7 @@ jobs:
displayName: "extend env. vars 4 future"

- bash: |
+ set -ex
echo $(DEVICES)
echo $CUDA_VISIBLE_DEVICES
echo $CUDA_VERSION_MM
@@ -96,6 +97,10 @@ jobs:
python --version
pip --version
pip list
+ # todo: rather use devel base image
+ apt-get update -qq --fix-missing
+ apt-get install -y cuda-toolkit
+ nvcc --version
displayName: "Image info & NVIDIA"

- bash: |
@@ -156,7 +161,7 @@ jobs:
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
workingDirectory: tests/
displayName: "Testing: fabric standard"
timeoutInMinutes: "10"
timeoutInMinutes: "15"

- bash: |
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
@@ -165,7 +170,7 @@
env:
PL_RUN_STANDALONE_TESTS: "1"
displayName: "Testing: fabric standalone"
timeoutInMinutes: "10"
timeoutInMinutes: "15"

- bash: |
python -m coverage report
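Context for the `apt-get` lines above: the CI base images are `runtime` CUDA images, which ship without `nvcc`, while DeepSpeed JIT-builds its CUDA extensions and needs the compiler. The `# todo` marks the install as a stopgap (the `.lightning` workflow files below take the other route and switch their base images from `runtime` to `devel`). A minimal Python sketch, not PR code, of the sanity check that the `nvcc --version` step performs:

```python
import shutil
import subprocess

# Sketch only: mirrors the `nvcc --version` check added in the CI step above.
def nvcc_available() -> bool:
    nvcc = shutil.which("nvcc")  # runtime-only CUDA images return None here
    if nvcc is None:
        return False
    result = subprocess.run([nvcc, "--version"], capture_output=True, check=False)
    return result.returncode == 0
```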
7 changes: 6 additions & 1 deletion .azure/gpu-tests-pytorch.yml
@@ -89,6 +89,7 @@ jobs:
displayName: "extend env. vars 4 future"

- bash: |
+ set -ex
echo $(DEVICES)
echo $CUDA_VISIBLE_DEVICES
echo $CUDA_VERSION_MM
@@ -100,6 +101,10 @@ jobs:
python --version
pip --version
pip list
+ # todo: rather use devel base image
+ apt-get update -qq --fix-missing
+ apt-get install -y cuda-toolkit
+ nvcc --version
displayName: "Image info & NVIDIA"

- bash: |
@@ -194,7 +199,7 @@ jobs:
env:
PL_USE_MOCKED_MNIST: "1"
displayName: "Testing: PyTorch standalone tasks"
timeoutInMinutes: "10"
timeoutInMinutes: "15"

- bash: |
python -m coverage report
6 changes: 3 additions & 3 deletions .lightning/workflows/fabric.yml
@@ -9,17 +9,17 @@ parametrize:
matrix: {}
include:
# note that this also sets oldest requirements which are linked to Python == 3.10
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
PACKAGE_NAME: "fabric"
python_version: "3.10"
machine: "A100_X_2"
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
PACKAGE_NAME: "fabric"
python_version: "3.12"
machine: "L4_X_2"
# - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
# PACKAGE_NAME: "fabric"
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
PACKAGE_NAME: "lightning"
python_version: "3.12"
machine: "L4_X_2"
6 changes: 3 additions & 3 deletions .lightning/workflows/pytorch.yml
@@ -9,17 +9,17 @@ parametrize:
matrix: {}
include:
# note that this also sets oldest requirements which are linked to Python == 3.10
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
PACKAGE_NAME: "pytorch"
python_version: "3.10"
machine: "A100_X_2"
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
PACKAGE_NAME: "pytorch"
python_version: "3.12"
machine: "L4_X_2"
# - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
# PACKAGE_NAME: "pytorch"
- image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
PACKAGE_NAME: "lightning"
python_version: "3.12"
machine: "L4_X_2"
2 changes: 1 addition & 1 deletion src/lightning/fabric/utilities/testing/_runif.py
@@ -113,7 +113,7 @@ def _runif_reasons(
reasons.append("Standalone execution")
kwargs["standalone"] = True

if deepspeed and not (_DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4):
if deepspeed and not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4):
reasons.append("Deepspeed")

if dynamo:
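The one-word fix above inverts the torch version guard: previously the skip reason fired whenever torch >= 2.4, even with DeepSpeed installed, so the DeepSpeed tests were skipped on every supported setup. A standalone sketch (not PR code; the names mirror the flags in `_runif_reasons`) with both flags True, i.e. DeepSpeed installed and torch >= 2.4:

```python
_DEEPSPEED_AVAILABLE = True
_TORCH_GREATER_EQUAL_2_4 = True

# old guard: the inner `not` flips the version check, so the "Deepspeed"
# skip reason is appended exactly where the test should run
skip_old = not (_DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4)

# new guard: skip only when DeepSpeed is missing or torch is too old
skip_new = not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4)

assert skip_old is True and skip_new is False
```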
68 changes: 48 additions & 20 deletions tests/tests_pytorch/utilities/test_compile.py
@@ -13,14 +13,12 @@
# limitations under the License.
import os
import sys
- from contextlib import nullcontext
from unittest import mock

import pytest
import torch
from lightning_utilities.core.imports import RequirementCache

- from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_4
+ from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled
@@ -34,7 +32,7 @@
@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
@RunIf(dynamo=True, deepspeed=True)
@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
- def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
+ def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0):
    trainer_kwargs = {
        "default_root_dir": tmp_path,
        "fast_dev_run": True,
@@ -69,22 +67,52 @@ def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
    assert trainer.model._compiler_ctx is None

    # some strategies do not support it
-     if RequirementCache("deepspeed"):
-         compiled_model = torch.compile(model)
-         mock_cuda_count(monkeypatch, 2)
-
-         # TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import
-         warn_context = (
-             pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated")
-             if _TORCH_GREATER_EQUAL_2_4
-             else nullcontext()
-         )
-
-         with warn_context:
-             trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)
-
-         with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
-             trainer.fit(compiled_model)
+     compiled_model = torch.compile(model)
+     mock_cuda_count(monkeypatch, 2)
+
+     trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)
+
+     with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
+         trainer.fit(compiled_model)


+ # https://github.com/pytorch/pytorch/issues/95708
+ @pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
+ @RunIf(dynamo=True)
+ @mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
+ def test_trainer_compiled_model_ddp(_, tmp_path, monkeypatch, mps_count_0):
+     trainer_kwargs = {
+         "default_root_dir": tmp_path,
+         "fast_dev_run": True,
+         "logger": False,
+         "enable_checkpointing": False,
+         "enable_model_summary": False,
+         "enable_progress_bar": False,
+     }
+
+     model = BoringModel()
+     compiled_model = torch.compile(model)
+     assert model._compiler_ctx is compiled_model._compiler_ctx  # shared reference
+
+     # can train with compiled model
+     trainer = Trainer(**trainer_kwargs)
+     trainer.fit(compiled_model)
+     assert trainer.model._compiler_ctx["compiler"] == "dynamo"
+
+     # the compiled model can be uncompiled
+     to_uncompiled_model = to_uncompiled(compiled_model)
+     assert model._compiler_ctx is None
+     assert compiled_model._compiler_ctx is None
+     assert to_uncompiled_model._compiler_ctx is None
+
+     # the compiled model needs to be passed
+     with pytest.raises(ValueError, match="required to be a compiled LightningModule"):
+         to_uncompiled(to_uncompiled_model)
+
+     # the uncompiled model can be fitted
+     trainer = Trainer(**trainer_kwargs)
+     trainer.fit(model)
+     assert trainer.model._compiler_ctx is None
+
    # ddp does
    trainer = Trainer(strategy="ddp", **trainer_kwargs)
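Both halves of the split share the same compile/uncompile round trip and differ only in the strategy-specific tail. A standalone sketch of that round trip, using only symbols the test module already imports; it mirrors the assertions above, so treat it as illustration rather than additional coverage:

```python
import torch
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.utilities.compile import to_uncompiled

model = BoringModel()
compiled = torch.compile(model)
assert model._compiler_ctx is compiled._compiler_ctx  # shared reference

plain = to_uncompiled(compiled)
# uncompiling clears the compiler context on every handle
assert model._compiler_ctx is None
assert compiled._compiler_ctx is None
assert plain._compiler_ctx is None
```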