Merged
57 commits
c18a2ba
tests: fix skipif condition for `deepspeed`
Borda Sep 10, 2025
4c4ec25
split test_trainer_compiled_model
Borda Sep 10, 2025
7268bbc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 10, 2025
8c9d7f5
test_trainer_compiled_model_deepspeed
Borda Sep 10, 2025
c87cb86
Merge branch 'fix/skipif-deepspeed' of https://github.com/Lightning-A…
Borda Sep 10, 2025
13383e0
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Sep 10, 2025
898195c
cuda-toolkit
Borda Sep 10, 2025
f4fab99
Merge branch 'fix/skipif-deepspeed' of https://github.com/Lightning-A…
Borda Sep 10, 2025
5979850
update
Borda Sep 10, 2025
c32023c
--fix-missing
Borda Sep 10, 2025
d8ce97d
devel
Borda Sep 10, 2025
52bb67d
15
Borda Sep 10, 2025
fa3e058
Apply suggestion from @bhimrazy
SkafteNicki Sep 11, 2025
35de488
typo
Borda Sep 11, 2025
b0139a6
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 11, 2025
4de2b73
Merge branch 'master' into fix/skipif-deepspeed
ethanwharris Sep 12, 2025
5397750
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 12, 2025
9f18bde
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 15, 2025
d79740a
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 15, 2025
70084d7
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 15, 2025
fd23a0b
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 16, 2025
902b04f
Empty-Commit
Borda Sep 17, 2025
e29fb7a
update torch.load to include weights_only parameter in deepspeed utility
deependujha Sep 22, 2025
c0cb287
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 25, 2025
fb27f6a
Merge branch 'master' into fix/skipif-deepspeed
Borda Sep 26, 2025
e99244c
Merge branch 'master' into fix/skipif-deepspeed
Borda Oct 2, 2025
244fb5e
Merge branch 'master' into fix/skipif-deepspeed
deependujha Oct 4, 2025
41f9d3a
Merge branch 'master' into fix/skipif-deepspeed
deependujha Oct 17, 2025
b729558
Merge branch 'master' into fix/skipif-deepspeed
t-vi Oct 17, 2025
921e112
add huggingface_hub as a dependency
deependujha Oct 17, 2025
ca584e0
update
deependujha Oct 17, 2025
f500faf
update
deependujha Oct 17, 2025
66fcf38
fte
deependujha Oct 17, 2025
152bfe4
double or nothing
deependujha Oct 17, 2025
2a8e19e
mistakenly deleted notedbook submodule
deependujha Oct 17, 2025
4f274e2
meow
deependujha Oct 17, 2025
282d6cf
borrow code from pr:21239
deependujha Oct 18, 2025
07f9433
update
deependujha Oct 18, 2025
4294c89
undo chnages
deependujha Oct 20, 2025
76ca330
update
deependujha Oct 20, 2025
ede7e3a
Merge branch 'master' into fix/skipif-deepspeed
Borda Oct 20, 2025
8c54ae9
gc: devel
Borda Oct 21, 2025
f056ec7
Empty-Commit
Borda Oct 21, 2025
aa4156c
update
deependujha Oct 21, 2025
d4b3920
update
deependujha Oct 21, 2025
3d2e157
use deterministic ports
deependujha Oct 21, 2025
cb28a14
Merge branch 'master' into fix/skipif-deepspeed
deependujha Oct 21, 2025
edec91b
update
deependujha Oct 21, 2025
d5b21b3
update
deependujha Oct 21, 2025
772d50c
cleanup artifacts
deependujha Oct 21, 2025
03dd693
update
deependujha Oct 21, 2025
2b1e31e
ho ja bhai
deependujha Oct 21, 2025
ea345de
update
deependujha Oct 21, 2025
0a8cb95
Merge branch 'master' into fix/skipif-deepspeed
deependujha Oct 21, 2025
a2ae7ce
update
deependujha Oct 21, 2025
64f87ba
remove changes
deependujha Oct 23, 2025
69cb738
Merge branch 'master' into fix/skipif-deepspeed
deependujha Oct 23, 2025
9 changes: 7 additions & 2 deletions .azure/gpu-tests-fabric.yml
@@ -85,6 +85,7 @@ jobs:
displayName: "extend env. vars 4 future"

- bash: |
set -ex
echo $(DEVICES)
echo $CUDA_VISIBLE_DEVICES
echo $CUDA_VERSION_MM
@@ -96,6 +97,10 @@ jobs:
python --version
pip --version
pip list
# todo: rather use devel base image
apt-get update -qq --fix-missing
apt-get install -y cuda-toolkit
nvcc --version
displayName: "Image info & NVIDIA"

- bash: |
@@ -156,7 +161,7 @@ jobs:
- bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest tests_fabric/ -v --durations=50
workingDirectory: tests/
displayName: "Testing: fabric standard"
timeoutInMinutes: "10"
timeoutInMinutes: "15"

- bash: |
wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/run_standalone_tests.sh
@@ -165,7 +170,7 @@ jobs:
env:
PL_RUN_STANDALONE_TESTS: "1"
displayName: "Testing: fabric standalone"
timeoutInMinutes: "10"
timeoutInMinutes: "15"

- bash: |
python -m coverage report
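For context on the `cuda-toolkit` step added above: the CUDA runtime base images ship without `nvcc`, which DeepSpeed relies on when it JIT-compiles its extensions, hence the "todo: rather use devel base image" note. A minimal sketch, in Python, of a guard a test setup could use to confirm the compiler is actually usable (the helper name is illustrative and not part of this PR):

import shutil
import subprocess


def nvcc_available() -> bool:
    """Return True if the CUDA compiler driver is on PATH and runs."""
    nvcc = shutil.which("nvcc")
    if nvcc is None:
        return False
    result = subprocess.run([nvcc, "--version"], capture_output=True, text=True, check=False)
    return result.returncode == 0


if __name__ == "__main__":
    print("nvcc available:", nvcc_available())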
7 changes: 6 additions & 1 deletion .azure/gpu-tests-pytorch.yml
@@ -84,6 +84,7 @@ jobs:
displayName: "extend env. vars 4 future"
- bash: |
set -ex
echo $(DEVICES)
echo $CUDA_VISIBLE_DEVICES
echo $CUDA_VERSION_MM
@@ -95,6 +96,10 @@ jobs:
python --version
pip --version
pip list
# todo: rather use devel base image
apt-get update -qq --fix-missing
apt-get install -y cuda-toolkit
nvcc --version
displayName: "Image info & NVIDIA"
- bash: |
@@ -189,7 +194,7 @@ jobs:
env:
PL_USE_MOCKED_MNIST: "1"
displayName: "Testing: PyTorch standalone tasks"
timeoutInMinutes: "10"
timeoutInMinutes: "15"

- bash: |
python -m coverage report
4 changes: 2 additions & 2 deletions .github/checkgroup.yml
@@ -47,7 +47,7 @@ subprojects:
- "!*.md"
- "!**/*.md"
checks:
- "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, pytorch, 3.10)"
- "pytorch.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, pytorch, 3.10)"
- "pytorch.yml / Lit Job (lightning, 3.12)"
- "pytorch.yml / Lit Job (pytorch, 3.12)"

@@ -148,7 +148,7 @@ subprojects:
- "!*.md"
- "!**/*.md"
checks:
- "fabric.yml / Lit Job (nvidia/cuda:12.1.1-runtime-ubuntu22.04, fabric, 3.10)"
- "fabric.yml / Lit Job (nvidia/cuda:12.1.1-devel-ubuntu22.04, fabric, 3.10)"
- "fabric.yml / Lit Job (fabric, 3.12)"
- "fabric.yml / Lit Job (lightning, 3.12)"

8 changes: 4 additions & 4 deletions .lightning/workflows/fabric.yml
@@ -6,18 +6,18 @@ trigger:

timeout: "60" # minutes
machine: "L4_X_2"
image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
parametrize:
matrix: {}
include:
# note that this is setting also all oldest requirements which is linked to python == 3.10
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
PACKAGE_NAME: "fabric"
python_version: "3.10"
- PACKAGE_NAME: "fabric"
python_version: "3.12"
# - image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
# PACKAGE_NAME: "fabric"
#- image: "nvidia/cuda:12.6-runtime-ubuntu22.04"
# PACKAGE_NAME: "fabric"
- PACKAGE_NAME: "lightning"
python_version: "3.12"
exclude: []
8 changes: 4 additions & 4 deletions .lightning/workflows/pytorch.yml
@@ -6,18 +6,18 @@ trigger:

timeout: "60" # minutes
machine: "L4_X_2"
image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
parametrize:
matrix: {}
include:
# note that this also sets oldest requirements which are linked to Python == 3.10
- image: "nvidia/cuda:12.1.1-runtime-ubuntu22.04"
- image: "nvidia/cuda:12.1.1-devel-ubuntu22.04"
PACKAGE_NAME: "pytorch"
python_version: "3.10"
- PACKAGE_NAME: "pytorch"
python_version: "3.12"
# - image: "nvidia/cuda:12.6.3-runtime-ubuntu22.04"
# PACKAGE_NAME: "pytorch"
#- image: "nvidia/cuda:12.6.3-devel-ubuntu22.04"
# PACKAGE_NAME: "pytorch"
- PACKAGE_NAME: "lightning"
python_version: "3.12"
exclude: []
1 change: 1 addition & 0 deletions requirements/fabric/test.txt
@@ -9,3 +9,4 @@ pytest-random-order ==1.2.0
click ==8.1.8; python_version < "3.11"
click ==8.3.0; python_version > "3.10"
tensorboardX >=2.6, <2.7.0 # todo: relax it back to `>=2.2` after fixing tests
huggingface-hub
1 change: 1 addition & 0 deletions requirements/pytorch/test.txt
@@ -21,3 +21,4 @@ uvicorn # for `ServableModuleValidator` # not setting version as re-defined in
tensorboard >=2.11, <2.21.0 # for `TensorBoardLogger`

torch-tensorrt; platform_system == "Linux" and python_version >= "3.12"
huggingface-hub
2 changes: 1 addition & 1 deletion src/lightning/fabric/utilities/testing/_runif.py
@@ -113,7 +113,7 @@ def _runif_reasons(
reasons.append("Standalone execution")
kwargs["standalone"] = True

if deepspeed and not (_DEEPSPEED_AVAILABLE and not _TORCH_GREATER_EQUAL_2_4):
if deepspeed and not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4):
reasons.append("Deepspeed")

if dynamo:
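The one-line fix above flips the Torch-version half of the guard: previously, DeepSpeed-marked tests were skipped whenever torch >= 2.4 was installed, even with DeepSpeed present; now they run on torch >= 2.4 and are skipped when DeepSpeed is missing or torch is older. A minimal standalone sketch of the corrected logic (the flag values are illustrative stand-ins for the real import-time constants):

_DEEPSPEED_AVAILABLE = True
_TORCH_GREATER_EQUAL_2_4 = True


def deepspeed_skip_reasons(deepspeed: bool) -> list:
    reasons = []
    # Skip unless DeepSpeed is installed *and* torch >= 2.4 (the fixed condition).
    if deepspeed and not (_DEEPSPEED_AVAILABLE and _TORCH_GREATER_EQUAL_2_4):
        reasons.append("Deepspeed")
    return reasons


assert deepspeed_skip_reasons(deepspeed=True) == []  # both requirements met, so no skip reason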
3 changes: 3 additions & 0 deletions tests/tests_fabric/strategies/test_deepspeed_integration.py
@@ -312,6 +312,9 @@ def _assert_saved_model_is_equal(fabric, model, checkpoint_path):
single_ckpt_path = checkpoint_path / "single_model.pt"
# the tag is hardcoded in DeepSpeedStrategy
convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path, tag="checkpoint")

is_ckpt_path_a_file = os.path.isfile(single_ckpt_path)
single_ckpt_path = single_ckpt_path if is_ckpt_path_a_file else single_ckpt_path / "pytorch_model.bin"
state_dict = torch.load(single_ckpt_path, weights_only=False)
else:
# 'checkpoint' is the tag, hardcoded in DeepSpeedStrategy
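The three added lines account for newer DeepSpeed releases in which `convert_zero_checkpoint_to_fp32_state_dict` may write a directory containing `pytorch_model.bin` rather than a single file. A small sketch of the same path-resolution pattern, using only the standard library (the function name is illustrative):

import os


def resolve_converted_checkpoint(path: str) -> str:
    # Older conversions emit a single checkpoint file; newer ones may emit a
    # directory that holds `pytorch_model.bin`.
    return path if os.path.isfile(path) else os.path.join(path, "pytorch_model.bin")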
5 changes: 4 additions & 1 deletion tests/tests_pytorch/strategies/test_deepspeed.py
@@ -313,7 +313,7 @@ def on_train_start(self, trainer, pl_module) -> None:
trainer.fit(model)
trainer.test(model)
assert list(lr_monitor.lrs) == ["lr-SGD"]
assert len(set(lr_monitor.lrs["lr-SGD"])) == 8
assert len(lr_monitor.lrs["lr-SGD"]) == 8


@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True)
@@ -1029,6 +1029,9 @@ def _assert_save_model_is_equal(model, tmp_path, trainer):
if trainer.is_global_zero:
single_ckpt_path = os.path.join(tmp_path, "single_model.pt")
convert_zero_checkpoint_to_fp32_state_dict(checkpoint_path, single_ckpt_path)

if not os.path.isfile(single_ckpt_path):
single_ckpt_path = os.path.join(single_ckpt_path, "pytorch_model.bin")
state_dict = torch.load(single_ckpt_path, weights_only=False)

model = model.cpu()
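Two notes on this file: the bottom hunk applies the same file-or-directory checkpoint handling as the Fabric test above, and the assertion near the top now counts logged learning-rate values rather than distinct ones, since a scheduler may legitimately repeat a value between steps. A tiny illustration with made-up numbers:

# Hypothetical logged values; consecutive entries can repeat.
lrs = [0.1, 0.1, 0.09, 0.09, 0.08, 0.08, 0.07, 0.07]

assert len(lrs) == 8       # new check: eight values were recorded
assert len(set(lrs)) == 4  # the distinct count may be smaller, so `len(set(...)) == 8` was too strict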
68 changes: 48 additions & 20 deletions tests/tests_pytorch/utilities/test_compile.py
@@ -13,14 +13,12 @@
# limitations under the License.
import os
import sys
from contextlib import nullcontext
from unittest import mock

import pytest
import torch
from lightning_utilities.core.imports import RequirementCache

from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2, _TORCH_GREATER_EQUAL_2_4
from lightning.fabric.utilities.imports import _TORCH_GREATER_EQUAL_2_2
from lightning.pytorch import LightningModule, Trainer
from lightning.pytorch.demos.boring_classes import BoringModel
from lightning.pytorch.utilities.compile import from_compiled, to_uncompiled
@@ -34,7 +32,7 @@
@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
@RunIf(dynamo=True, deepspeed=True)
@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
def test_trainer_compiled_model_deepspeed(_, tmp_path, monkeypatch, mps_count_0):
trainer_kwargs = {
"default_root_dir": tmp_path,
"fast_dev_run": True,
@@ -69,22 +67,52 @@ def test_trainer_compiled_model(_, tmp_path, monkeypatch, mps_count_0):
assert trainer.model._compiler_ctx is None

# some strategies do not support it
if RequirementCache("deepspeed"):
compiled_model = torch.compile(model)
mock_cuda_count(monkeypatch, 2)

# TODO: Update deepspeed to avoid deprecation warning for `torch.cuda.amp.custom_fwd` on import
warn_context = (
pytest.warns(FutureWarning, match="torch.cuda.amp.*is deprecated")
if _TORCH_GREATER_EQUAL_2_4
else nullcontext()
)

with warn_context:
trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)

with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
trainer.fit(compiled_model)
compiled_model = torch.compile(model)
mock_cuda_count(monkeypatch, 2)

trainer = Trainer(strategy="deepspeed", accelerator="cuda", **trainer_kwargs)

with pytest.raises(RuntimeError, match="Using a compiled model is incompatible with the current strategy.*"):
trainer.fit(compiled_model)


# https://github.com/pytorch/pytorch/issues/95708
@pytest.mark.skipif(sys.platform == "darwin", reason="fatal error: 'omp.h' file not found")
@RunIf(dynamo=True)
@mock.patch("lightning.pytorch.trainer.call._call_and_handle_interrupt")
def test_trainer_compiled_model_ddp(_, tmp_path, monkeypatch, mps_count_0):
trainer_kwargs = {
"default_root_dir": tmp_path,
"fast_dev_run": True,
"logger": False,
"enable_checkpointing": False,
"enable_model_summary": False,
"enable_progress_bar": False,
}

model = BoringModel()
compiled_model = torch.compile(model)
assert model._compiler_ctx is compiled_model._compiler_ctx # shared reference

# can train with compiled model
trainer = Trainer(**trainer_kwargs)
trainer.fit(compiled_model)
assert trainer.model._compiler_ctx["compiler"] == "dynamo"

# the compiled model can be uncompiled
to_uncompiled_model = to_uncompiled(compiled_model)
assert model._compiler_ctx is None
assert compiled_model._compiler_ctx is None
assert to_uncompiled_model._compiler_ctx is None

# the compiled model needs to be passed
with pytest.raises(ValueError, match="required to be a compiled LightningModule"):
to_uncompiled(to_uncompiled_model)

# the uncompiled model can be fitted
trainer = Trainer(**trainer_kwargs)
trainer.fit(model)
assert trainer.model._compiler_ctx is None

# ddp does
trainer = Trainer(strategy="ddp", **trainer_kwargs)
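Splitting the original test into `test_trainer_compiled_model_deepspeed` and `test_trainer_compiled_model_ddp` means the DDP assertions no longer sit behind the `deepspeed=True` requirement, so they still run in environments without DeepSpeed. A minimal sketch of how such independent gating behaves with plain pytest markers (the flags are illustrative and not the project's actual `RunIf` machinery):

import pytest

DYNAMO_AVAILABLE = True
DEEPSPEED_AVAILABLE = False  # e.g. an environment without DeepSpeed installed


@pytest.mark.skipif(not (DYNAMO_AVAILABLE and DEEPSPEED_AVAILABLE), reason="Deepspeed")
def test_compiled_model_deepspeed():
    ...  # skipped in this environment


@pytest.mark.skipif(not DYNAMO_AVAILABLE, reason="dynamo")
def test_compiled_model_ddp():
    ...  # still runs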
@@ -57,7 +57,7 @@ def on_fit_start(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -
trainer.fit(model)


@RunIf(min_cuda_gpus=1, deepspeed=True, rich=True)
@RunIf(min_cuda_gpus=1, standalone=True, deepspeed=True, rich=True)
@mock.patch("rich.table.Table.add_row", autospec=True)
def test_deepspeed_summary_with_rich_model_summary(mock_table_add_row, tmp_path):
from lightning.pytorch.callbacks import RichModelSummary