Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
707625f
build(requirements): require fsspec 2025.3.3
siemdejong May 1, 2025
1790d98
docs: update changelog
siemdejong May 1, 2025
75cecd8
tests(test_ckpt_for_fsspec): test for cross-device transactions
siemdejong May 2, 2025
25ab178
docs(model_checkpoint): add note on cross-device save
siemdejong May 2, 2025
8aaa2b7
build(requirements): fix fsspec version
siemdejong May 2, 2025
33628fd
Revert "build(requirements): fix fsspec version"
siemdejong May 2, 2025
38af2a0
docs(changelog): explain change more clearly
siemdejong May 4, 2025
cb8c8d6
Merge branch 'master' into bugfix/20270_invalid-cross-device-link
siemdejong May 14, 2025
0de7ab3
build(requirements): min/max version fsspec to 2025.5.0
siemdejong May 20, 2025
46bb277
Merge branch 'master' into bugfix/20270_invalid-cross-device-link
siemdejong May 20, 2025
8161507
docs(changelog): move to fixed
siemdejong May 20, 2025
1d710df
build(requirements): allow fsspec 2025.5 in ci
siemdejong May 20, 2025
75cfa4a
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] May 20, 2025
cd55e0b
docs(model_checkpoint): rephrase note
siemdejong May 20, 2025
d844bf6
Merge branch 'bugfix/20270_invalid-cross-device-link' of https://gith…
siemdejong May 20, 2025
e7d5f91
Merge branch 'master' into bugfix/20270_invalid-cross-device-link
siemdejong May 26, 2025
ecae062
fix: move to new testfunc and assert error if fsspec<2025.5
siemdejong May 26, 2025
7666ac0
docs: fix typo
siemdejong May 26, 2025
b46dbf1
Merge branch 'master' into bugfix/20270_invalid-cross-device-link
siemdejong May 27, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions src/lightning/fabric/utilities/cloud_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# limitations under the License.
"""Utilities related to data saving/loading."""

import errno
import io
import logging
from pathlib import Path
Expand Down Expand Up @@ -84,10 +85,16 @@ def _atomic_save(checkpoint: dict[str, Any], filepath: Union[str, Path]) -> None
log.debug(f"Saving checkpoint: {filepath}")
torch.save(checkpoint, bytesbuffer)

# We use a transaction here to avoid file corruption if the save gets interrupted
fs, urlpath = fsspec.core.url_to_fs(str(filepath))
with fs.transaction, fs.open(urlpath, "wb") as f:
f.write(bytesbuffer.getvalue())
try:
# We use a transaction here to avoid file corruption if the save gets interrupted
fs, urlpath = fsspec.core.url_to_fs(str(filepath))
with fs.transaction, fs.open(urlpath, "wb") as f:
f.write(bytesbuffer.getvalue())
except PermissionError as e:
if isinstance(e.__context__, OSError) and getattr(e.__context__, "errno", None) == errno.EXDEV:
raise RuntimeError(
'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"',
) from e


def _is_object_storage(fs: AbstractFileSystem) -> bool:
Expand Down
3 changes: 2 additions & 1 deletion src/lightning/pytorch/CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).

- Add enable_autolog_hparams argument to Trainer ([#20593](https://github.com/Lightning-AI/pytorch-lightning/pull/20593))

- For cross-device local checkpoints, instruct users to install `fsspec>=2025.5.0` if an older version is installed ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780))


### Changed

-


### Removed

-
Expand Down
4 changes: 4 additions & 0 deletions src/lightning/pytorch/callbacks/model_checkpoint.py
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,10 @@ class ModelCheckpoint(Checkpoint):
If the checkpoint's ``dirpath`` changed from what it was before while resuming the training,
only ``best_model_path`` will be reloaded and a warning will be issued.

If you provide a ``filename`` on a mounted device where changing permissions is not allowed (causing ``chmod``
to raise a ``PermissionError``), install ``fsspec>=2025.5.0``. With that version, the error is caught, the file's
permissions remain unchanged, and the checkpoint is still saved. With an older version, no checkpoint is saved
and training stops.

Raises:
MisconfigurationException:
If ``save_top_k`` is smaller than ``-1``,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,13 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import errno
import os
import re
from unittest import mock
from unittest.mock import ANY, Mock

import fsspec
import pytest
import torch

Expand Down Expand Up @@ -105,6 +108,31 @@ def test_hpc_max_ckpt_version(tmp_path):
)


def test_local_cross_device_checkpoint(tmpdir):
    """Test that the _CheckpointConnector can write local cross-device files or raises an error if fsspec<2025.5.0.

    ``os.rename`` is patched to fail with ``EXDEV`` and ``os.chmod`` to fail with ``PermissionError`` to simulate
    saving to a local path on another device. With ``fsspec<2025.5`` the save is expected to raise a ``RuntimeError``
    asking the user to upgrade; with newer versions the save is expected to succeed.

    """
    model = BoringModel()
    # hardcoding dir since `tmp_path` can be windows path
    trainer = Trainer(
        default_root_dir="memory://test_ckpt_for_fsspec", limit_train_batches=1, limit_val_batches=1, max_epochs=1
    )
    trainer.fit(model)

    ckpt_path = tmpdir + "/test_ckpt_for_fsspec/hpc_ckpt.ckpt"

    # Compare the year/month components numerically. A plain string comparison is lexicographic and would
    # treat e.g. "2025.10.0" as older than "2025.5.0".
    installed_release = tuple(int(part) for part in re.findall(r"\d+", fsspec.__version__)[:2])
    needs_upgrade = installed_release < (2025, 5)

    # Simulate the behavior of fsspec when writing to a local file system that lives on another device.
    with (
        mock.patch("os.rename", side_effect=OSError(errno.EXDEV, "Invalid cross-device link")),
        mock.patch("os.chmod", side_effect=PermissionError("Operation not permitted")),
    ):
        if needs_upgrade:
            with pytest.raises(
                RuntimeError,
                match=re.escape(
                    'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"'
                ),
            ):
                trainer.save_checkpoint(ckpt_path)
        else:
            trainer.save_checkpoint(ckpt_path)


def test_ckpt_for_fsspec():
"""Test that the _CheckpointConnector is able to write to fsspec file systems."""
model = BoringModel()
Expand Down
Loading