From 707625f23a68e3aa780abfdceba455cbebc7a565 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Thu, 1 May 2025 19:10:55 +0200 Subject: [PATCH 01/14] build(requirements): require fsspec 2025.3.3 Cross-device transactions via fsspec (used for example in ModelCheckpoint) resulted in permission errors. The permission errors were caused by attempts to change file modes on different filesystem. This was fixed in fsspec 2025.3.3. Closes #20270 --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 3fe9168c48e11..67520f67991f9 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -fsspec[http] >=2022.5.0, <2025.4.0 +fsspec[http] >=2025.3.3, <2025.4.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 7bc20cec191d7..c952195cb4aa9 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,7 +4,7 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 -fsspec[http] >=2022.5.0, <2025.4.0 +fsspec[http] >=2025.3.3, <2025.4.0 torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 From 1790d986a88d0644e325701da2c1fbf0376bb26d Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Thu, 1 May 2025 20:10:36 +0200 Subject: [PATCH 02/14] docs: update changelog --- requirements/fabric/base.txt | 3 ++- requirements/pytorch/base.txt | 3 ++- src/lightning/fabric/CHANGELOG.md | 5 +++++ src/lightning/pytorch/CHANGELOG.md | 3 +-- 4 files changed, 10 insertions(+), 4 deletions(-) diff 
--git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 67520f67991f9..5ff496fefa2d7 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,8 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -fsspec[http] >=2025.3.3, <2025.4.0 +#fsspec[http] >=2025.3.3, <2025.4.0 +fsspec[http] @ git+https://github.com/fsspec/filesystem_spec packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index c952195cb4aa9..5dd41859e435a 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,7 +4,8 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 -fsspec[http] >=2025.3.3, <2025.4.0 +#fsspec[http] >=2025.3.3, <2025.4.0 +fsspec[http] @ git+https://github.com/fsspec/filesystem_spec torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index f67eec28deeeb..84c886a86b5cf 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -12,6 +12,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - +### Changed + +- Increased minimum `fsspec` version from 2022.5.0 to 2025.3.3 ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) + + ### Removed - diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 5616defeffc8a..666be9c5a1d34 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -14,8 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed -- - +- Increased minimum `fsspec` version from 2022.5.0 to 2025.3.3 ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) ### Removed From 75cecd8b52ea21806120a7eb09bed132d125fe12 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Fri, 2 May 2025 09:30:14 +0200 Subject: [PATCH 03/14] tests(test_ckpt_for_fsspec): test for cross-device transactions --- .../trainer/connectors/test_checkpoint_connector.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py index 722742a3ccae0..5baa55cbc3d21 100644 --- a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py @@ -11,6 +11,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import errno import os from unittest import mock from unittest.mock import ANY, Mock @@ -105,7 +106,7 @@ def test_hpc_max_ckpt_version(tmp_path): ) -def test_ckpt_for_fsspec(): +def test_ckpt_for_fsspec(tmpdir): """Test that the _CheckpointConnector is able to write to fsspec file systems.""" model = BoringModel() # hardcoding dir since `tmp_path` can be windows path @@ -118,6 +119,13 @@ def test_ckpt_for_fsspec(): trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_3.ckpt") trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_33.ckpt") + # Simulate the behavior of fsspec when writing to a local file system but other device. 
+ with ( + mock.patch("os.rename", side_effect=OSError(errno.EXDEV, "Invalid cross-device link")), + mock.patch("os.chmod", side_effect=PermissionError("Operation not permitted")), + ): + trainer.save_checkpoint(tmpdir + "/test_ckpt_for_fsspec/hpc_ckpt_18.ckpt") + assert trainer._checkpoint_connector._hpc_resume_path == "memory://test_ckpt_for_fsspec/hpc_ckpt_33.ckpt" assert ( trainer._checkpoint_connector._CheckpointConnector__max_ckpt_version_in_folder("memory://test_ckpt_for_fsspec") From 25ab178db3d83d4906fc16bb5c4d9c8d2142bc14 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Fri, 2 May 2025 09:30:56 +0200 Subject: [PATCH 04/14] docs(model_checkpoint): add note on cross-device save --- src/lightning/pytorch/callbacks/model_checkpoint.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index 85bfb65c0ea6e..85f6980fff3b3 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -155,6 +155,10 @@ class ModelCheckpoint(Checkpoint): If the checkpoint's ``dirpath`` changed from what it was before while resuming the training, only ``best_model_path`` will be reloaded and a warning will be issued. + If you provide a ``filename`` on a mounted device where changing permissions is not allowed (causing ``chmod`` + to raise ``PermissionError``), Lightning will catch the error, leave the file's permissions as-is, and still + save the checkpoint. 
+ Raises: MisconfigurationException: If ``save_top_k`` is smaller than ``-1``, From 8aaa2b7e34b2e97f7fa508f682ba6eeb141a7d03 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Fri, 2 May 2025 09:32:23 +0200 Subject: [PATCH 05/14] build(requirements): fix fsspec version --- requirements/fabric/base.txt | 3 +-- requirements/pytorch/base.txt | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 5ff496fefa2d7..67520f67991f9 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,8 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -#fsspec[http] >=2025.3.3, <2025.4.0 -fsspec[http] @ git+https://github.com/fsspec/filesystem_spec +fsspec[http] >=2025.3.3, <2025.4.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 5dd41859e435a..c952195cb4aa9 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,8 +4,7 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 -#fsspec[http] >=2025.3.3, <2025.4.0 -fsspec[http] @ git+https://github.com/fsspec/filesystem_spec +fsspec[http] >=2025.3.3, <2025.4.0 torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 From 33628fd1e1ea17b91435bb41cbaec5d79ad0288c Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Fri, 2 May 2025 09:47:20 +0200 Subject: [PATCH 06/14] Revert "build(requirements): fix fsspec version" This reverts commit 8aaa2b7e34b2e97f7fa508f682ba6eeb141a7d03. 
--- requirements/fabric/base.txt | 3 ++- requirements/pytorch/base.txt | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 67520f67991f9..5ff496fefa2d7 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,8 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -fsspec[http] >=2025.3.3, <2025.4.0 +#fsspec[http] >=2025.3.3, <2025.4.0 +fsspec[http] @ git+https://github.com/fsspec/filesystem_spec packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index c952195cb4aa9..5dd41859e435a 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,7 +4,8 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.67.0 PyYAML >=5.4, <6.1.0 -fsspec[http] >=2025.3.3, <2025.4.0 +#fsspec[http] >=2025.3.3, <2025.4.0 +fsspec[http] @ git+https://github.com/fsspec/filesystem_spec torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.11.0 From 38af2a058c9a34ac79c24f2341e370a6b48d4fe1 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Sun, 4 May 2025 20:15:56 +0200 Subject: [PATCH 07/14] docs(changelog): explain change more clearly --- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/pytorch/CHANGELOG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 84c886a86b5cf..08c9969d4b6d6 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed -- Increased minimum `fsspec` version from 2022.5.0 to 2025.3.3 ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) +- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) ### Removed diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 666be9c5a1d34..7d54fa1252566 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- Increased minimum `fsspec` version from 2022.5.0 to 2025.3.3 ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) +- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) ### Removed From 0de7ab30d9f78036b3abb99352098b6dc85a998b Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Tue, 20 May 2025 20:45:18 +0200 Subject: [PATCH 08/14] build(requirements): min/max version fsspec to 2025.5.0 --- requirements/fabric/base.txt | 3 +-- requirements/pytorch/base.txt | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 5dcacb8ac6431..34bd128af4163 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,8 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -#fsspec[http] >=2025.3.3, <2025.4.0 -fsspec[http] @ git+https://github.com/fsspec/filesystem_spec +fsspec[http] >=2025.5.0, <2025.5.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.14.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 5bfdb6d191e4b..38025976b5150 100644 --- 
a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,8 +4,7 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.68.0 PyYAML >=5.4, <6.1.0 -#fsspec[http] >=2025.3.3, <2025.4.0 -fsspec[http] @ git+https://github.com/fsspec/filesystem_spec +fsspec[http] >=2025.5.0, <2025.5.0 torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.14.0 From 81615079c3d66f188214ffa718772a2236c75eb6 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Tue, 20 May 2025 20:50:32 +0200 Subject: [PATCH 09/14] docs(changelog): move to fixed --- src/lightning/fabric/CHANGELOG.md | 7 ++++++- src/lightning/pytorch/CHANGELOG.md | 4 +++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 08c9969d4b6d6..f40e33d724c43 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) +- ### Removed @@ -22,6 +22,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - +### Fixed + +- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) + + --- ## [2.5.1] - 2025-03-18 diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 848d563e8daf9..596a4e85cd973 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). 
### Changed -- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) +- ### Removed @@ -25,6 +25,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed logger_connector has edge case where step can be a float ([#20692](https://github.com/Lightning-AI/pytorch-lightning/issues/20692)) +- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) + --- From 1d710df883fd1a13a52897faecff34c3b151681e Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Tue, 20 May 2025 20:53:05 +0200 Subject: [PATCH 10/14] build(requirements): allow fsspec 2025.5 in ci --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index 34bd128af4163..c046c12970724 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -fsspec[http] >=2025.5.0, <2025.5.0 +fsspec[http] >=2025.5.0, <=2025.5.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.14.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 38025976b5150..6c9e0838d77c7 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,7 +4,7 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.68.0 PyYAML >=5.4, <6.1.0 -fsspec[http] >=2025.5.0, <2025.5.0 +fsspec[http] >=2025.5.0, <=2025.5.0 torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.14.0 From 75cfa4aca1fb48c099d20165705f0b145980f262 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" 
<66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 May 2025 18:54:23 +0000 Subject: [PATCH 11/14] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/lightning/fabric/CHANGELOG.md | 2 +- src/lightning/pytorch/CHANGELOG.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index f40e33d724c43..2368f885d345b 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- ### Removed diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 596a4e85cd973..3fe97dd7513c4 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -14,7 +14,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). ### Changed -- +- ### Removed From cd55e0b8521f799c0bcf7b288b664cabac8b20ac Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Tue, 20 May 2025 20:59:49 +0200 Subject: [PATCH 12/14] docs(model_checkpoint): rephrase note --- src/lightning/pytorch/callbacks/model_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index 85f6980fff3b3..d9bef1635715f 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -156,8 +156,8 @@ class ModelCheckpoint(Checkpoint): only ``best_model_path`` will be reloaded and a warning will be issued. 
If you provide a ``filename`` on a mounted device where changing permissions is not allowed (causing ``chmod`` - to raise ``PermissionError``), Lightning will catch the error, leave the file's permissions as-is, and still - save the checkpoint. + to raise a ``PermissionError``), the error is cached, the file's permissions remain unchanged, and the + checkpoint is still saved. Raises: MisconfigurationException: From ecae0620c2694c81c76892c41a00d4b4b7a8f425 Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Mon, 26 May 2025 15:55:30 +0200 Subject: [PATCH 13/14] fix: move to new testfunc and assert error if fsspec<2025.5 --- requirements/fabric/base.txt | 2 +- requirements/pytorch/base.txt | 2 +- src/lightning/fabric/CHANGELOG.md | 10 ------ src/lightning/fabric/utilities/cloud_io.py | 15 +++++--- src/lightning/pytorch/CHANGELOG.md | 4 +-- .../pytorch/callbacks/model_checkpoint.py | 4 +-- .../connectors/test_checkpoint_connector.py | 36 ++++++++++++++----- 7 files changed, 45 insertions(+), 28 deletions(-) diff --git a/requirements/fabric/base.txt b/requirements/fabric/base.txt index c046c12970724..2561b2324b772 100644 --- a/requirements/fabric/base.txt +++ b/requirements/fabric/base.txt @@ -2,7 +2,7 @@ # in case you want to preserve/enforce restrictions on the latest compatible version, add "strict" as an in-line comment torch >=2.1.0, <2.8.0 -fsspec[http] >=2025.5.0, <=2025.5.0 +fsspec[http] >=2022.5.0, <2025.4.0 packaging >=20.0, <=25.0 typing-extensions >=4.4.0, <4.14.0 lightning-utilities >=0.10.0, <0.15.0 diff --git a/requirements/pytorch/base.txt b/requirements/pytorch/base.txt index 6c9e0838d77c7..00889581f6407 100644 --- a/requirements/pytorch/base.txt +++ b/requirements/pytorch/base.txt @@ -4,7 +4,7 @@ torch >=2.1.0, <2.8.0 tqdm >=4.57.0, <4.68.0 PyYAML >=5.4, <6.1.0 -fsspec[http] >=2025.5.0, <=2025.5.0 +fsspec[http] >=2022.5.0, <2025.4.0 torchmetrics >=0.7.0, <1.8.0 packaging >=20.0, <=25.0 typing-extensions 
>=4.4.0, <4.14.0 diff --git a/src/lightning/fabric/CHANGELOG.md b/src/lightning/fabric/CHANGELOG.md index 2368f885d345b..f67eec28deeeb 100644 --- a/src/lightning/fabric/CHANGELOG.md +++ b/src/lightning/fabric/CHANGELOG.md @@ -12,21 +12,11 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - -### Changed - -- - - ### Removed - -### Fixed - -- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) - - --- ## [2.5.1] - 2025-03-18 diff --git a/src/lightning/fabric/utilities/cloud_io.py b/src/lightning/fabric/utilities/cloud_io.py index 9d0a33afd0b77..637dfcd9b1671 100644 --- a/src/lightning/fabric/utilities/cloud_io.py +++ b/src/lightning/fabric/utilities/cloud_io.py @@ -13,6 +13,7 @@ # limitations under the License. """Utilities related to data saving/loading.""" +import errno import io import logging from pathlib import Path @@ -84,10 +85,16 @@ def _atomic_save(checkpoint: dict[str, Any], filepath: Union[str, Path]) -> None log.debug(f"Saving checkpoint: {filepath}") torch.save(checkpoint, bytesbuffer) - # We use a transaction here to avoid file corruption if the save gets interrupted - fs, urlpath = fsspec.core.url_to_fs(str(filepath)) - with fs.transaction, fs.open(urlpath, "wb") as f: - f.write(bytesbuffer.getvalue()) + try: + # We use a transaction here to avoid file corruption if the save gets interrupted + fs, urlpath = fsspec.core.url_to_fs(str(filepath)) + with fs.transaction, fs.open(urlpath, "wb") as f: + f.write(bytesbuffer.getvalue()) + except PermissionError as e: + if isinstance(e.__context__, OSError) and getattr(e.__context__, "errno", None) == errno.EXDEV: + raise RuntimeError( + 'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"', + ) from e def _is_object_storage(fs: AbstractFileSystem) -> bool: diff --git a/src/lightning/pytorch/CHANGELOG.md b/src/lightning/pytorch/CHANGELOG.md index 
3fe97dd7513c4..18ef679312a66 100644 --- a/src/lightning/pytorch/CHANGELOG.md +++ b/src/lightning/pytorch/CHANGELOG.md @@ -11,6 +11,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Add enable_autolog_hparams argument to Trainer ([#20593](https://github.com/Lightning-AI/pytorch-lightning/pull/20593)) +- For cross-device local checkpoints, instruct users to install `fsspec>=2025.5.0` if unavailable ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) + ### Changed @@ -25,8 +27,6 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/). - Fixed logger_connector has edge case where step can be a float ([#20692](https://github.com/Lightning-AI/pytorch-lightning/issues/20692)) -- Allow cross-device local checkpoints with `fsspec>=2025.5.0` ([#20780](https://github.com/Lightning-AI/pytorch-lightning/pull/20780)) - --- diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index d9bef1635715f..7ae3f9272bb6c 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -156,8 +156,8 @@ class ModelCheckpoint(Checkpoint): only ``best_model_path`` will be reloaded and a warning will be issued. If you provide a ``filename`` on a mounted device where changing permissions is not allowed (causing ``chmod`` - to raise a ``PermissionError``), the error is cached, the file's permissions remain unchanged, and the - checkpoint is still saved. + to raise a ``PermissionError``), install `fsspec>=2025.5.0`. Then the error is cached, the file's permissions + remain unchanged, and the checkpoint is still saved. Otherwise, no checkpoint will be saved and training stops. 
Raises: MisconfigurationException: diff --git a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py index 5baa55cbc3d21..662fd99d1b12c 100644 --- a/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py +++ b/tests/tests_pytorch/trainer/connectors/test_checkpoint_connector.py @@ -13,9 +13,11 @@ # limitations under the License. import errno import os +import re from unittest import mock from unittest.mock import ANY, Mock +import fsspec import pytest import torch @@ -106,25 +108,43 @@ def test_hpc_max_ckpt_version(tmp_path): ) -def test_ckpt_for_fsspec(tmpdir): - """Test that the _CheckpointConnector is able to write to fsspec file systems.""" +def test_local_cross_device_checkpoint(tmpdir): + """Test that the _CheckpointConnector can write local cross-device files or raises an error if fsspec<2025.5.0.""" model = BoringModel() # hardcoding dir since `tmp_path` can be windows path trainer = Trainer( default_root_dir="memory://test_ckpt_for_fsspec", limit_train_batches=1, limit_val_batches=1, max_epochs=1 ) trainer.fit(model) - trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt.ckpt") - trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_0.ckpt") - trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_3.ckpt") - trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_33.ckpt") - # Simulate the behavior of fsspec when writing to a local file system but other device. 
with ( mock.patch("os.rename", side_effect=OSError(errno.EXDEV, "Invalid cross-device link")), mock.patch("os.chmod", side_effect=PermissionError("Operation not permitted")), ): - trainer.save_checkpoint(tmpdir + "/test_ckpt_for_fsspec/hpc_ckpt_18.ckpt") + if fsspec.__version__ < "2025.5.0": + with pytest.raises( + RuntimeError, + match=re.escape( + 'Upgrade fsspec to enable cross-device local checkpoints: pip install "fsspec[http]>=2025.5.0"' + ), + ): + trainer.save_checkpoint(tmpdir + "/test_ckpt_for_fsspec/hpc_ckpt.ckpt") + else: + trainer.save_checkpoint(tmpdir + "/test_ckpt_for_fsspec/hpc_ckpt.ckpt") + + +def test_ckpt_for_fsspec(): + """Test that the _CheckpointConnector is able to write to fsspec file systems.""" + model = BoringModel() + # hardcoding dir since `tmp_path` can be windows path + trainer = Trainer( + default_root_dir="memory://test_ckpt_for_fsspec", limit_train_batches=1, limit_val_batches=1, max_epochs=1 + ) + trainer.fit(model) + trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt.ckpt") + trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_0.ckpt") + trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_3.ckpt") + trainer.save_checkpoint("memory://test_ckpt_for_fsspec/hpc_ckpt_33.ckpt") assert trainer._checkpoint_connector._hpc_resume_path == "memory://test_ckpt_for_fsspec/hpc_ckpt_33.ckpt" assert ( From 7666ac016885bfc50d341fde5a3dd85e2b70200b Mon Sep 17 00:00:00 2001 From: siemdejong <28396796+siemdejong@users.noreply.github.com> Date: Mon, 26 May 2025 16:58:42 +0200 Subject: [PATCH 14/14] docs: fix typo --- src/lightning/pytorch/callbacks/model_checkpoint.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lightning/pytorch/callbacks/model_checkpoint.py b/src/lightning/pytorch/callbacks/model_checkpoint.py index 7ae3f9272bb6c..6b7b2831a2e04 100644 --- a/src/lightning/pytorch/callbacks/model_checkpoint.py +++ b/src/lightning/pytorch/callbacks/model_checkpoint.py @@ -156,7 +156,7 @@ 
class ModelCheckpoint(Checkpoint): only ``best_model_path`` will be reloaded and a warning will be issued. If you provide a ``filename`` on a mounted device where changing permissions is not allowed (causing ``chmod`` - to raise a ``PermissionError``), install `fsspec>=2025.5.0`. Then the error is cached, the file's permissions + to raise a ``PermissionError``), install `fsspec>=2025.5.0`. Then the error is caught, the file's permissions remain unchanged, and the checkpoint is still saved. Otherwise, no checkpoint will be saved and training stops. Raises: