[bug] fix rank/local rank parsing for docker env (#1747)

oelachqar · web-flow · commit 6e7d70fd990e · 2025-06-09T21:05:45.000-07:00
diff --git a/src/oumi/core/distributed.py b/src/oumi/core/distributed.py
@@ -74,13 +74,30 @@ def _get_use_orig_params(config: TrainingConfig) -> bool:
 #
 # Process Info
 #
+def _parse_rank(rank: Optional[str]) -> int:
+    """Parses a rank environment variable value into a non-negative integer.
+
+    Unset, empty, or blank values and "-1" (the Accelerate launcher's "not set"
+    marker) yield 0. Surrounding whitespace is ignored, as `int()` would do.
+    Raises `ValueError` for anything else that is not a run of decimal digits.
+    """
+    text = rank.strip() if rank else ""
+    if not text or text == "-1":
+        return 0
+
+    # `isdecimal()`, unlike `isdigit()`, only accepts what `int()` can parse.
+    if not text.isdecimal():
+        raise ValueError(f"Rank must be a number. Actual: {rank}.")
+    return int(text)
+
+
 @functools.cache  # same as @cache added in Python 3.9
 def get_device_rank_info() -> DeviceRankInfo:
     """Returns device rank and world size."""
     world_size = int(os.environ.get("WORLD_SIZE", 1))
     if world_size <= 0:
         raise ValueError(f"WORLD_SIZE must be positive. Actual: {world_size}.")
-    rank = int(os.environ.get("RANK", 0))
+    rank = _parse_rank(os.environ.get("RANK"))
     if rank < 0 or rank >= world_size:
         raise ValueError(
             f"RANK must be within this range [0, {world_size}). Actual: {rank}."
@@ -94,7 +111,7 @@ def get_device_rank_info() -> DeviceRankInfo:
     # Per https://pytorch.org/docs/stable/elastic/run.html
     # NEVER hard code any assumptions about the stable-ness of ranks or
     # some correlation between RANK and LOCAL_RANK.
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    local_rank = _parse_rank(os.environ.get("LOCAL_RANK"))
     if local_rank < 0 or local_rank >= local_world_size:
         raise ValueError(
             f"LOCAL_RANK must be within this range [0, {local_world_size}). "
diff --git a/tests/unit/core/test_distributed.py b/tests/unit/core/test_distributed.py
@@ -15,6 +15,7 @@
 from oumi.core.configs.params.training_params import TrainingParams
 from oumi.core.distributed import (
     DeviceRankInfo,
+    _parse_rank,
     all_gather_object,
     estimate_dataloader_num_workers,
     get_accelerate_env_vars,
@@ -484,3 +485,33 @@ def test_prepare_accelerate_fsdp_run_override():
             "`EXISTING_VALUE`, overriding to new value `NO`."
         )
     assert env_vars == expected_env_vars
+
+
+@pytest.mark.parametrize(
+    ("rank_input", "expected"),
+    [("1", 1), ("5", 5), ("42", 42), ("100", 100), ("0", 0), ("-1", 0), ("  -1  ", 0)],
+)
+def test_parse_rank(rank_input, expected):
+    """Digit strings parse to their integer value; "-1" (even padded) maps to 0."""
+    assert expected == _parse_rank(rank_input)
+
+
+def test_parse_rank_invalid_non_digit():
+    """Check that strings containing non-digit characters are rejected.
+
+    Each input must raise a ValueError whose message echoes the raw value.
+    """
+    for bad_rank in ("abc", "1a", "a1"):
+        expected_message = rf"Rank must be a number\. Actual: {bad_rank}\."
+
+        with pytest.raises(ValueError, match=expected_message):
+            _parse_rank(bad_rank)
+
+
+def test_parse_rank_invalid_negative():
+    """Check that negative ranks other than the "-1" sentinel raise ValueError."""
+    for bad_rank in ("-2", "-10"):
+        expected_message = rf"Rank must be a number\. Actual: {bad_rank}\."
+
+        with pytest.raises(ValueError, match=expected_message):
+            _parse_rank(bad_rank)