
Commit 2d0bdb7

deps: Update vLLM to 0.8.3 (#1739)
1 parent 6e7d70f · commit 2d0bdb7

9 files changed: +51 additions, -27 deletions


configs/recipes/vision/llama3_2_vision/sft/11b_full/gcp_job.yaml

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,8 @@ envs:
 setup: |
   set -e
   pip install uv && uv pip install oumi[gpu] hf_transfer
+  # TODO: OPE-1329 - torch 2.6 causes this job to crash
+  uv pip install torch==2.5.0 torchvision==0.20.0
   # Install model from HF Hub. This tool increases download speed compared to
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Llama-3.2-11B-Vision-Instruct --exclude original/*

pyproject.toml

Lines changed: 4 additions & 4 deletions
@@ -64,11 +64,11 @@ dependencies = [
     "responses>=0.25.0,<0.26",
     "skypilot>=0.7.0,<0.8", # Used by launcher
     "tensorboard>=2.18.0,<2.19", # Optional, for monitoring training
-    "torch>=2.5.0,<2.6.0",
+    "torch>=2.6.0,<2.7.0",
     "torchao>=0.11.0,<0.12", # Used by transformers
     # torchdata 0.10 drops support for datapipes which we currently use
     "torchdata>=0.9.0,<0.10.0", # Used by data pipes loader
-    "torchvision>=0.20.0,<0.21", # Used by some VLM-s (multimodal)
+    "torchvision>=0.21.0,<0.22", # Used by some VLM-s (multimodal)
     "tqdm",
     # Llama Vision attention is broken as late as 4.48.2 if gradient checkpointing is
     # enabled. See OPE-875 and https://github.com/huggingface/transformers/issues/36040.
@@ -121,7 +121,7 @@ gpu = [
     # When updating verl version, make sure to also update the default config:
     # src/oumi/core/trainers/verl_trainer_config.yaml.
     "verl>=0.3.0,<0.4", # Used for the VERL_GRPO trainer.
-    "vllm>=0.7.3,<0.8.0", # For VLLMInferenceEngine
+    "vllm>=0.8.3,<0.9", # For VLLMInferenceEngine, and vLLM-powered GRPO training.
 ]

 # Targets for supported cloud providers
@@ -164,7 +164,7 @@ file_formats = ["pdf2image>=1.17.0,<1.18", "python-poppler>=0.4.1,<0.5"]
 # CI targets
 ci_cpu = [
     "oumi[dev,docs,gcp]",
-    "vllm>=0.7.3,<0.8.0", # For VLLMInferenceEngine
+    "vllm>=0.8.3,<0.9", # For VLLMInferenceEngine
     # This may fail to install. As a temporary workaround, run:
     # CMAKE_ARGS="-DLLAVA_BUILD=OFF" pip install -U llama-cpp-python
     "llama-cpp-python>=0.3.5,<0.4", # For LlamaCppInferenceEngine

src/oumi/builders/callbacks.py

Lines changed: 4 additions & 3 deletions
@@ -60,6 +60,7 @@ def build_training_callbacks(
     if not config.training.include_performance_metrics:
         return result

+    dtype = next(model.parameters()).dtype
     add_mfu_callbacks: bool = True
     if not torch.cuda.is_available():
         logger.warning("MFU logging is only supported on GPU. Skipping MFU callbacks.")
@@ -73,7 +74,7 @@ def build_training_callbacks(
     else:
         device_name = get_device_name()
         try:
-            _get_device_flops(device_name, model.dtype)
+            _get_device_flops(device_name, dtype)
         except NotImplementedError:
             logger.warning(
                 f"MFU logging is currently not supported for device {device_name}. "
@@ -93,7 +94,7 @@ def build_training_callbacks(
         # Ignore attention and rematerialization to ensure metric matches most
         # common implementations.
         mfu_callback = MfuTrainerCallback(
-            dtype=model.dtype,
+            dtype=dtype,
             num_params=num_mfu_params,
             sequence_length=config.model.model_max_length,
         )
@@ -112,7 +113,7 @@ def build_training_callbacks(
             TrainerType.HF,
         )
     ):
-        result.append(HfMfuTrainerCallback(dtype=model.dtype))
+        result.append(HfMfuTrainerCallback(dtype=dtype))

     if profiler is not None:
         result.append(ProfilerStepCallback(profiler=profiler))
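The dtype is now read once from the model's parameters instead of the `model.dtype` attribute, which only exists on `transformers.PreTrainedModel` subclasses; this is also why the unit tests further below switch from a bare `torch.nn.Module` with a monkey-patched `dtype` to a real `torch.nn.Sequential`. A minimal illustrative sketch (not part of the commit):

    import torch

    # A plain nn.Module has no `.dtype` attribute, but the dtype of its
    # parameters can always be inspected, which is what the callback builder
    # now relies on.
    model = torch.nn.Sequential(torch.nn.Linear(10, 10)).to(torch.bfloat16)
    dtype = next(model.parameters()).dtype
    print(dtype)  # torch.bfloat16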

src/oumi/builders/models.py

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ def build_oumi_model(
     return model


-def _disable_cache_in_model_config(model: nn.Module):
+def _disable_cache_in_model_config(model: transformers.PreTrainedModel) -> None:
     # Required for FSDP.
     # Context: https://github.com/huggingface/transformers/issues/28499
     model.config.use_cache = False

src/oumi/core/trainers/hf_trainer.py

Lines changed: 9 additions & 5 deletions
@@ -13,8 +13,9 @@
 # limitations under the License.

 import pathlib
-from typing import Optional
+from typing import Optional, cast

+import peft
 import transformers

 from oumi.core.configs import TrainingConfig
@@ -102,17 +103,20 @@ def _save_model(self, config: TrainingConfig, final: bool = True) -> None:
                 "attempting to delete during model saving."
             )

-            merged_model = self._hf_trainer.model.merge_and_unload(
-                progressbar=True, safe_merge=True
-            )
+            model = cast(peft.LoraModel, self._hf_trainer.model)
+            merged_model = model.merge_and_unload(progressbar=True, safe_merge=True)
+            merged_model = cast(transformers.PreTrainedModel, merged_model)
             merged_model.save_pretrained(output_dir)
         elif config.peft.peft_save_mode == PeftSaveMode.ADAPTER_ONLY:
             # Save the LoRA adapter (doesn't include the base model).
             self._hf_trainer.save_model(output_dir)
         elif config.peft.peft_save_mode == PeftSaveMode.ADAPTER_AND_BASE_MODEL:
             self._hf_trainer.save_model(output_dir)
             # Saving the base model requires a separate call.
-            self._hf_trainer.model.base_model.save_pretrained(output_dir)
+            model = cast(
+                transformers.PreTrainedModel, self._hf_trainer.model.base_model
+            )
+            model.save_pretrained(output_dir)
         else:
             raise ValueError(
                 f"Unsupported PEFT save mode: {config.peft.peft_save_mode}"

src/oumi/core/trainers/oumi_trainer.py

Lines changed: 7 additions & 0 deletions
@@ -30,6 +30,8 @@
 import torch.utils.tensorboard as tensorboard

 import mlflow # isort: skip
+import transformers
+
 import wandb # isort: skip
 from torch.distributed.checkpoint.state_dict import (
     StateDictOptions,
@@ -159,7 +161,12 @@ def __init__(
         # Prepare model for training
         # ----------------------------------
         if args.enable_gradient_checkpointing:
+            if not isinstance(model, transformers.PreTrainedModel):
+                raise ValueError(
+                    "Gradient checkpointing is only supported for transformers models."
+                )
             model.gradient_checkpointing_enable(args.gradient_checkpointing_kwargs)
+        model = cast(torch.nn.Module, model)
         model.to(self.device)
         if is_distributed():
             # Wrap model for distributed training

src/oumi/inference/native_text_inference_engine.py

Lines changed: 17 additions & 4 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import Optional, cast

 import PIL.Image
 import torch
@@ -56,7 +56,16 @@ def __init__(
         """
         super().__init__(model_params=model_params, generation_params=generation_params)

-        self._model = build_model(self._model_params)
+        self._model = cast(
+            transformers.PreTrainedModel, build_model(self._model_params)
+        )
+        if (
+            not hasattr(self._model, "generation_config")
+            or self._model.generation_config is None
+        ):
+            raise ValueError(
+                f"Model {self._model_params.model_name} requires a generation config."
+            )
         self._tokenizer = build_tokenizer(self._model_params)
         self._processor: Optional[BaseProcessor] = None

@@ -309,8 +318,12 @@ def _infer(
             disable=disable_tgdm,
         ):
             batch = input_batches[batch_index]
-            output_batch = self._model.generate(
-                **batch, generation_config=generation_config, tokenizer=self._tokenizer
+            output_batch: torch.LongTensor = self._model.generate(
+                # TODO: OPE-1328 - Fix type.
+                # type(batch) == BatchEncoding, but function expects a tensor.
+                **batch, # type: ignore
+                generation_config=generation_config,
+                tokenizer=self._tokenizer,
             )

             # For each batch, remove the prepended prompts from all model responses.
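The engine now validates up front that the built model carries a generation config, raising the error that the integration test below was updated to expect. A minimal sketch of the guard (the standalone function is illustrative, not part of the commit):

    from typing import Optional

    import transformers

    def require_generation_config(
        model: transformers.PreTrainedModel, model_name: str
    ) -> None:
        # Reject models whose generation_config is missing or None before any
        # generate() call is attempted.
        config: Optional[transformers.GenerationConfig] = getattr(
            model, "generation_config", None
        )
        if config is None:
            raise ValueError(f"Model {model_name} requires a generation config.")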

tests/integration/infer/test_native_text_inference_engine.py

Lines changed: 1 addition & 1 deletion
@@ -408,5 +408,5 @@ def test_unsupported_model_raises_error():
         tokenizer_pad_token="<|endoftext|>",
         load_pretrained_weights=False,
     )
-    with pytest.raises(ValueError, match="does not support generation"):
+    with pytest.raises(ValueError, match="requires a generation config"):
         NativeTextInferenceEngine(model_params)

tests/unit/builders/test_callbacks.py

Lines changed: 6 additions & 9 deletions
@@ -23,8 +23,7 @@ def test_build_training_callbacks_mfu_callback():
     config.training.include_performance_metrics = True
     config.data.train.pack = True
     config.model.model_max_length = 128
-    model = torch.nn.Module()
-    model.dtype = torch.bfloat16  # type: ignore
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         with patch("torch.cuda.get_device_name", return_value="NVIDIA A100-PCIE-40GB"):
             result = build_training_callbacks(config, model, None)
@@ -38,7 +37,7 @@
 def test_build_training_callbacks_no_cuda(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
-    model = torch.nn.Module()
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=False):
         result = build_training_callbacks(config, model, None)
     assert len(result) == 2
@@ -54,7 +53,7 @@ def test_build_training_callbacks_peft(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
     config.training.use_peft = True
-    model = torch.nn.Module()
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
    with patch("torch.cuda.is_available", return_value=True):
         result = build_training_callbacks(config, model, None)
     assert len(result) == 2
@@ -69,7 +68,7 @@ def test_build_training_callbacks_peft(mock_logger_warning):
 def test_build_training_callbacks_no_pack(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
-    model = torch.nn.Module()
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         result = build_training_callbacks(config, model, None)
     assert len(result) == 2
@@ -85,8 +84,7 @@ def test_build_training_callbacks_unknown_device_name(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
     config.data.train.pack = True
-    model = torch.nn.Module()
-    model.dtype = torch.bfloat16  # type: ignore
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         with patch("torch.cuda.get_device_name", return_value="Foo"):
             result = build_training_callbacks(config, model, None)
@@ -103,8 +101,7 @@ def test_build_training_callbacks_no_model_max_length(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
     config.data.train.pack = True
-    model = torch.nn.Module()
-    model.dtype = torch.bfloat16  # type: ignore
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         with patch("torch.cuda.get_device_name", return_value="NVIDIA A100-PCIE-40GB"):
             result = build_training_callbacks(config, model, None)
