
Commit 2d0bdb7

deps: Update vLLM to 0.8.3 (#1739)
1 parent 6e7d70f · commit 2d0bdb7

9 files changed: +51 additions, -27 deletions


configs/recipes/vision/llama3_2_vision/sft/11b_full/gcp_job.yaml

Lines changed: 2 additions & 0 deletions
@@ -44,6 +44,8 @@ envs:
 setup: |
   set -e
   pip install uv && uv pip install oumi[gpu] hf_transfer
+  # TODO: OPE-1329 - torch 2.6 causes this job to crash
+  uv pip install torch==2.5.0 torchvision==0.20.0
   # Install model from HF Hub. This tool increases download speed compared to
   # downloading the model during training.
   HF_HUB_ENABLE_HF_TRANSFER=1 huggingface-cli download meta-llama/Llama-3.2-11B-Vision-Instruct --exclude original/*

pyproject.toml

Lines changed: 4 additions & 4 deletions
@@ -64,11 +64,11 @@ dependencies = [
     "responses>=0.25.0,<0.26",
     "skypilot>=0.7.0,<0.8", # Used by launcher
     "tensorboard>=2.18.0,<2.19", # Optional, for monitoring training
-    "torch>=2.5.0,<2.6.0",
+    "torch>=2.6.0,<2.7.0",
     "torchao>=0.11.0,<0.12", # Used by transformers
     # torchdata 0.10 drops support for datapipes which we currently use
     "torchdata>=0.9.0,<0.10.0", # Used by data pipes loader
-    "torchvision>=0.20.0,<0.21", # Used by some VLM-s (multimodal)
+    "torchvision>=0.21.0,<0.22", # Used by some VLM-s (multimodal)
     "tqdm",
     # Llama Vision attention is broken as late as 4.48.2 if gradient checkpointing is
     # enabled. See OPE-875 and https://github.com/huggingface/transformers/issues/36040.
@@ -121,7 +121,7 @@ gpu = [
     # When updating verl version, make sure to also update the default config:
     # src/oumi/core/trainers/verl_trainer_config.yaml.
     "verl>=0.3.0,<0.4", # Used for the VERL_GRPO trainer.
-    "vllm>=0.7.3,<0.8.0", # For VLLMInferenceEngine
+    "vllm>=0.8.3,<0.9", # For VLLMInferenceEngine, and vLLM-powered GRPO training.
 ]

 # Targets for supported cloud providers
@@ -164,7 +164,7 @@ file_formats = ["pdf2image>=1.17.0,<1.18", "python-poppler>=0.4.1,<0.5"]
 # CI targets
 ci_cpu = [
     "oumi[dev,docs,gcp]",
-    "vllm>=0.7.3,<0.8.0", # For VLLMInferenceEngine
+    "vllm>=0.8.3,<0.9", # For VLLMInferenceEngine
     # This may fail to install. As a temporary workaround, run:
     # CMAKE_ARGS="-DLLAVA_BUILD=OFF" pip install -U llama-cpp-python
     "llama-cpp-python>=0.3.5,<0.4", # For LlamaCppInferenceEngine

src/oumi/builders/callbacks.py

Lines changed: 4 additions & 3 deletions
@@ -60,6 +60,7 @@ def build_training_callbacks(
     if not config.training.include_performance_metrics:
         return result

+    dtype = next(model.parameters()).dtype
     add_mfu_callbacks: bool = True
     if not torch.cuda.is_available():
         logger.warning("MFU logging is only supported on GPU. Skipping MFU callbacks.")
@@ -73,7 +74,7 @@ def build_training_callbacks(
     else:
         device_name = get_device_name()
         try:
-            _get_device_flops(device_name, model.dtype)
+            _get_device_flops(device_name, dtype)
         except NotImplementedError:
             logger.warning(
                 f"MFU logging is currently not supported for device {device_name}. "
@@ -93,7 +94,7 @@ def build_training_callbacks(
         # Ignore attention and rematerialization to ensure metric matches most
         # common implementations.
         mfu_callback = MfuTrainerCallback(
-            dtype=model.dtype,
+            dtype=dtype,
             num_params=num_mfu_params,
             sequence_length=config.model.model_max_length,
         )
@@ -112,7 +113,7 @@ def build_training_callbacks(
             TrainerType.HF,
         )
     ):
-        result.append(HfMfuTrainerCallback(dtype=model.dtype))
+        result.append(HfMfuTrainerCallback(dtype=dtype))

     if profiler is not None:
         result.append(ProfilerStepCallback(profiler=profiler))
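The dtype is now read once from the model's parameters instead of the `model.dtype` attribute, which only exists on `transformers.PreTrainedModel` subclasses; this is also why the unit tests further below switch from a bare `torch.nn.Module` with a monkey-patched `dtype` to a real `torch.nn.Sequential`. A minimal illustrative sketch (not part of the commit):

    import torch

    # A plain nn.Module has no `.dtype` attribute, but the dtype of its
    # parameters can always be inspected, which is what the callback builder
    # now relies on.
    model = torch.nn.Sequential(torch.nn.Linear(10, 10)).to(torch.bfloat16)
    dtype = next(model.parameters()).dtype
    print(dtype)  # torch.bfloat16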

src/oumi/builders/models.py

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ def build_oumi_model(
     return model


-def _disable_cache_in_model_config(model: nn.Module):
+def _disable_cache_in_model_config(model: transformers.PreTrainedModel) -> None:
     # Required for FSDP.
     # Context: https://github.com/huggingface/transformers/issues/28499
     model.config.use_cache = False

src/oumi/core/trainers/hf_trainer.py

Lines changed: 9 additions & 5 deletions
@@ -13,8 +13,9 @@
 # limitations under the License.

 import pathlib
-from typing import Optional
+from typing import Optional, cast

+import peft
 import transformers

 from oumi.core.configs import TrainingConfig
@@ -102,17 +103,20 @@ def _save_model(self, config: TrainingConfig, final: bool = True) -> None:
                 "attempting to delete during model saving."
             )

-            merged_model = self._hf_trainer.model.merge_and_unload(
-                progressbar=True, safe_merge=True
-            )
+            model = cast(peft.LoraModel, self._hf_trainer.model)
+            merged_model = model.merge_and_unload(progressbar=True, safe_merge=True)
+            merged_model = cast(transformers.PreTrainedModel, merged_model)
             merged_model.save_pretrained(output_dir)
         elif config.peft.peft_save_mode == PeftSaveMode.ADAPTER_ONLY:
             # Save the LoRA adapter (doesn't include the base model).
             self._hf_trainer.save_model(output_dir)
         elif config.peft.peft_save_mode == PeftSaveMode.ADAPTER_AND_BASE_MODEL:
             self._hf_trainer.save_model(output_dir)
             # Saving the base model requires a separate call.
-            self._hf_trainer.model.base_model.save_pretrained(output_dir)
+            model = cast(
+                transformers.PreTrainedModel, self._hf_trainer.model.base_model
+            )
+            model.save_pretrained(output_dir)
         else:
             raise ValueError(
                 f"Unsupported PEFT save mode: {config.peft.peft_save_mode}"

src/oumi/core/trainers/oumi_trainer.py

Lines changed: 7 additions & 0 deletions
@@ -30,6 +30,8 @@
 import torch.utils.tensorboard as tensorboard

 import mlflow # isort: skip
+import transformers
+
 import wandb # isort: skip
 from torch.distributed.checkpoint.state_dict import (
     StateDictOptions,
@@ -159,7 +161,12 @@ def __init__(
         # Prepare model for training
         # ----------------------------------
         if args.enable_gradient_checkpointing:
+            if not isinstance(model, transformers.PreTrainedModel):
+                raise ValueError(
+                    "Gradient checkpointing is only supported for transformers models."
+                )
             model.gradient_checkpointing_enable(args.gradient_checkpointing_kwargs)
+        model = cast(torch.nn.Module, model)
         model.to(self.device)
         if is_distributed():
             # Wrap model for distributed training

src/oumi/inference/native_text_inference_engine.py

Lines changed: 17 additions & 4 deletions
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from typing import Optional
+from typing import Optional, cast

 import PIL.Image
 import torch
@@ -56,7 +56,16 @@ def __init__(
         """
         super().__init__(model_params=model_params, generation_params=generation_params)

-        self._model = build_model(self._model_params)
+        self._model = cast(
+            transformers.PreTrainedModel, build_model(self._model_params)
+        )
+        if (
+            not hasattr(self._model, "generation_config")
+            or self._model.generation_config is None
+        ):
+            raise ValueError(
+                f"Model {self._model_params.model_name} requires a generation config."
+            )
         self._tokenizer = build_tokenizer(self._model_params)
         self._processor: Optional[BaseProcessor] = None

@@ -309,8 +318,12 @@ def _infer(
             disable=disable_tgdm,
         ):
             batch = input_batches[batch_index]
-            output_batch = self._model.generate(
-                **batch, generation_config=generation_config, tokenizer=self._tokenizer
+            output_batch: torch.LongTensor = self._model.generate(
+                # TODO: OPE-1328 - Fix type.
+                # type(batch) == BatchEncoding, but function expects a tensor.
+                **batch, # type: ignore
+                generation_config=generation_config,
+                tokenizer=self._tokenizer,
             )

             # For each batch, remove the prepended prompts from all model responses.
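The engine now validates up front that the built model carries a generation config, raising the error that the integration test below was updated to expect. A minimal sketch of the guard (the standalone function is illustrative, not part of the commit):

    from typing import Optional

    import transformers

    def require_generation_config(
        model: transformers.PreTrainedModel, model_name: str
    ) -> None:
        # Reject models whose generation_config is missing or None before any
        # generate() call is attempted.
        config: Optional[transformers.GenerationConfig] = getattr(
            model, "generation_config", None
        )
        if config is None:
            raise ValueError(f"Model {model_name} requires a generation config.")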

tests/integration/infer/test_native_text_inference_engine.py

Lines changed: 1 addition & 1 deletion
@@ -408,5 +408,5 @@ def test_unsupported_model_raises_error():
         tokenizer_pad_token="<|endoftext|>",
         load_pretrained_weights=False,
     )
-    with pytest.raises(ValueError, match="does not support generation"):
+    with pytest.raises(ValueError, match="requires a generation config"):
         NativeTextInferenceEngine(model_params)

tests/unit/builders/test_callbacks.py

Lines changed: 6 additions & 9 deletions
@@ -23,8 +23,7 @@ def test_build_training_callbacks_mfu_callback():
     config.training.include_performance_metrics = True
     config.data.train.pack = True
     config.model.model_max_length = 128
-    model = torch.nn.Module()
-    model.dtype = torch.bfloat16  # type: ignore
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         with patch("torch.cuda.get_device_name", return_value="NVIDIA A100-PCIE-40GB"):
             result = build_training_callbacks(config, model, None)
@@ -38,7 +37,7 @@
 def test_build_training_callbacks_no_cuda(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
-    model = torch.nn.Module()
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=False):
         result = build_training_callbacks(config, model, None)
     assert len(result) == 2
@@ -54,7 +53,7 @@ def test_build_training_callbacks_peft(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
     config.training.use_peft = True
-    model = torch.nn.Module()
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
    with patch("torch.cuda.is_available", return_value=True):
         result = build_training_callbacks(config, model, None)
     assert len(result) == 2
@@ -69,7 +68,7 @@ def test_build_training_callbacks_peft(mock_logger_warning):
 def test_build_training_callbacks_no_pack(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
-    model = torch.nn.Module()
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         result = build_training_callbacks(config, model, None)
     assert len(result) == 2
@@ -85,8 +84,7 @@ def test_build_training_callbacks_unknown_device_name(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
     config.data.train.pack = True
-    model = torch.nn.Module()
-    model.dtype = torch.bfloat16  # type: ignore
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         with patch("torch.cuda.get_device_name", return_value="Foo"):
             result = build_training_callbacks(config, model, None)
@@ -103,8 +101,7 @@ def test_build_training_callbacks_no_model_max_length(mock_logger_warning):
     config = TrainingConfig()
     config.training.include_performance_metrics = True
     config.data.train.pack = True
-    model = torch.nn.Module()
-    model.dtype = torch.bfloat16  # type: ignore
+    model = torch.nn.Sequential(torch.nn.Linear(10, 10))
     with patch("torch.cuda.is_available", return_value=True):
         with patch("torch.cuda.get_device_name", return_value="NVIDIA A100-PCIE-40GB"):
             result = build_training_callbacks(config, model, None)
