 from mlx.utils import tree_flatten, tree_map
 from tqdm import tqdm

-from .utils import grad_checkpoint, Colors
+from .utils import grad_checkpoint, Colors, get_learning_rate

 @dataclass
 class TrainingArgs:
@@ -52,20 +52,26 @@ class TrainingArgs:
         metadata={"help": "Learning rate."},
     )
     grad_clip: float = field(
-        default=None,
+        default=1.0,
         metadata={"help": "Gradient clipping value."},
     )
+    warmup_steps: int = field(
+        default=100,
+        metadata={"help": "Number of warmup steps for learning rate."},
+    )
+    min_learning_rate: float = field(
+        default=1e-6,
+        metadata={"help": "Minimum learning rate after decay."},
+    )


 def default_loss(model, inputs, targets, lengths, train_on_completions=False, assistant_id=77091):
     outputs = model(inputs)
     logits = outputs.logits.astype(mx.float32)

-    batch_size, seq_len = targets.shape
+    _, seq_len = targets.shape
     steps = mx.arange(seq_len)[None, :]
-
     base_mask = steps < lengths[:, None]
-
     if train_on_completions:
         eq = (inputs == assistant_id)
         idxs = mx.arange(seq_len)[None, :]
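
The `get_learning_rate` helper imported above is not shown in this diff. As a point of reference, here is a minimal sketch of what such a schedule could look like, assuming linear warmup over `warmup_steps` followed by cosine decay down to `min_learning_rate`; the signature and schedule shape are assumptions, not taken from the PR:

```python
import math

def get_learning_rate(step, base_lr, warmup_steps, min_lr, total_steps):
    # Hypothetical schedule: linear warmup to base_lr, then cosine decay to min_lr.
    # The real helper imported from .utils may differ.
    if step < warmup_steps:
        return base_lr * (step + 1) / warmup_steps
    progress = (step - warmup_steps) / max(1, total_steps - warmup_steps)
    return min_lr + 0.5 * (base_lr - min_lr) * (1.0 + math.cos(math.pi * progress))
```
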
@@ -82,7 +88,6 @@ def default_loss(model, inputs, targets, lengths, train_on_completions=False, as
     ce = ce.sum() / ntoks
     return ce, ntoks

-
 def iterate_batches(dataset, batch_size, max_seq_length, train=False):
     # Simple indices without sorting
     indices = list(range(len(dataset)))
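
Also not part of the diff: a rough sketch of how the new fields might be wired into a training step, assuming MLX's `nn.value_and_grad` and `mlx.optimizers.clip_grad_norm` are used for the loss/gradient computation and for clipping to `grad_clip` (now defaulting to 1.0); `total_steps`, the batch layout, and the `get_learning_rate` call signature are assumptions carried over from the sketch above:

```python
import mlx.core as mx
import mlx.nn as nn
from mlx.optimizers import clip_grad_norm

def train_step(model, optimizer, batch, args, step, total_steps):
    # Hypothetical wiring of default_loss, grad_clip, and the LR schedule.
    inputs, targets, lengths = batch
    loss_value_and_grad = nn.value_and_grad(model, default_loss)
    (loss, ntoks), grads = loss_value_and_grad(model, inputs, targets, lengths)

    # Clip the global gradient norm to args.grad_clip (default 1.0 after this change).
    grads, _ = clip_grad_norm(grads, args.grad_clip)

    # Apply the scheduled learning rate before the optimizer update.
    optimizer.learning_rate = get_learning_rate(
        step, args.learning_rate, args.warmup_steps, args.min_learning_rate, total_steps
    )
    optimizer.update(model, grads)
    mx.eval(model.parameters(), optimizer.state)
    return loss, ntoks
```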