
Commit 4e5f1ae

enable batch flattening
1 parent 88d45aa commit 4e5f1ae

File tree

2 files changed: +61 -75 lines changed


tests/utils/test_packing.py

Lines changed: 55 additions & 28 deletions
@@ -2,6 +2,7 @@
 from unsloth.utils import attention_dispatch as attention_dispatch_utils
 from unsloth.utils.packing import configure_sample_packing, enable_sample_packing

+from collections.abc import Iterable
 from contextlib import ExitStack
 from types import SimpleNamespace
 from unittest.mock import patch
@@ -10,6 +11,7 @@
 import torch
 from datasets import Dataset
 from trl import SFTConfig, SFTTrainer
+from trl.trainer.sft_trainer import DataCollatorForLanguageModeling


 def _build_packed_training_setup(tmp_path, device):
@@ -120,25 +122,16 @@ def __init__(self):
         self.generation_config = SimpleNamespace(attn_implementation="sdpa")


-class _DummyCollator:
-    def __init__(self):
-        self.padding_free = False
-        self.return_position_ids = False
-
-    def torch_call(self, examples):
-        batch_size = len(examples)
-        max_tokens = 4
-        return {
-            "input_ids": torch.zeros(batch_size, max_tokens, dtype=torch.long),
-            "attention_mask": torch.ones(batch_size, max_tokens, dtype=torch.long),
-            "batch": examples,
-        }
-
-
 class _DummyTrainer:
     def __init__(self):
         self.args = SimpleNamespace(remove_unused_columns=True)
-        self.data_collator = _DummyCollator()
+        self.data_collator = DataCollatorForLanguageModeling(
+            pad_token_id=0,
+            completion_only_loss=False,
+            padding_free=True,
+            return_position_ids=False,
+            return_tensors="pt",
+        )


 def test_enable_sample_packing():
@@ -151,17 +144,21 @@ def test_enable_sample_packing():
     assert getattr(model, "_unsloth_allow_packed_overlength") is True
     assert getattr(model.child, "_unsloth_allow_packed_overlength") is True

-    # trainer args are updated to keep the packed metadata
-    assert trainer.args.remove_unused_columns is False
-
     collator = trainer.data_collator
-    assert collator.padding_free is True
     assert collator.return_position_ids is True
     assert getattr(collator, "_unsloth_packing_wrapped") is True

     examples = [
-        {"seq_lengths": [2, 1]},
-        {"seq_lengths": [3]},
+        {
+            "input_ids": [0, 1, 2],
+            "labels": [0, 1, 2],
+            "seq_lengths": [2, 1],
+        },
+        {
+            "input_ids": [3, 4, 5],
+            "labels": [3, 4, 5],
+            "seq_lengths": [3],
+        },
     ]
     batch = collator.torch_call(examples)

@@ -172,13 +169,43 @@ def test_enable_sample_packing():
         torch.tensor([2, 1, 3], dtype=torch.int32),
     )

-    assert "position_ids" in batch
-    assert torch.equal(batch["position_ids"][0, :3], torch.tensor([0, 1, 0], dtype=torch.long))
-    assert torch.equal(batch["position_ids"][1, :3], torch.tensor([0, 1, 2], dtype=torch.long))
+    assert batch["input_ids"].shape == (1, 6)
+    expected_positions = torch.tensor([0, 1, 0, 0, 1, 2], dtype=torch.long)
+    assert torch.equal(batch["position_ids"].view(-1)[:6], expected_positions)

-    # attention_mask is dropped when return_position_ids is set
-    assert "attention_mask" not in batch
-    assert batch["batch"] == examples
+
+def test_enable_sample_packing_trl_collator(tmp_path):
+    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+    model, _, trainer, _ = _build_packed_training_setup(tmp_path, device)
+
+    enable_sample_packing(model, trainer)
+
+    examples = [
+        {
+            "input_ids": [0, 1, 2],
+            "labels": [0, 1, 2],
+            "seq_lengths": [2, 1],
+        },
+        {
+            "input_ids": [3, 4, 5],
+            "labels": [3, 4, 5],
+            "seq_lengths": [3],
+        },
+    ]
+
+    batch = trainer.data_collator.torch_call(examples)
+
+    assert batch["input_ids"].shape == (1, 6)
+    assert torch.equal(
+        batch["packed_seq_lengths"],
+        torch.tensor([2, 1, 3], dtype=torch.int32),
+    )
+
+    expected_positions = torch.tensor([0, 1, 0, 0, 1, 2], dtype=torch.long)
+    assert torch.equal(batch["position_ids"].view(-1)[:6], expected_positions)
+
+    if hasattr(trainer, "accelerator"):
+        trainer.accelerator.free_memory()


 def test_packing_sdpa(tmp_path):
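For reference, a minimal hand-built sketch of the flattened ("batch flattening") layout these assertions check. The tensors below are constructed directly rather than by TRL's DataCollatorForLanguageModeling, so treat it purely as an illustration of the expected values: with padding-free collation the two examples collapse into a single row of six tokens, position ids restart for every packed sequence, and the wrapper contributes packed_seq_lengths.

import torch

# Two packed examples, mirroring the test above.
examples = [
    {"input_ids": [0, 1, 2], "seq_lengths": [2, 1]},
    {"input_ids": [3, 4, 5], "seq_lengths": [3]},
]

# Batch flattening: concatenate every token into one row of shape (1, 6).
flat_tokens = [tok for ex in examples for tok in ex["input_ids"]]
input_ids = torch.tensor([flat_tokens], dtype=torch.long)

# Per-sequence lengths across the whole flattened batch: [2, 1, 3].
lengths = [length for ex in examples for length in ex["seq_lengths"]]
packed_seq_lengths = torch.tensor(lengths, dtype=torch.int32)

# Position ids restart at 0 for each packed sequence: [0, 1, 0, 0, 1, 2].
position_ids = torch.cat([torch.arange(length) for length in lengths]).unsqueeze(0)

assert input_ids.shape == (1, 6)
assert packed_seq_lengths.tolist() == [2, 1, 3]
assert position_ids.tolist() == [[0, 1, 0, 0, 1, 2]]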

unsloth/utils/packing.py

Lines changed: 6 additions & 47 deletions
@@ -55,15 +55,6 @@ def enable_sample_packing(
     sequence_lengths_key: str = "seq_lengths",
 ) -> None:
     """Enable runtime support for packed batches on an existing trainer."""
-    train_bs = getattr(trainer.args, "per_device_train_batch_size", 1)
-    eval_bs = getattr(trainer.args, "per_device_eval_batch_size", 1)
-
-    if train_bs != 1 or eval_bs != 1:
-        raise ValueError(
-            "Sample packing requires per_device_train_batch_size=1 and "
-            f"per_device_eval_batch_size=1; received {train_bs}, {eval_bs}."
-        )
-
     def _mark_allow_overlength(module):
         if hasattr(module, "max_seq_length"):
             setattr(module, "_unsloth_allow_packed_overlength", True)
@@ -72,17 +63,14 @@ def _mark_allow_overlength(module):

     _mark_allow_overlength(model)

-    if hasattr(trainer.args, "remove_unused_columns"):
-        trainer.args.remove_unused_columns = False
-
     collator = getattr(trainer, "data_collator", None)
-    if collator is None or not hasattr(collator, "torch_call"):
-        return
-    if getattr(collator, "_unsloth_packing_wrapped", False):
+    if (
+        collator is None
+        or not hasattr(collator, "torch_call")
+        or getattr(collator, "_unsloth_packing_wrapped", False)
+    ):
         return

-    if hasattr(collator, "padding_free"):
-        collator.padding_free = True
     if hasattr(collator, "return_position_ids"):
         collator.return_position_ids = True

@@ -92,41 +80,12 @@ def torch_call_with_lengths(examples: Sequence[dict]):
         batch = original_torch_call(examples)
         if examples and isinstance(examples[0], dict):
             seq_lengths: list[int] = []
-            per_example_counts: list[int] = []
             for example in examples:
                 lengths = example.get(sequence_lengths_key)
                 if isinstance(lengths, Iterable):
-                    numeric_lengths = [int(length) for length in lengths]
-                    seq_lengths.extend(numeric_lengths)
-                    per_example_counts.append(len(numeric_lengths))
-                else:
-                    per_example_counts.append(0)
+                    seq_lengths.extend(int(length) for length in lengths)
             if seq_lengths:
                 batch["packed_seq_lengths"] = torch.tensor(seq_lengths, dtype=torch.int32)
-
-            position_ids = batch.get("position_ids")
-            input_ids = batch.get("input_ids")
-            if position_ids is None and input_ids is not None:
-                position_ids = torch.zeros_like(
-                    input_ids, dtype=torch.long, device=input_ids.device
-                )
-
-            if position_ids is not None and input_ids is not None:
-                seq_index = 0
-                for row_idx, count in enumerate(per_example_counts):
-                    cursor = 0
-                    for _ in range(count):
-                        length = seq_lengths[seq_index]
-                        if length > 0:
-                            position_ids[row_idx, cursor : cursor + length] = torch.arange(
-                                length, dtype=torch.long, device=position_ids.device
-                            )
-                        cursor += length
-                        seq_index += 1
-                batch["position_ids"] = position_ids
-
-        if "attention_mask" in batch and getattr(collator, "return_position_ids", False):
-            batch.pop("attention_mask")
         return batch

     collator.torch_call = torch_call_with_lengths
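After this change the deleted position-id bookkeeping is delegated to the padding-free collator; all the wrapper still does is collect seq_lengths and attach them as packed_seq_lengths. A rough standalone sketch of that wrapping pattern, using a hypothetical _FlatteningCollator as a stand-in for the real TRL collator:

from collections.abc import Iterable, Sequence

import torch


class _FlatteningCollator:
    """Hypothetical stand-in for a padding-free collator (illustration only)."""

    def torch_call(self, examples: Sequence[dict]) -> dict:
        # Flatten all examples into a single row, as a padding-free collator would.
        flat = [tok for ex in examples for tok in ex["input_ids"]]
        return {"input_ids": torch.tensor([flat], dtype=torch.long)}


def wrap_with_packed_lengths(collator, sequence_lengths_key: str = "seq_lengths"):
    """Attach packed_seq_lengths to each batch, mirroring the simplified wrapper above."""
    original_torch_call = collator.torch_call

    def torch_call_with_lengths(examples: Sequence[dict]) -> dict:
        batch = original_torch_call(examples)
        seq_lengths: list[int] = []
        for example in examples:
            lengths = example.get(sequence_lengths_key)
            if isinstance(lengths, Iterable):
                seq_lengths.extend(int(length) for length in lengths)
        if seq_lengths:
            batch["packed_seq_lengths"] = torch.tensor(seq_lengths, dtype=torch.int32)
        return batch

    collator.torch_call = torch_call_with_lengths
    collator._unsloth_packing_wrapped = True
    return collator


collator = wrap_with_packed_lengths(_FlatteningCollator())
batch = collator.torch_call(
    [
        {"input_ids": [0, 1, 2], "seq_lengths": [2, 1]},
        {"input_ids": [3, 4, 5], "seq_lengths": [3]},
    ]
)
assert batch["packed_seq_lengths"].tolist() == [2, 1, 3]

In the real code path the stand-in's role is played by DataCollatorForLanguageModeling(padding_free=True, ...), which also emits the restarted position ids asserted in the tests.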
