Status: Closed

Commits (31 total; changes shown from 26 commits):
f911c32 - Kept, padding logic (pluesclues, Jun 23, 2025)
2ba7f50 - Made sure prediction step in rl.py allows logging for callbacks in RL… (pluesclues, Jun 23, 2025)
0c1bc4d - Merge branch 'unslothai:main' into main (pluesclues, Jun 23, 2025)
78336ce - Updated llama.py to new online_dpo changes (pluesclues, Jun 23, 2025)
383aa9c - Update rl.py to make logic simpler (pluesclues, Jun 23, 2025)
532af4f - Update rl.py, made sure tokenized_output on eval step was on same device (pluesclues, Jun 24, 2025)
49f77c1 - Update rl.py, corrected tokenized_outputs to inputs (pluesclues, Jun 24, 2025)
7921aa7 - Update rl.py, removed sagemaker stuff (pluesclues, Jun 25, 2025)
54f03ee - Update llama.py, figures out if there is right padding automatically (pluesclues, Jul 2, 2025)
a8d4168 - Update llama.py, changed conditional statement for right padding slig… (pluesclues, Jul 2, 2025)
236b924 - Update llama.py, updated os.environ variable to temp variable (pluesclues, Jul 8, 2025)
76d73c6 - Merge branch 'main' into main (pluesclues, Jul 8, 2025)
fa2e18e - Update rl.py, made it account for right padding in online dpo and rew… (pluesclues, Jul 8, 2025)
80f9cd2 - Update llama.py, automatically figures out if right padding is needed (pluesclues, Jul 8, 2025)
ed1771a - Merge branch 'main' into main (pluesclues, Jul 12, 2025)
49d3844 - Merge branch 'main' into main (pluesclues, Aug 3, 2025)
b0a9c65 - Merge branch 'unslothai:main' into main (pluesclues, Aug 8, 2025)
6edcb0d - Merge branch 'unslothai:main' into main (pluesclues, Aug 11, 2025)
90c581b - Merge branch 'unslothai:main' into main (pluesclues, Aug 22, 2025)
0d2b9dc - Merge branch 'unslothai:main' into fix_grpo_nan (pluesclues, Sep 5, 2025)
5df4532 - Update rl_replacements.py (pluesclues, Sep 5, 2025)
eb65ecf - Update rl.py (pluesclues, Sep 5, 2025)
4751abf - Update rl.py, changed order of util functions for padding (pluesclues, Sep 5, 2025)
d86953b - Update rl_replacements.py, disabled commenting out logits_to_keep (pluesclues, Sep 5, 2025)
190c2c0 - Update llama.py (pluesclues, Sep 5, 2025)
0b9068c - Merge branch 'unslothai:main' into fix_grpo_nan (pluesclues, Sep 8, 2025)
1fba36c - Update unsloth/models/rl.py (pluesclues, Sep 9, 2025)
fa48726 - Merge branch 'unslothai:main' into fix_grpo_nan (pluesclues, Sep 9, 2025)
fad14ca - Update rl_replacements.py (pluesclues, Sep 9, 2025)
6aedc2f - Update rl_replacements.py, added new line (pluesclues, Sep 9, 2025)
4bcd41e - Update rl_replacements.py, updated version (pluesclues, Sep 10, 2025)
7 changes: 1 addition & 6 deletions unsloth/models/llama.py
@@ -840,19 +840,14 @@ def LlamaModel_fast_forward(
inputs_embeds *= attention_mask.unsqueeze(0).transpose(0, 1).transpose(1, 2)
if inputs_requires_grad: inputs_embeds.requires_grad_(True)
pass

# Ignore attention_mask
if attention_mask is None:
padding_mask = None
elif self.training:
attention_mask = None
padding_mask = None
else:
# if 0 in attention_mask:
# padding_mask = attention_mask
# else:
padding_mask = None

attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
attention_mask,
(batch_size, seq_length),
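A note on the change above: per the commit log, llama.py now detects right padding automatically. As a minimal, hypothetical sketch of how such a check can work (the helper name and body below are illustrative assumptions, not this PR's code):

import torch

def has_right_padding(attention_mask: torch.Tensor) -> bool:
    # A batch is right padded when at least one row of the attention mask
    # ends in 0, i.e. real tokens are followed by trailing pad positions.
    return bool((attention_mask[:, -1] == 0).any())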
20 changes: 20 additions & 0 deletions unsloth/models/rl.py
@@ -33,7 +33,11 @@
RL_CONFIG_CHANGES,
RL_METRICS_CHANGES,
)

selective_log_softmax = RL_REPLACEMENTS["selective_log_softmax"]
create_completion_attention_mask = RL_REPLACEMENTS["create_completion_attention_mask"]
calculate_pad_tokens_in_prompt = RL_REPLACEMENTS["calculate_pad_tokens_in_prompt"]
left_pack_padding = RL_REPLACEMENTS["left_pack_padding"]

torch_compile_options = {
"epilogue_fusion" : True,
@@ -109,6 +113,12 @@ def generate_with_clone(*args, **kwargs):
from torch.nn import functional as F
from transformers import DataCollatorForSeq2Seq, DataCollatorForLanguageModeling as TransformersDataCollatorForLanguageModeling

{create_completion_attention_mask_code}

{calculate_pad_tokens_in_prompt_code}

{left_pack_padding_code}

torch_compile_options = {{
"epilogue_fusion" : True,
"max_autotune" : False,
@@ -118,6 +128,7 @@ def generate_with_clone(*args, **kwargs):
}}

{selective_log_softmax_code}

{RL_pre}

@dataclass
@@ -695,6 +706,11 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"):
# Selective log softmax
selective_log_softmax_code = inspect.getsource(selective_log_softmax)

#GRPO masking code
Contributor review comment (suggested change): "#GRPO masking code" -> "# GRPO masking code", adding a space after the "#".

create_completion_attention_mask_code = inspect.getsource(create_completion_attention_mask)
calculate_pad_tokens_in_prompt_code = inspect.getsource(calculate_pad_tokens_in_prompt)
left_pack_padding_code = inspect.getsource(left_pack_padding)

# Get final source code
RLTrainer_source = RLTrainer_replacement.format(
RLTrainer_name = RLTrainer_name,
@@ -720,6 +736,10 @@ def _patch_trl_rl_trainers(trainer_file = "grpo_trainer"):
max_seq_length_post = max_seq_length_post,

selective_log_softmax_code = selective_log_softmax_code,
create_completion_attention_mask_code = create_completion_attention_mask_code,
calculate_pad_tokens_in_prompt_code = calculate_pad_tokens_in_prompt_code,
left_pack_padding_code = left_pack_padding_code,

)

if RLTrainer_name == "SFTTrainer":
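For context, a minimal sketch of what two of the padding helpers pulled from RL_REPLACEMENTS could look like; the real implementations live in rl_replacements.py and are not part of this diff, so treat these bodies as assumptions:

import torch

def calculate_pad_tokens_in_prompt(prompt_ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    # Count the pad tokens in each row of the prompt batch.
    return (prompt_ids == pad_token_id).sum(dim = 1)

def left_pack_padding(ids: torch.Tensor, pad_token_id: int) -> torch.Tensor:
    # Move every non-pad token to the right edge of its row so the batch
    # becomes left padded, preserving the relative order of real tokens.
    is_token = (ids != pad_token_id).int()
    # A stable ascending sort puts pads (0) before tokens (1) without reordering tokens.
    order = torch.sort(is_token, dim = 1, stable = True).indices
    return ids.gather(1, order)

For example, left_pack_padding(torch.tensor([[5, 6, 0, 0]]), 0) returns tensor([[0, 0, 5, 6]]).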
17 changes: 16 additions & 1 deletion unsloth/models/rl_replacements.py
@@ -245,6 +245,20 @@ def grpo_trainer__generate_and_score_completions(function_name, function):
"prompt_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False",
)

# Left pad prompt before calculating old and ref hidden states
line_to_replace = "batch_size = self.args.per_device_train_batch_size if mode == \"train\" else self.args.per_device_eval_batch_size"

# The new lines you want to insert
replacement_lines = """batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size
prompt_completion_ids = left_pack_padding(prompt_completion_ids, self.processing_class.pad_token_id)"""
Contributor review comment: Maybe newline?

Collaborator (author) reply: To more easily resolve merge conflicts when testing with the Fast VLM inference branch, I moved everything in this PR to #3132:

https://github.com/pluesclues/unsloth/blob/fb115fb16cb2592caf99a9414b7d1f95f1f819ca/unsloth/models/rl_replacements.py#L252-L256

function = function.replace(line_to_replace, replacement_lines)

# function = function.replace(
# "logits_to_keep,",
# "#logits_to_keep,",
# )

# Always between max_prompt_length and use_vllm
found = re.findall(
r"\n(([ ]{8,})if self\.max_prompt_length is not None:.*?"\
@@ -282,7 +296,8 @@ def strip_leading_tokens(text):
# Generate completions using either vLLM or regular generation
if self.use_vllm:"""
function = function.replace(replace_part, new_replacement)
pass


return function
pass
RL_FUNCTIONS["grpo_trainer"].append(grpo_trainer__generate_and_score_completions)
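The function.replace calls above edit the trainer's source as plain text; the patched source is then materialized (via the RLTrainer_replacement.format call in rl.py) and executed in place of the stock TRL trainer. A self-contained toy version of that string-patching technique, with all names invented for illustration:

import inspect
import textwrap

def score(x):
    y = x + 1
    return y

# Fetch the source, splice a new line in after an anchor line, then re-exec it.
source = textwrap.dedent(inspect.getsource(score))
patched = source.replace(
    "y = x + 1",
    "y = x + 1\n    y = y * 2",
)
namespace = {}
exec(patched, namespace)
score = namespace["score"]
assert score(3) == 8  # (3 + 1) * 2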