diff --git a/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml b/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml
index 574db88263..258f82c4bc 100644
--- a/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml
+++ b/examples/configs/recipes/llm/grpo-math-llama-nemotron-super-49b-v.5-4n8g-fsdp2tp8.yaml
@@ -1,15 +1,20 @@
 defaults: ../../grpo_math_1B.yaml
 grpo:
-  num_prompts_per_step: 128
+  num_prompts_per_step: 64
+  num_generations_per_prompt: 16
 policy:
-  model_name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
+  model_name: /lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf
   tokenizer:
     name: nvidia/Llama-3_3-Nemotron-Super-49B-v1_5
-  max_total_sequence_length: 1024
-  train_global_batch_size: 128
+  max_total_sequence_length: 24576
+  #max_total_sequence_length: 1024
+  train_global_batch_size: 64
+  train_micro_batch_size: 1
+  logprob_batch_size: 2
   dtensor_cfg:
     activation_checkpointing: true
-    tensor_parallel_size: 8
+    context_parallel_size: 4
+    tensor_parallel_size: 2
     custom_parallel_plan: examples.configs.recipes.llm.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
   dynamic_batching:
     enabled: true
@@ -32,16 +37,19 @@ policy:
       - 13
   generation:
     vllm_cfg:
+      async_engine: false
       tensor_parallel_size: 4
+      #pipeline_parallel_size: 2
+  make_sequence_length_divisible_by: ${max:${mul:${policy.dtensor_cfg.context_parallel_size}, 2}, ${policy.max_total_sequence_length}}
 logger:
   wandb_enabled: true
   monitor_gpus: false
   wandb:
     project: grpo-nemotron-super-49b
-    name: grpo-${data.dataset_name}-nemotron-super-49b-tp${policy.dtensor_cfg.tensor_parallel_size}
+    name: grpo-${data.dataset_name}-nemotron-super-49b-tp${policy.dtensor_cfg.tensor_parallel_size}-cp${policy.dtensor_cfg.context_parallel_size}
   mlflow:
     experiment_name: sft-dev
     run_name: grpo-nemotron-super-49b
 cluster:
   gpus_per_node: 8
-  num_nodes: 4
+  num_nodes: 8
diff --git a/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py b/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py
index a0381adf9c..2922c69f9e 100644
--- a/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py
+++ b/examples/configs/recipes/llm/llama_nemotron_super_49b_custom_plan.py
@@ -12,38 +12,73 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from typing import cast
+
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
     ParallelStyle,
-    PrepareModuleInput,
-    PrepareModuleOutput,
     RowwiseParallel,
+    SequenceParallel,
 )
 from torch.distributed.tensor.placement_types import Replicate, Shard
 
-custom_parallel_plan: dict[str, ParallelStyle] = {
-    "model.layers.*.self_attn": PrepareModuleInput(
-        input_kwarg_layouts={"attention_mask": Replicate()},
-        desired_input_kwarg_layouts={"attention_mask": Replicate()},
-    ),
-    "model.embed_tokens": RowwiseParallel(
-        input_layouts=Replicate(), output_layouts=Replicate(), use_local_output=True
-    ),
-    "model.layers.*.self_attn.q_proj": ColwiseParallel(use_local_output=False),
-    "model.layers.*.self_attn.k_proj": ColwiseParallel(use_local_output=False),
-    "model.layers.*.self_attn.v_proj": ColwiseParallel(use_local_output=False),
-    "model.layers.*.self_attn.o_proj": RowwiseParallel(
-        output_layouts=Replicate(), use_local_output=True
-    ),
-    "model.layers.*.self_attn.rotary_emb": PrepareModuleOutput(
-        output_layouts=(Replicate(), Replicate()),
-        desired_output_layouts=(Replicate(), Replicate()),
-        use_local_output=False,
-    ),
-    "model.layers.*.mlp.up_proj": ColwiseParallel(),
-    "model.layers.*.mlp.gate_proj": ColwiseParallel(),
-    "model.layers.*.mlp.down_proj": RowwiseParallel(
-        output_layouts=Replicate(), use_local_output=True
-    ),
-    "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
-}
+
+def get_custom_parallel_plan():
+    # Reuse llama default parallel plan
+    base_model_tp_plan: dict[str, ParallelStyle] = {
+        "model.embed_tokens": RowwiseParallel(input_layouts=Replicate()),
+        "model.layers.*.self_attn.q_proj": ColwiseParallel(),
+        "model.layers.*.self_attn.k_proj": ColwiseParallel(),
+        "model.layers.*.self_attn.v_proj": ColwiseParallel(),
+        "model.layers.*.self_attn.o_proj": RowwiseParallel(),
+        "model.layers.*.mlp.up_proj": ColwiseParallel(),
+        "model.layers.*.mlp.gate_proj": ColwiseParallel(),
+        "model.layers.*.mlp.down_proj": RowwiseParallel(),
+        "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
+    }
+
+    base_model_sp_plan = {
+        "model.embed_tokens": RowwiseParallel(
+            input_layouts=Replicate(), output_layouts=Shard(1)
+        ),
+        "model.norm": SequenceParallel(),
+        "model.layers.*.input_layernorm": SequenceParallel(),
+        "model.layers.*.self_attn.o_proj": RowwiseParallel(output_layouts=Shard(1)),
+        "model.layers.*.post_attention_layernorm": SequenceParallel(),
+        "model.layers.*.mlp.down_proj": RowwiseParallel(output_layouts=Shard(1)),
+        "lm_head": ColwiseParallel(
+            input_layouts=Shard(1), output_layouts=Shard(-1), use_local_output=False
+        ),
+    }
+
+    if False:
+        # Enable sequence parallelism only if TP size > 1
+        base_model_tp_plan.update(cast(dict[str, ParallelStyle], base_model_sp_plan))
+
+    return base_model_tp_plan
+
+
+custom_parallel_plan: dict[str, ParallelStyle] = get_custom_parallel_plan()
+# {
+
+#     "model.embed_tokens": RowwiseParallel(
+#         input_layouts=Replicate(), output_layouts=Replicate(), use_local_output=True
+#     ),
+#     "model.layers.*.self_attn.q_proj": ColwiseParallel(use_local_output=False),
+#     "model.layers.*.self_attn.k_proj": ColwiseParallel(use_local_output=False),
+#     "model.layers.*.self_attn.v_proj": ColwiseParallel(use_local_output=False),
+#     "model.layers.*.self_attn.o_proj": RowwiseParallel(
+#         output_layouts=Replicate(), use_local_output=True
+#     ),
+#     "model.layers.*.self_attn.rotary_emb": PrepareModuleOutput(
+#         output_layouts=(Replicate(), Replicate()),
+#         desired_output_layouts=(Replicate(), Replicate()),
+#         use_local_output=False,
+#     ),
+#     "model.layers.*.mlp.up_proj": ColwiseParallel(),
+#     "model.layers.*.mlp.gate_proj": ColwiseParallel(),
+#     "model.layers.*.mlp.down_proj": RowwiseParallel(
+#         output_layouts=Replicate(), use_local_output=True
+#     ),
+#     "lm_head": ColwiseParallel(output_layouts=Shard(-1), use_local_output=False),
+# }
diff --git a/examples/configs/sft_nemotron_super_49b.yaml b/examples/configs/sft_nemotron_super_49b.yaml
new file mode 100644
index 0000000000..d79837dbb8
--- /dev/null
+++ b/examples/configs/sft_nemotron_super_49b.yaml
@@ -0,0 +1,134 @@
+# SFT Algorithm Configuration
+sft:
+  max_num_epochs: 3
+  max_num_steps: 100
+  val_period: 10
+  val_batches: 8
+  val_global_batch_size: 128
+  val_micro_batch_size: 1
+  val_at_start: true
+  seed: 42
+
+checkpointing:
+  enabled: true
+  checkpoint_dir: "results/sft_nemotron_super_49b"
+  metric_name: "val_loss"
+  higher_is_better: false
+  keep_top_k: 100
+  save_period: 500
+  checkpoint_must_save_by: null
+
+policy:
+  # model_name: Qwen/Qwen2.5-7B-Instruct
+  # tokenizer:
+  #   name: Qwen/Qwen2.5-7B-Instruct
+  model_name: "/lustre/fsw/portfolios/coreai/users/joyang/models/llama-3_3-nemotron-49b-instruct-128k-v1_2-hf"
+  tokenizer:
+    name: ${policy.model_name}
+  max_total_sequence_length: 4096
+  precision: "bfloat16"
+  train_global_batch_size: 128
+  train_micro_batch_size: 8
+
+  dtensor_cfg:
+    _v2: true
+    activation_checkpointing: true
+    context_parallel_size: 2
+    cpu_offload: false
+    enabled: true
+    sequence_parallel: false
+    tensor_parallel_size: 4
+    custom_parallel_plan: examples.configs.recipes.llm.llama_nemotron_super_49b_custom_plan.custom_parallel_plan
+
+  megatron_cfg:
+    enabled: false
+
+  dynamic_batching:
+    enabled: false
+    train_mb_tokens: 4096
+    logprob_mb_tokens: 8192
+    sequence_length_round: 64
+
+  sequence_packing:
+    enabled: false
+    train_mb_tokens: ${mul:${policy.max_total_sequence_length}, ${policy.train_micro_batch_size}}
+    algorithm: "modified_first_fit_decreasing"
+    sequence_length_round: 64
+
+
+  # makes the training sequence length divisible by the tensor parallel size
+  # this is useful for sequence parallel training
+  make_sequence_length_divisible_by: ${max:${mul:${policy.dtensor_cfg.context_parallel_size}, 2}, ${policy.max_total_sequence_length}}
+  max_grad_norm: null
+
+  optimizer:
+    name: "torch.optim.AdamW"
+    kwargs:
+      lr: 2e-5
+      weight_decay: 0.01
+      betas: [0.9, 0.98]
+      eps: 1e-8
+      # when using Dtensor, we need to set foreach
+      # and fused to False
+      foreach: False
+      fused: False
+
+# data:
+#   add_bos: true
+#   add_eos: true
+#   add_generation_prompt: false
+#   dataset_name: "tulu3_sft_mixture"
+#   cache_dir: "/lustre/fsw/portfolios/coreai/users/gvenkatakris/data-cache"
+#   max_input_seq_length: 1024
+#   max_samples: 10000
+#   shuffle: true
+#   test_size: 0.05
+
+data:
+  max_input_seq_length: ${policy.max_total_sequence_length}
+  add_bos: true
+  add_eos: true
+  add_generation_prompt: false
+  shuffle: true
+  num_workers: 20
+
+  dataset_name: "squad"
+  # You can use custom response datasets for training and validation. For example:
+  # data:
+  #   dataset_name: ResponseDataset
+  #   train_data_path: <train_data_path>  # e.g., /path/to/local/dataset.jsonl or hf_org/hf_dataset_name (HuggingFace)
+  #   val_data_path: <val_data_path>
+  #   input_key: <input_key>, default is "input"
+  #   output_key: <output_key>, default is "output"
+  #   train_split: <train_split>, default is None  # used for HuggingFace datasets
+  #   val_split: <val_split>, default is None  # used for HuggingFace datasets
+  # See https://github.com/NVIDIA-NeMo/RL/blob/main/docs/guides/sft.md#datasets for more details.
+
+  ## unused with squad dataset
+  prompt_file: null
+  split: null
+  output_key: null
+  seed: null
+
+logger:
+  log_dir: "logs"  # Base directory for all logs
+  wandb_enabled: true  # Make sure you do a ``wandb login [Your API key]'' before running
+  tensorboard_enabled: false
+  mlflow_enabled: false
+  monitor_gpus: false  # If true, will monitor GPU usage and log to wandb and/or tensorboard
+  swanlab_enabled: false
+  wandb:
+    project: "sft-nemotron-joyang"
+    name: "sft-${data.dataset_name}-nemotron-super-49b-joyang"
+  tensorboard:
+    log_dir: "tb_logs-openmathinstruct-nemorl-1M_train"
+  mlflow:
+    experiment_name: "sft-dev"
+    run_name: "openmathinstruct-nemorl-1M_train"
+  gpu_monitoring:
+    collection_interval: 10  # How often to collect GPU usage metrics (in seconds)
+    flush_interval: 10  # How often to flush GPU usage metrics to the loggers (in seconds)
+
+cluster:
+  gpus_per_node: 8
+  num_nodes: 1
diff --git a/examples/run_grpo_math.py b/examples/run_grpo_math.py
index 51adfddced..eb73bc5a30 100644
--- a/examples/run_grpo_math.py
+++ b/examples/run_grpo_math.py
@@ -19,7 +19,7 @@
 from typing import Any, Optional
 
 from omegaconf import OmegaConf
-from transformers import PreTrainedTokenizerBase
+from transformers import AutoConfig, PreTrainedTokenizerBase
 
 from nemo_rl.algorithms.grpo import MasterConfig, grpo_train, setup
 from nemo_rl.algorithms.utils import get_tokenizer
@@ -41,6 +41,7 @@
 from nemo_rl.utils.logger import get_next_experiment_dir
 
 OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
+OmegaConf.register_new_resolver("max", lambda a, b: max(a, b))
 
 
 def parse_args() -> tuple[argparse.Namespace, list[str]]:
@@ -158,7 +159,17 @@ def main() -> None:
     init_ray()
 
-    # setup tokenizer
+    # setup tokenizer and preload the model config to force HF to download the model and modules,
+    # avoiding a race condition inside the generation/policy workers.
+    try:
+        _ = AutoConfig.from_pretrained(
+            config["policy"]["model_name"], trust_remote_code=True
+        )
+        print(f"Config preloaded successfully: {config['policy']['model_name']}")
+    except Exception as e:
+        print("WARNING: error while preloading the model config; in general this is not a problem:")
+        print(e)
+
     tokenizer = get_tokenizer(config["policy"]["tokenizer"])
     assert config["policy"]["generation"] is not None, (
         "A generation config is required for GRPO"
diff --git a/examples/run_sft.py b/examples/run_sft.py
index b804b4e19f..bcda89e09c 100644
--- a/examples/run_sft.py
+++ b/examples/run_sft.py
@@ -32,6 +32,7 @@
 from nemo_rl.utils.logger import get_next_experiment_dir
 
 OmegaConf.register_new_resolver("mul", lambda a, b: a * b)
+OmegaConf.register_new_resolver("max", lambda a, b: max(a, b))
 
 
 def parse_args():
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
index 190f3c2921..fecbb884ce 100644
--- a/nemo_rl/algorithms/grpo.py
+++ b/nemo_rl/algorithms/grpo.py
@@ -793,7 +793,8 @@ def grpo_train(
 
             print("▶ Computing logprobs...", flush=True)
             with timer.time("policy_and_reference_logprobs"):
-                fprop_logprobs = policy.get_logprobs(train_data)["logprobs"]
+                logprobs_results = policy.get_logprobs(train_data)
+                fprop_logprobs = logprobs_results["logprobs"]
                 reference_logprobs = policy.get_reference_policy_logprobs(
                     train_data
                 )["reference_logprobs"]
@@ -915,12 +916,14 @@
                     log_data, f"train_data_step{total_steps}.jsonl"
                 )
 
+            print(f"train_results: {train_results['train_max_seq_len']}")
             metrics = {
                 "loss": train_results["loss"].numpy(),
+                "train_max_seq_len": train_results["train_max_seq_len"],
                 "reward": rewards.numpy(),
                 "grad_norm": train_results["grad_norm"].numpy(),
                 "mean_prompt_length": repeated_batch["length"].numpy(),
                 "total_num_tokens": input_lengths.numpy(),
             }
             metrics.update(train_results["all_mb_metrics"])
             for k, v in metrics.items():
diff --git a/nemo_rl/data/llm_message_utils.py b/nemo_rl/data/llm_message_utils.py
index c0572ce3a1..6591465120 100644
--- a/nemo_rl/data/llm_message_utils.py
+++ b/nemo_rl/data/llm_message_utils.py
@@ -552,29 +552,29 @@ def _format_content_helper(
         message_chunk = formatted_message[prev_message_len_no_eos:]
 
         # Debug: Print each message turn separately (only once for the first sample)
-        if not hasattr(get_formatted_message_log, "_debug_printed"):
-            if i == 0:
-                # Print header only at the start of first message
-                print("\n" + "=" * 80)
-                print("DEBUG: Individual message turns from apply_chat_template")
-                print("=" * 80)
-
-            print(f"\n[Turn {i + 1}/{len(message_log_strs)}] Role: {message['role']}")
-            print("-" * 40)
-            print("Extracted message chunk:")
-            print(repr(message_chunk))  # Using repr to show special characters
-            print(f"Raw text (len={len(message_chunk)}):")
-            print(message_chunk)
-            print("-" * 40)
-
-            if i == len(message_log_strs) - 1:
-                # Mark as printed after processing all turns of the first sample
-                get_formatted_message_log._debug_printed = True
-                print("\n" + "=" * 80)
-                print("DEBUG: Complete formatted conversation:")
-                print("-" * 80)
-                print(formatted_message)
-                print("=" * 80 + "\n")
+        # if not hasattr(get_formatted_message_log, "_debug_printed"):
+        #     if i == 0:
+        #         # Print header only at the start of first message
+        #         print("\n" + "=" * 80)
+        #         print("DEBUG: Individual message turns from apply_chat_template")
+        #         print("=" * 80)
+
+        #     print(f"\n[Turn {i + 1}/{len(message_log_strs)}] Role: {message['role']}")
+        #     print("-" * 40)
+        #     print("Extracted message chunk:")
+        #     print(repr(message_chunk))  # Using repr to show special characters
+        #     print(f"Raw text (len={len(message_chunk)}):")
+        #     print(message_chunk)
+        #     print("-" * 40)
+
+        #     if i == len(message_log_strs) - 1:
+        #         # Mark as printed after processing all turns of the first sample
+        #         get_formatted_message_log._debug_printed = True
+        #         print("\n" + "=" * 80)
+        #         print("DEBUG: Complete formatted conversation:")
+        #         print("-" * 80)
+        #         print(formatted_message)
+        #         print("=" * 80 + "\n")
 
         if i == 0:
             if add_bos_token:
diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py
index ed919579b7..298ae7e7f6 100644
--- a/nemo_rl/models/policy/dtensor_policy_worker_v2.py
+++ b/nemo_rl/models/policy/dtensor_policy_worker_v2.py
@@ -18,8 +18,10 @@
 import warnings
 from collections import defaultdict
 from contextlib import AbstractContextManager, contextmanager, nullcontext
+from functools import wraps
 from typing import Any, Generator, Iterable, Optional, cast
 
+import humanize
 import ray
 import torch
 from accelerate import init_empty_weights
@@ -44,6 +46,7 @@
     get_cpu_state_dict,
     to_local_if_dtensor,
 )
+from tabulate import tabulate
 from torch import nn
 from torch.distributed.checkpoint.state_dict import (
     StateDictOptions,
@@ -92,6 +95,26 @@
 from nemo_rl.utils.nsys import wrap_with_nvtx_name
 
 
+def mem_stats(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        torch.cuda.reset_peak_memory_stats()
+        ret = func(*args, **kwargs)
+        allocated = humanize.naturalsize(torch.cuda.memory_allocated())
+        reserved = humanize.naturalsize(torch.cuda.memory_reserved())
+        peak_allocated = humanize.naturalsize(torch.cuda.max_memory_allocated())
+        peak_reserved = humanize.naturalsize(torch.cuda.max_memory_reserved())
+
+        headers = ["Allocated", "Peak Allocated", "Reserved", "Peak Reserved"]
+        row = [[allocated, peak_allocated, reserved, peak_reserved]]
+
+        print("Memory stats:")
+        print(tabulate(row, headers=headers, tablefmt="grid"))
+        return ret
+
+    return wrapper
+
+
 @ray.remote(
     runtime_env=get_runtime_env_for_policy_worker("dtensor_policy_worker_v2")
 )  # pragma: no cover
@@ -180,6 +203,8 @@ def __init__(
             else None,
         )
 
+        # model_config.num_hidden_layers = 2
+
         self.allow_flash_attn_args = self.check_model_allow_flash_attn_args(
             model_config
         )
@@ -245,10 +270,7 @@ def __init__(
         # https://github.com/NVIDIA-NeMo/Automodel/blob/7e748be260651349307862426c0c168cebdeeec3/nemo_automodel/components/_transformers/auto_model.py#L180
         self.model = model_class.from_config(
             model_config,
-            attn_implementation="flash_attention_2"
-            if self.enable_seq_packing
-            else None,
-            use_liger_kernel=False,
+            attn_implementation="sdpa",
             trust_remote_code=True,
             torch_dtype=str(model_config.torch_dtype),
         )
@@ -488,6 +510,7 @@ def get_gpu_info(self) -> dict[str, Any]:
         """Return information about the GPU being used by this worker."""
         return get_gpu_info(self.model)
 
+    @mem_stats
     @wrap_with_nvtx_name("dtensor_policy_worker_v2/train")
     def train(
         self,
@@ -498,6 +521,8 @@ def train(
         mbs: Optional[int] = None,
     ) -> dict[str, Any]:
         """Train the policy on a batch of data with a given loss function."""
+        max_seq_len_in_step = 0
+
         if gbs is None:
             gbs = self.cfg["train_global_batch_size"]
         if mbs is None:
@@ -657,6 +682,8 @@ def train(
                 if len(vlm_kwargs) > 0:
                     position_ids = None
 
+                max_seq_len_in_step = max(max_seq_len_in_step, seq_len)
+
                 context_parallel_ctx = None
                 if self.cp_size > 1:
                     assert len(vlm_kwargs) == 0, (
@@ -859,6 +886,7 @@ def train(
             "gpu_name": torch.cuda.get_device_name(),
             "model_dtype": self.dtype,
"all_mb_metrics": dict(mb_metrics), + "train_max_seq_len": max_seq_len_in_step, } return metrics @@ -880,6 +908,7 @@ def get_logprobs( We use the convention that the logprob of the first token is 0 so that the sequence length is maintained. The logprob of input token i is specified at position i in the output logprobs tensor. """ + max_seq_len_in_step = 0 logprob_batch_size = ( micro_batch_size if micro_batch_size is not None @@ -986,6 +1015,8 @@ def get_logprobs( if len(vlm_kwargs) > 0: position_ids = None + max_seq_len_in_step = max(max_seq_len_in_step, seq_len) + context_parallel_ctx = None if self.cp_size > 1: assert len(vlm_kwargs) == 0, ( @@ -1167,6 +1198,9 @@ def get_logprobs( ) all_log_probs_padded.append(lp) return_data["logprobs"] = torch.cat(all_log_probs_padded, dim=0).cpu() + print(f"get_logprobs: max_seq_len_in_step: {max_seq_len_in_step}") + + # return_data["logprobs_max_seq_len"] = max_seq_len_in_step return return_data diff --git a/nemo_rl/models/policy/lm_policy.py b/nemo_rl/models/policy/lm_policy.py index 23746a8037..c2b1132f52 100644 --- a/nemo_rl/models/policy/lm_policy.py +++ b/nemo_rl/models/policy/lm_policy.py @@ -430,6 +430,7 @@ def train( aggregated_results = { "loss": results[0]["global_loss"], "grad_norm": results[0]["grad_norm"], + "train_max_seq_len": max([r["train_max_seq_len"] for r in results]), } if self.flops_tracker is not None: diff --git a/pyproject.toml b/pyproject.toml index 69d3d9fea4..9260c89ea9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -52,6 +52,8 @@ dependencies = [ "mlflow", "nvidia-nvshmem-cu12", # for deep_ep build "swanlab", + "humanize", + "tabulate", ] [project.optional-dependencies] @@ -202,11 +204,12 @@ requires-dist = ["torch", "einops", "setuptools", "psutil", "ninja"] [tool.black] line-length = 120 include = '\.pyi?$' -exclude = ''' +force-exclude = ''' /( \.git | \.venv | build + | nemo_rl/models/dtensor/custom_models )/ ''' diff --git a/uv.lock b/uv.lock index 50e2abc88f..37e23d311c 100644 --- a/uv.lock +++ b/uv.lock @@ -1714,6 +1714,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/f0/0f/310fb31e39e2d734ccaa2c0fb981ee41f7bd5056ce9bc29b2248bd569169/humanfriendly-10.0-py2.py3-none-any.whl", hash = "sha256:1697e1a8a8f550fd43c2865cd84542fc175a61dcb779b6fee18cf6b6ccba1477", size = 86794, upload-time = "2021-09-17T21:40:39.897Z" }, ] +[[package]] +name = "humanize" +version = "4.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/98/1d/3062fcc89ee05a715c0b9bfe6490c00c576314f27ffee3a704122c6fd259/humanize-4.13.0.tar.gz", hash = "sha256:78f79e68f76f0b04d711c4e55d32bebef5be387148862cb1ef83d2b58e7935a0", size = 81884, upload-time = "2025-08-25T09:39:20.04Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/1e/c7/316e7ca04d26695ef0635dc81683d628350810eb8e9b2299fc08ba49f366/humanize-4.13.0-py3-none-any.whl", hash = "sha256:b810820b31891813b1673e8fec7f1ed3312061eab2f26e3fa192c393d11ed25f", size = 128869, upload-time = "2025-08-25T09:39:18.54Z" }, +] + [[package]] name = "hydra-core" version = "1.3.2" @@ -2901,6 +2910,7 @@ dependencies = [ { name = "colored" }, { name = "datasets" }, { name = "debugpy" }, + { name = "humanize" }, { name = "hydra-core" }, { name = "math-verify" }, { name = "matplotlib" }, @@ -2919,6 +2929,7 @@ dependencies = [ { name = "setuptools" }, { name = "swanlab" }, { name = "sympy" }, + { name = "tabulate" }, { name = "tensorboard" }, { name = "tiktoken" }, { name = "torch", version = "2.7.1", source = { registry = 
"https://pypi.org/simple" }, marker = "sys_platform == 'darwin'" }, @@ -3009,6 +3020,7 @@ requires-dist = [ { name = "flash-attn", marker = "extra == 'automodel'", specifier = "==2.7.4.post1" }, { name = "flash-attn", marker = "extra == 'mcore'", specifier = "==2.7.4.post1" }, { name = "flash-attn", marker = "extra == 'vllm'", specifier = "==2.7.4.post1" }, + { name = "humanize" }, { name = "hydra-core" }, { name = "mamba-ssm", marker = "extra == 'automodel'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, { name = "mamba-ssm", marker = "extra == 'vllm'", git = "https://github.com/state-spaces/mamba.git?rev=2e16fc3062cdcd4ebef27a9aa4442676e1c7edf4" }, @@ -3033,6 +3045,7 @@ requires-dist = [ { name = "setuptools" }, { name = "swanlab" }, { name = "sympy", specifier = ">=1.14.0" }, + { name = "tabulate" }, { name = "tensorboard" }, { name = "tiktoken" }, { name = "torch", marker = "sys_platform != 'darwin'", specifier = "==2.7.1", index = "https://download.pytorch.org/whl/cu128" }, @@ -5617,6 +5630,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tensorboard" version = "2.20.0"