Flush SGLang KV cache before weight update begins

PrinsYin · PrinsYin · commit 2e7b82a4e96b · 2025-11-30T14:02:59.000-05:00
diff --git a/nemo_rl/algorithms/grpo.py b/nemo_rl/algorithms/grpo.py
@@ -984,6 +984,12 @@ def refit_policy_generation(
                 
                 # Stream weights via HTTP
                 # Each training worker will match its GPU UUID to the corresponding SGLang server
+                # Megatron-style: Flush cache before weight updates
+                print("[sglang refit] Flushing KV cache before weight updates (Megatron-style)...", flush=True)
+                flush_success = policy_generation.invalidate_kv_cache()
+                if not flush_success:
+                    print("[sglang refit] WARNING - Cache flush had issues, but continuing with weight update", flush=True)
+                
                 print("[sglang refit] Starting weight streaming via HTTP...", flush=True)
                 futures_train = policy.stream_weights_via_http(
                     sglang_url_to_gpu_uuids=sglang_url_to_gpu_uuids,
diff --git a/nemo_rl/models/generation/sglang/sglang_generation.py b/nemo_rl/models/generation/sglang/sglang_generation.py
@@ -353,22 +353,27 @@ def __del__(self) -> None:
         self.shutdown()
 
     def invalidate_kv_cache(self) -> bool:
-        """Invalidate KV cache after weight updates.
+        """Invalidate KV cache before weight updates (Megatron-style).
         
-        For SGLang, this might need to call a different method or might not be needed
-        if the server handles it automatically.
+        Asks the workers to run "invalidate_kv_cache" and blocks on ray.get until every call has returned.
+        The call is dispatched with run_rank_0_only_axes=["tensor_parallel"] (tensor-parallel rank 0 only).
+        
+        Returns:
+            bool: True if every non-None worker result is truthy (also when there are none); False if any is falsy or an exception is raised.
         """
         try:
-            # For SGLang, we can call a method on each worker if it exists
-            futures = []
-            for worker in self.worker_group.workers:
-                if hasattr(worker, "invalidate_kv_cache"):
-                    futures.append(worker.invalidate_kv_cache.remote())
-            
-            if futures:
-                results = ray.get(futures)
-                return all(result for result in results if result is not None)
-            return True
+            futures = self.worker_group.run_all_workers_single_data(
+                "invalidate_kv_cache",
+                run_rank_0_only_axes=["tensor_parallel"],
+            )
+            results = ray.get(futures)
+            results = [r for r in results if r is not None]
+            success = all(result for result in results) if results else True
+            if success:
+                print("[sglang refit] All SGLang server caches flushed successfully", flush=True)
+            else:
+                print("[sglang refit] WARNING - Some SGLang server caches failed to flush", flush=True)
+            return success
         except Exception as e:
-            print(f"Error invalidating SGLang caches: {e}")
+            print(f"[sglang refit] Error flushing SGLang caches: {e}", flush=True)
             return False
diff --git a/nemo_rl/models/policy/dtensor_policy_worker_v2.py b/nemo_rl/models/policy/dtensor_policy_worker_v2.py
@@ -1757,8 +1757,10 @@ def stream_weights_via_http(
         current_device_uuid = self.report_device_id()
 
         def dtensor_params_generator():
-            """Generator that yields (name, tensor) pairs, converting DTensors to local tensors."""
-            for name, tensor in self.model.state_dict().items():
+            """Generator that yields (name, tensor) pairs, converting DTensors to local tensors.
+            """
+            state_dict_items = sorted(self.model.state_dict().items(), key=lambda x: x[0])
+            for name, tensor in state_dict_items:
                 if isinstance(tensor, DTensor):
                     # Convert DTensor to full tensor for streaming
                     full_tensor = tensor.full_tensor()
@@ -1770,7 +1772,6 @@ def dtensor_params_generator():
                 else:
                     # Convert to target dtype
                     yield name, tensor.to(self.dtype, non_blocking=True).contiguous()
-
         # Use the HTTP implementation
         stream_weights_via_http_impl(
             params_generator=dtensor_params_generator(),
diff --git a/nemo_rl/models/policy/utils.py b/nemo_rl/models/policy/utils.py
@@ -524,6 +524,7 @@ def stream_weights_via_http_impl(
         from sglang.srt.utils.patch_torch import monkey_patch_torch_reductions
     except ImportError:
         from sglang.srt.patch_torch import monkey_patch_torch_reductions
+    print(f"[sglang refit details] entering stream_weights_via_http_impl")
     
     monkey_patch_torch_reductions()
     
@@ -559,6 +560,13 @@ def stream_weights_via_http_impl(
         tensor_list = list(params_generator)
         total_tensors = len(tensor_list)
         
+        if rank == ipc_gather_src:
+            print(
+                f"[sglang refit details] {worker_name}: Starting weight update - "
+                f"Total parameters to update: {total_tensors}",
+                flush=True
+            )
+        
         for idx, (name, tensor) in enumerate(tensor_list):
             torch.cuda.current_stream().synchronize()
             tensor = tensor.contiguous().cuda()
@@ -574,10 +582,9 @@ def stream_weights_via_http_impl(
             )
             
             if rank == ipc_gather_src:
-                is_last = (idx == total_tensors - 1)
                 _send_tensor_to_sglang(
                     url, name, gathered_handlers, tensor.shape, str(tensor.dtype),
-                    flush_cache=is_last
+                    flush_cache=False
                 )
                 tensor_count += 1
             
@@ -586,11 +593,18 @@ def stream_weights_via_http_impl(
                 del gathered_handlers
             torch.cuda.empty_cache()
         
-        if rank == 0:
+        if rank == ipc_gather_src:
             print(
-                f"[sglang refit] {worker_name}: Sent {tensor_count} tensors to SGLang server: {base_url}",
+                f"[sglang refit details] {worker_name}: Weight update completed - "
+                f"Successfully updated {tensor_count}/{total_tensors} parameters to SGLang server: {base_url}",
                 flush=True
             )
+            if tensor_count != total_tensors:
+                print(
+                    f"[sglang refit details] {worker_name}: WARNING - Expected {total_tensors} tensors, "
+                    f"but only sent {tensor_count}",
+                    flush=True
+                )
     
     except Exception as e:
         print(