volcengine · puneeshkhanna · Oct 7, 2025
@@ -437,7 +437,8 @@ def _compute_loss_and_backward(self, batch, do_backward=True, n_micro_batches=1)
 
             if self.config.data.balance_dp_token:
                 torch.distributed.all_reduce(valid_token_this_rank)
-                dp_size = self.ulysses_device_mesh.size("dp") if use_sp else torch.distributed.get_world_size()
+                # valid_token_this_rank is all-reduced over every rank (including SP ranks), so dp_size is set to the world size
+                dp_size = torch.distributed.get_world_size()
             else:
                 dp_size = 1