
Commit f9698c7

nelyahu and loadams authored
pipe engine eval_batch: add option to disable loss broadcast (#4326)
It is sometimes unnecessary to broadcast the loss to all ranks after an evaluation cycle; only some ranks may need it, and the broadcast adds communication overhead between ranks. Setting bcast_loss=False (the default is True, which retains the previous behavior) skips the broadcast. If the monitor is enabled, the loss is broadcast regardless.

Co-authored-by: Logan Adams <[email protected]>
1 parent 8e64c3b commit f9698c7
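
For context, here is a minimal sketch of the communication the new flag avoids, assuming a torch.distributed process group. The helper name and details are illustrative only, not DeepSpeed's actual `_bcast_pipe_scalar` implementation:

```python
import torch
import torch.distributed as dist

def bcast_scalar_from_last_stage(value: float, last_stage_rank: int, device) -> float:
    # Illustrative only: broadcast a scalar loss from the last pipeline
    # stage to every rank. Every rank must enter this collective, which
    # is the per-eval-cycle overhead that bcast_loss=False skips.
    tensor = torch.tensor([value], dtype=torch.float32, device=device)
    dist.broadcast(tensor, src=last_stage_rank)
    return tensor.item()
```

On ranks other than the last stage, `value` can be a dummy; the broadcast overwrites the tensor in place.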

File tree

1 file changed: +2, -2 lines


deepspeed/runtime/pipe/engine.py

Lines changed: 2 additions & 2 deletions
```diff
@@ -386,7 +386,7 @@ def train_batch(self, data_iter=None):
         # TODO: should return precisely what loss returned and allow others to be queried?
         return self.agg_train_loss
 
-    def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg'):
+    def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_output='avg', bcast_loss=True):
         """Evaluate the pipeline on a batch of data from ``data_iter``. The
         engine will evaluate ``self.train_batch_size()`` total samples
         collectively across all workers.
@@ -449,7 +449,7 @@ def eval_batch(self, data_iter, return_logits=False, compute_loss=True, reduce_o
         if self.is_last_stage():
             eval_output = self._reduce_outputs(self.fwd_outputs, reduce=reduce_output)
 
-        if compute_loss:
+        if compute_loss and (bcast_loss or self.monitor.enabled):
             eval_output = self._bcast_pipe_scalar(eval_output)
 
         if self.global_rank == 0 and self.monitor.enabled:
```
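
With the new flag, a caller that only needs the loss on the last pipeline stage can skip the broadcast. A hedged usage sketch (the evaluation loop and variable names are illustrative):

```python
# `engine` is assumed to be an initialized DeepSpeed pipeline engine
# and `data_iter` an evaluation data iterator.
loss = engine.eval_batch(data_iter, bcast_loss=False)

# Without the broadcast, only the last pipeline stage holds the
# aggregated loss; other ranks should not rely on the return value.
if engine.is_last_stage():
    print(f"eval loss: {loss}")
```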
