Report progress at gradient accumulation boundary (#2553)

ShijieZZZZ · web-flow · commit 340fc0cf1926 · 2022-11-28T20:39:38.000-05:00
* report progress only at the gradient accumulation boundary

* format

* format
diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
@@ -2038,7 +2038,7 @@ def step(self, lr_kwargs=None):
         assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \
             "must provide optimizer during init in order to use step"
 
-        report_progress = self.global_rank == 0 if self.global_rank else True
+        report_progress = False
 
         self._step_applied = False  # assume False, will flip to True
 
@@ -2065,6 +2065,8 @@ def step(self, lr_kwargs=None):
             else:
                 self._take_model_step(lr_kwargs)
 
+            report_progress = self.global_rank == 0 if self.global_rank else True
+
         self.tput_timer.stop(report_progress)
 
         self._stop_timers(self.engine_timers.step_timers)
diff --git a/deepspeed/utils/timer.py b/deepspeed/utils/timer.py
@@ -193,18 +193,19 @@ def stop(self, report_speed=True):
 
             curr_samples_sec = (self.batch_size * self.num_workers) / duration
 
-            if self.local_step_count % self.steps_per_output == 0:
-                if report_speed:
-                    self.logging(
-                        "{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
-                        .format(self.epoch_count,
-                                self.local_step_count,
-                                self.avg_samples_per_sec(),
-                                curr_samples_sec,
-                                round(torch.cuda.memory_allocated() / 1024**3,
-                                      2),
-                                round(torch.cuda.max_memory_allocated() / 1024**3,
-                                      2)))
+            if report_speed:
+                self.logging(
+                    "{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
+                    .format(
+                        self.epoch_count,
+                        self.local_step_count,
+                        self.avg_samples_per_sec(),
+                        curr_samples_sec,
+                        round(torch.cuda.memory_allocated() / 1024**3,
+                              2),
+                        round(torch.cuda.max_memory_allocated() / 1024**3,
+                              2),
+                    ))
                 if self.monitor_memory:
                     virt_mem = psutil.virtual_memory()
                     swap = psutil.swap_memory()