Skip to content

Commit 340fc0c

Browse files
authored
Report progress at gradient accumulation boundary (#2553)
* report progress at gradient accumulation boundary
* format
* format
1 parent 21c2802 commit 340fc0c

File tree

2 files changed

+16
-13
lines changed

2 files changed

+16
-13
lines changed

deepspeed/runtime/engine.py

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -2038,7 +2038,7 @@ def step(self, lr_kwargs=None):
20382038
assert self.optimizer is not None and not isinstance(self.optimizer, DummyOptim), \
20392039
"must provide optimizer during init in order to use step"
20402040

2041-
report_progress = self.global_rank == 0 if self.global_rank else True
2041+
report_progress = False
20422042

20432043
self._step_applied = False # assume False, will flip to True
20442044

@@ -2065,6 +2065,8 @@ def step(self, lr_kwargs=None):
20652065
else:
20662066
self._take_model_step(lr_kwargs)
20672067

2068+
report_progress = self.global_rank == 0 if self.global_rank else True
2069+
20682070
self.tput_timer.stop(report_progress)
20692071

20702072
self._stop_timers(self.engine_timers.step_timers)

deepspeed/utils/timer.py

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -193,18 +193,19 @@ def stop(self, report_speed=True):
193193

194194
curr_samples_sec = (self.batch_size * self.num_workers) / duration
195195

196-
if self.local_step_count % self.steps_per_output == 0:
197-
if report_speed:
198-
self.logging(
199-
"{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
200-
.format(self.epoch_count,
201-
self.local_step_count,
202-
self.avg_samples_per_sec(),
203-
curr_samples_sec,
204-
round(torch.cuda.memory_allocated() / 1024**3,
205-
2),
206-
round(torch.cuda.max_memory_allocated() / 1024**3,
207-
2)))
196+
if report_speed:
197+
self.logging(
198+
"{}/{}, RunningAvgSamplesPerSec={}, CurrSamplesPerSec={}, MemAllocated={}GB, MaxMemAllocated={}GB"
199+
.format(
200+
self.epoch_count,
201+
self.local_step_count,
202+
self.avg_samples_per_sec(),
203+
curr_samples_sec,
204+
round(torch.cuda.memory_allocated() / 1024**3,
205+
2),
206+
round(torch.cuda.max_memory_allocated() / 1024**3,
207+
2),
208+
))
208209
if self.monitor_memory:
209210
virt_mem = psutil.virtual_memory()
210211
swap = psutil.swap_memory()

0 commit comments

Comments
 (0)