diff --git a/README.md b/README.md
index e7f61bf20..e11122f5e 100644
--- a/README.md
+++ b/README.md
@@ -640,7 +640,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher,
 
 # Profiling
 
-We support profiling with Nsight Systems and PyTorch Memory Profiling.
+We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
 
 ## Nsight Systems Profiling
 
@@ -656,6 +656,15 @@ The generated output file can then be viewed with the Nsight Systems GUI:
 
 ![Alt text](images/nsight_profiling.png)
 
+## PyTorch Profiling
+
+To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`.
+
+The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within
+TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+
+![Alt text](images/pytorch_profiling.png)
+
 ## PyTorch Memory Profiling
 
 To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`.
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 48c03f15a..1dbb4dd8a 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 0d5992f
+  Default = b68ba6d
 
   current git hash of repository
diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png
new file mode 100644
index 000000000..e85324dc6
Binary files /dev/null and b/images/pytorch_profiling.png differ
diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp
index aca290854..9b062b050 100644
--- a/megatron/data/helpers.cpp
+++ b/megatron/data/helpers.cpp
@@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
                     }
                 }  // for (auto sent_index=sent_index_first; ...
-            }  // if (num_remain_sent > 1) {
-        }  // for (int doc=0; doc < num_docs; ++doc) {
-    }  // for (int epoch=0; epoch < num_epochs; ++epoch) {
+            }      // if (num_remain_sent > 1) {
+        }          // for (int doc=0; doc < num_docs; ++doc) {
+    }              // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
     if (!second) {
         if (verbose) {
@@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
                         num_sent = 0;
                     }
                 }  // for (auto sent_index=sent_index_first; ...
-            }  // if (num_remain_sent > 1) {
-        }  // for (int doc=0; doc < num_docs; ++doc) {
-    }  // for (int epoch=0; epoch < num_epochs; ++epoch) {
+            }      // if (num_remain_sent > 1) {
+        }          // for (int doc=0; doc < num_docs; ++doc) {
+    }              // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
     if (!second) {
         if (verbose) {
diff --git a/megatron/training.py b/megatron/training.py
index 6a4e843ab..3265680c5 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -970,7 +970,28 @@ def train(
 
     # to monitor if we've skipped many iterations in a row and trigger an early exit
     overflow_monitor = OverflowMonitor(optimizer)
+
+    if neox_args.profile:
+        schedule = torch.profiler.schedule(
+            wait=neox_args.profile_step_start,
+            warmup=1,
+            active=neox_args.profile_step_stop - neox_args.profile_step_start,
+        )
+        prof = torch.profiler.profile(
+            schedule=schedule,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                neox_args.tensorboard_dir
+            ),
+            record_shapes=True,
+            profile_memory=True,
+            with_flops=True,
+            with_modules=True,
+            with_stack=True,
+        )
+        prof.start()
     while iteration < neox_args.train_iters:
+        if neox_args.profile:
+            prof.step()
         if neox_args.profile and iteration == neox_args.profile_step_start:
             torch.cuda.cudart().cudaProfilerStart()
         loss_dict, skipped_iter = train_step(
@@ -983,6 +1004,7 @@ def train(
         )
         if neox_args.profile and iteration == neox_args.profile_step_stop:
             torch.cuda.cudart().cudaProfilerStop()
+            prof.stop()
         iteration += 1
         neox_args.iteration = iteration
         if neox_args.precision == "fp16":
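For reviewers, here is a minimal standalone sketch (not part of the diff) of how the new `torch.profiler` block in `megatron/training.py` behaves. The toy loop, the literal values `10`/`12`, and the `./tensorboard` output directory are hypothetical stand-ins for a real `train()` loop and the `profile_step_start`, `profile_step_stop`, and `tensorboard_dir` config options:

```python
import torch

# Hypothetical stand-ins for the NeoX config options of the same names.
profile_step_start, profile_step_stop = 10, 12

prof = torch.profiler.profile(
    # Idle through the first `wait` steps, spend one step warming up, then
    # record `active` steps, mirroring the block added in train(). (With
    # `repeat` left at its default, the cycle recurs if stepping continues.)
    schedule=torch.profiler.schedule(
        wait=profile_step_start,
        warmup=1,
        active=profile_step_stop - profile_step_start,
    ),
    # Writes a trace that TensorBoard's profiler plugin can load.
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./tensorboard"),
    record_shapes=True,
    profile_memory=True,
    with_flops=True,
    with_modules=True,
    with_stack=True,
)
prof.start()
for iteration in range(20):
    prof.step()  # advance the schedule once per iteration, as in the while loop
    x = torch.randn(64, 64, requires_grad=True)
    (x @ x).sum().backward()  # stand-in for train_step(...)
prof.stop()
```

The handler writes the trace once the active window closes; note that where `prof.step()` falls relative to the iteration's work determines exactly which iterations land inside that window.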