diff --git a/README.md b/README.md
index e7f61bf20..e11122f5e 100644
--- a/README.md
+++ b/README.md
@@ -640,7 +640,7 @@ If you need to supply a hostfile for use with the MPI-based DeepSpeed launcher,
 
 # Profiling
 
-We support profiling with Nsight Systems and PyTorch Memory Profiling.
+We support profiling with Nsight Systems, the PyTorch Profiler, and PyTorch Memory Profiling.
 
 ## Nsight Systems Profiling
 
@@ -656,6 +656,15 @@ The generated output file can then be viewed with the Nsight Systems GUI:
 
 ![Alt text](images/nsight_profiling.png)
 
+## PyTorch Profiling
+
+To use the built-in PyTorch profiler, set config options `profile`, `profile_step_start`, and `profile_step_stop`.
+
+The PyTorch profiler will save traces to your `tensorboard` log directory. You can view these traces within
+TensorBoard by following the steps [here](https://pytorch.org/tutorials/intermediate/tensorboard_profiler_tutorial.html).
+
+![Alt text](images/pytorch_profiling.png)
+
 ## PyTorch Memory Profiling
 
 To use PyTorch Memory Profiling, set config options `memory_profiling` and `memory_profiling_path`.
diff --git a/configs/neox_arguments.md b/configs/neox_arguments.md
index 48c03f15a..1dbb4dd8a 100644
--- a/configs/neox_arguments.md
+++ b/configs/neox_arguments.md
@@ -111,7 +111,7 @@ Logging Arguments
 
 - **git_hash**: str
 
-  Default = 0d5992f
+  Default = b68ba6d
 
   current git hash of repository
diff --git a/images/pytorch_profiling.png b/images/pytorch_profiling.png
new file mode 100644
index 000000000..e85324dc6
Binary files /dev/null and b/images/pytorch_profiling.png differ
diff --git a/megatron/data/helpers.cpp b/megatron/data/helpers.cpp
index aca290854..9b062b050 100644
--- a/megatron/data/helpers.cpp
+++ b/megatron/data/helpers.cpp
@@ -428,9 +428,9 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
                     }
                 }  // for (auto sent_index=sent_index_first; ...
-            }  // if (num_remain_sent > 1) {
-        }  // for (int doc=0; doc < num_docs; ++doc) {
-    }  // for (int epoch=0; epoch < num_epochs; ++epoch) {
+            }      // if (num_remain_sent > 1) {
+        }          // for (int doc=0; doc < num_docs; ++doc) {
+    }              // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
     if (!second) {
         if (verbose) {
@@ -660,9 +660,9 @@ py::array build_blocks_mapping_impl(const py::array_t<int64_t>& docs_,
                         num_sent = 0;
                     }
                 }  // for (auto sent_index=sent_index_first; ...
-            }  // if (num_remain_sent > 1) {
-        }  // for (int doc=0; doc < num_docs; ++doc) {
-    }  // for (int epoch=0; epoch < num_epochs; ++epoch) {
+            }      // if (num_remain_sent > 1) {
+        }          // for (int doc=0; doc < num_docs; ++doc) {
+    }              // for (int epoch=0; epoch < num_epochs; ++epoch) {
 
     if (!second) {
         if (verbose) {
diff --git a/megatron/training.py b/megatron/training.py
index 6a4e843ab..3265680c5 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -970,7 +970,28 @@ def train(
 
     # to monitor if we've skipped many iterations in a row and trigger an early exit
     overflow_monitor = OverflowMonitor(optimizer)
+
+    if neox_args.profile:
+        schedule = torch.profiler.schedule(
+            wait=neox_args.profile_step_start,
+            warmup=1,
+            active=neox_args.profile_step_stop - neox_args.profile_step_start,
+        )
+        prof = torch.profiler.profile(
+            schedule=schedule,
+            on_trace_ready=torch.profiler.tensorboard_trace_handler(
+                neox_args.tensorboard_dir
+            ),
+            record_shapes=True,
+            profile_memory=True,
+            with_flops=True,
+            with_modules=True,
+            with_stack=True,
+        )
+        prof.start()
     while iteration < neox_args.train_iters:
+        if neox_args.profile:
+            prof.step()
         if neox_args.profile and iteration == neox_args.profile_step_start:
             torch.cuda.cudart().cudaProfilerStart()
         loss_dict, skipped_iter = train_step(
@@ -983,6 +1004,7 @@ def train(
         )
         if neox_args.profile and iteration == neox_args.profile_step_stop:
             torch.cuda.cudart().cudaProfilerStop()
+            prof.stop()
         iteration += 1
         neox_args.iteration = iteration
         if neox_args.precision == "fp16":
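For reviewers, here is a minimal standalone sketch (not part of the diff) of how the new `torch.profiler` block in `megatron/training.py` behaves. The toy loop, the literal values `10`/`12`, and the `./tensorboard` output directory are hypothetical stand-ins for a real `train()` loop and the `profile_step_start`, `profile_step_stop`, and `tensorboard_dir` config options:

```python
import torch

# Hypothetical stand-ins for the NeoX config options of the same names.
profile_step_start, profile_step_stop = 10, 12

prof = torch.profiler.profile(
    # Idle through the first `wait` steps, spend one step warming up, then
    # record `active` steps, mirroring the block added in train(). (With
    # `repeat` left at its default, the cycle recurs if stepping continues.)
    schedule=torch.profiler.schedule(
        wait=profile_step_start,
        warmup=1,
        active=profile_step_stop - profile_step_start,
    ),
    # Writes a trace that TensorBoard's profiler plugin can load.
    on_trace_ready=torch.profiler.tensorboard_trace_handler("./tensorboard"),
    record_shapes=True,
    profile_memory=True,
    with_flops=True,
    with_modules=True,
    with_stack=True,
)
prof.start()
for iteration in range(20):
    prof.step()  # advance the schedule once per iteration, as in the while loop
    x = torch.randn(64, 64, requires_grad=True)
    (x @ x).sum().backward()  # stand-in for train_step(...)
prof.stop()
```

The handler writes the trace once the active window closes; note that where `prof.step()` falls relative to the iteration's work determines exactly which iterations land inside that window.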