diff --git a/userbenchmark/dynamo/dynamobench/common.py b/userbenchmark/dynamo/dynamobench/common.py
index fd3fd7e6c9..20445b3b9d 100644
--- a/userbenchmark/dynamo/dynamobench/common.py
+++ b/userbenchmark/dynamo/dynamobench/common.py
@@ -737,6 +737,11 @@ def maybe_mark_profile(*args, **kwargs):
         row.append(kwargs["compression_ratio"])
         row.append(kwargs["eager_peak_mem"])
         row.append(kwargs["dynamo_peak_mem"])
+
+    if "cache_lookup_latency" in kwargs:
+        headers.append("cache_lookup_latency")
+        row.append(kwargs["cache_lookup_latency"])
+
     if "dynamo_stats" in kwargs:
         for k, v in kwargs["dynamo_stats"].items():
             headers.append(k)
@@ -2678,6 +2683,21 @@ def warmup(fn, model, example_inputs, mode, niters=5):
                     optimized_model_iter_fn, model, example_inputs, "dynamo"
                 )
 
+            if self.args.profile_dynamo_cache_lookup:
+                with torch.profiler.profile(
+                    activities=[torch.profiler.ProfilerActivity.CPU]
+                ) as prof:
+                    with maybe_enable_compiled_autograd(self.args.compiled_autograd):
+                        warmup(optimized_model_iter_fn, model, example_inputs, "dynamo")
+
+                events = list(
+                    filter(
+                        lambda event: "TorchDynamo Cache Lookup" in event.key,
+                        prof.key_averages(),
+                    )
+                )
+                dynamo_cache_lookup_latency = events[0].self_cpu_time_total
+
             compilation_time = dynamo_latency - eager_latency + aot_compilation_time
             compression_ratio = (
                 eager_peak_mem / dynamo_peak_mem if dynamo_peak_mem else 0.0
@@ -2695,6 +2715,10 @@ def warmup(fn, model, example_inputs, mode, niters=5):
             experiment_kwargs["eager_peak_mem"] = eager_peak_mem
             experiment_kwargs["dynamo_peak_mem"] = dynamo_peak_mem
             experiment_kwargs["dynamo_stats"] = dynamo_stats
+            if self.args.profile_dynamo_cache_lookup:
+                experiment_kwargs[
+                    "cache_lookup_latency"
+                ] = dynamo_cache_lookup_latency
 
         if experiment.func is coverage_experiment:
             ok, total = Stats.reset_counters()
@@ -3201,6 +3225,13 @@ def get_example_inputs(self):
         help="Enables compiled autograd on compiled benchmark",
     )
 
+    parser.add_argument(
+        "--profile_dynamo_cache_lookup",
+        "--profile-dynamo-cache-lookup",
+        action="store_true",
+        help="profiles TorchDynamo cache lookup",
+    )
+
     group_fuser = parser.add_mutually_exclusive_group()
     # --nvfuser is now the default, keep the option to not break scripts
     group_fuser.add_argument("--nvfuser", action="store_true", help=argparse.SUPPRESS)
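
For context, the measurement above relies on TorchDynamo emitting a profiler event named "TorchDynamo Cache Lookup" each time a compiled function is entered. Below is a minimal standalone sketch of the same technique outside the benchmark harness; the toy `fn`, the input shape, and the iteration count are illustrative stand-ins for the harness's `optimized_model_iter_fn` and `warmup` loop, and the sketch adds an emptiness guard that the diff itself does not have.

```python
import torch

# Illustrative stand-in for the harness's optimized_model_iter_fn.
@torch.compile
def fn(x):
    return torch.relu(x) + 1

x = torch.randn(8, 8)
fn(x)  # first call compiles; subsequent calls go through the Dynamo cache

# Profile a few cache-hitting iterations on CPU, as the diff does.
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU]
) as prof:
    for _ in range(5):
        fn(x)

# key_averages() aggregates events sharing a key, so at most one entry
# should match the cache-lookup event name the diff filters on.
events = [
    e for e in prof.key_averages() if "TorchDynamo Cache Lookup" in e.key
]
if events:  # guard: the diff's events[0] raises IndexError if no event was recorded
    print(f"cache lookup self CPU time: {events[0].self_cpu_time_total} us")
```

One caveat on the diff itself: `events[0].self_cpu_time_total` assumes the cache-lookup event is always present, so a profiled warmup that records no such event (for example, if compilation fell back to eager) would raise an `IndexError` rather than reporting a missing latency.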