import torch
import torch._dynamo.config
import torch._inductor.config

# Have Inductor's C++ backend emit per-kernel timing so generated CPU
# kernels show up attributed in profiler tables (needed for the CPU
# profiling path added below).
torch._inductor.config.cpp.enable_kernel_profile = True
1617def device_sync (device ):
1718 if "cuda" in device :
def encode_tokens(tokenizer, string, bos=True, device='cuda'):
    """Encode *string* into a 1-D tensor of token ids on *device*.

    Args:
        tokenizer: project tokenizer exposing ``encode()`` and ``bos_id()``.
        string: text to tokenize.
        bos: when True, prepend the tokenizer's beginning-of-sequence id.
        device: torch device string for the resulting tensor.

    Returns:
        ``torch.Tensor`` of dtype ``torch.int`` placed on *device*.
    """
    tokens = tokenizer.encode(string)
    if bos:
        tokens = [tokenizer.bos_id()] + tokens
    # Use the explicit `device` parameter, not the module-level `args`:
    # reading `args.device` here silently ignores the caller's argument
    # and makes the function fail wherever `args` is not defined.
    return torch.tensor(tokens, dtype=torch.int, device=device)
137138def _load_model (checkpoint_path , device , precision , use_tp ):
138139 with torch .device ('meta' ):
@@ -248,8 +249,13 @@ def callback(x):
248249 if (i != num_samples - 1 or not profile ) or (use_tp and rank != 0 ):
249250 prof = contextlib .nullcontext ()
250251 else :
251- torch .profiler ._utils ._init_for_cuda_graphs ()
252- prof = torch .profiler .profile ()
252+ if device == 'cuda' :
253+ torch .profiler ._utils ._init_for_cuda_graphs ()
254+ prof = torch .profiler .profile (activities = [torch .profiler .ProfilerActivity .CPU , torch .profiler .ProfilerActivity .CUDA ], use_cuda = True )
255+ profile_sort = 'self_cuda_time_total'
256+ elif device == 'cpu' :
257+ prof = torch .profiler .profile (activities = [torch .profiler .ProfilerActivity .CPU ])
258+ profile_sort = 'self_cpu_time_total'
253259 with prof :
254260 y = generate (
255261 model ,
@@ -263,6 +269,8 @@ def callback(x):
263269 if i == - 1 :
264270 print (f"Compilation time: { time .perf_counter () - t0 :.2f} seconds" )
265271 continue
272+ if hasattr (prof , "key_averages" ):
273+ print (prof .key_averages ().table (sort_by = profile_sort , row_limit = - 1 ))
266274 if hasattr (prof , "export_chrome_trace" ):
267275 if use_tp :
268276 prof .export_chrome_trace (f"{ profile } _rank_{ rank } .json" )
0 commit comments