2323#include  < executorch/runtime/core/error.h> 
2424#include  < executorch/runtime/platform/log.h> 
2525
26+ #include  < cuda_runtime.h> 
27+ 
2628#define  STB_IMAGE_IMPLEMENTATION 
2729#include  < stb_image.h> 
2830#define  STB_IMAGE_RESIZE_IMPLEMENTATION 
@@ -67,6 +69,20 @@ using ::executorch::extension::llm::make_text_input;
6769using  ::executorch::extension::llm::MultimodalInput;
6870using  ::executorch::runtime::EValue;
6971
72+ size_t  get_gpu_memory_used () {
73+   size_t  free_bytes = 0 ;
74+   size_t  total_bytes = 0 ;
75+   cudaError_t status = cudaMemGetInfo (&free_bytes, &total_bytes);
76+   if  (status != cudaSuccess) {
77+     ET_LOG (
78+         Error,
79+         " Warning: cudaMemGetInfo failed: %s" 
80+         cudaGetErrorString (status));
81+     return  0 ;
82+   }
83+   return  total_bytes - free_bytes;
84+ }
85+ 
7086bool  ends_with (const  std::string& str, const  std::string& suffix) {
7187  return  str.size () >= suffix.size () &&
7288      str.compare (str.size () - suffix.size (), suffix.size (), suffix) == 0 ;
@@ -200,13 +216,29 @@ int32_t main(int32_t argc, char** argv) {
200216    return  1 ;
201217  }
202218
219+   //  Measure memory before loading
220+   cudaDeviceSynchronize ();
221+   size_t  mem_before_load = get_gpu_memory_used ();
222+   ET_LOG (
223+       Info,
224+       " GPU memory before loading: %.2f MB" 
225+       mem_before_load / (1024.0  * 1024.0 ));
226+ 
203227  //  Load runner
204228  auto  load_error = runner->load ();
205229  if  (load_error != ::executorch::runtime::Error::Ok) {
206230    ET_LOG (Error, " Failed to load multimodal runner" 
207231    return  1 ;
208232  }
209233
234+   //  Measure memory after loading
235+   cudaDeviceSynchronize ();
236+   size_t  mem_after_load = get_gpu_memory_used ();
237+   ET_LOG (
238+       Info,
239+       " GPU memory after loading: %.2f MB" 
240+       mem_after_load / (1024.0  * 1024.0 ));
241+ 
210242  //  Prepare inputs
211243  std::vector<MultimodalInput> inputs = {
212244      make_text_input (" <start_of_turn>user\n <start_of_image>" 
@@ -230,13 +262,64 @@ int32_t main(int32_t argc, char** argv) {
230262    runner->reset ();
231263  }
232264
265+   //  Measure memory before generation
266+   cudaDeviceSynchronize ();
267+   size_t  mem_before_gen = get_gpu_memory_used ();
268+ 
233269  auto  error = runner->generate (inputs, config);
234270
235271  if  (error != ::executorch::runtime::Error::Ok) {
236272    ET_LOG (Error, " Failed to generate with multimodal runner\n " 
237273    return  1 ;
238274  }
275+ 
276+   //  Measure memory after generation
277+   cudaDeviceSynchronize ();
278+   size_t  mem_after_gen = get_gpu_memory_used ();
279+ 
239280  ET_LOG (Info, " Generated successfully" 
240281
282+   //  Calculate and print memory usage statistics
283+   size_t  load_memory = mem_after_load - mem_before_load;
284+   size_t  gen_memory =
285+       mem_after_gen > mem_before_gen ? (mem_after_gen - mem_before_gen) : 0 ;
286+   size_t  total_memory = mem_after_gen - mem_before_load;
287+   size_t  peak_memory = mem_after_gen;
288+ 
289+   std::printf (" \n === CUDA Memory Usage Statistics ===\n " 
290+   std::printf (
291+       " Memory before loading:          %.2f MB (%zu bytes)\n " 
292+       mem_before_load / (1024.0  * 1024.0 ),
293+       mem_before_load);
294+   std::printf (
295+       " Memory after loading:           %.2f MB (%zu bytes)\n " 
296+       mem_after_load / (1024.0  * 1024.0 ),
297+       mem_after_load);
298+   std::printf (
299+       " Memory consumed by loading:     %.2f MB (%zu bytes)\n " 
300+       load_memory / (1024.0  * 1024.0 ),
301+       load_memory);
302+   std::printf (
303+       " Memory before generation:       %.2f MB (%zu bytes)\n " 
304+       mem_before_gen / (1024.0  * 1024.0 ),
305+       mem_before_gen);
306+   std::printf (
307+       " Memory after generation:        %.2f MB (%zu bytes)\n " 
308+       mem_after_gen / (1024.0  * 1024.0 ),
309+       mem_after_gen);
310+   std::printf (
311+       " Memory consumed by generation:  %.2f MB (%zu bytes)\n " 
312+       gen_memory / (1024.0  * 1024.0 ),
313+       gen_memory);
314+   std::printf (
315+       " Total memory consumed:          %.2f MB (%zu bytes)\n " 
316+       total_memory / (1024.0  * 1024.0 ),
317+       total_memory);
318+   std::printf (
319+       " Peak GPU memory used:           %.2f MB (%zu bytes)\n " 
320+       peak_memory / (1024.0  * 1024.0 ),
321+       peak_memory);
322+   std::printf (" ====================================\n\n " 
323+ 
241324  return  0 ;
242325}
0 commit comments