Commit 8f13885

adding cuda memory estimation support

Pull Request resolved: #15294
Differential Revision: [D85119089](https://our.internmc.facebook.com/intern/diff/D85119089/)
ghstack-source-id: 317766112

1 parent ff6deb2 · commit 8f13885

2 files changed (+114, -0 lines)
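Both files apply the same measurement technique: synchronize the device, snapshot occupancy with cudaMemGetInfo, run the workload, synchronize again, and take the difference. A minimal standalone sketch of that pattern (illustrative only, not code from this commit; the 64 MiB cudaMalloc is a hypothetical stand-in for model load or inference work):

// Illustrative sketch of the commit's measurement pattern; the allocation
// below is a hypothetical stand-in for the real workload.
#include <cstdio>
#include <cuda_runtime.h>

static size_t used_bytes() {
  size_t free_b = 0, total_b = 0;
  // cudaMemGetInfo reports device-wide free/total bytes.
  if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) {
    return 0; // Mirror the commit's behavior: warn-and-continue, report zero.
  }
  return total_b - free_b;
}

int main() {
  cudaFree(nullptr); // Force CUDA context creation up front so its
                     // overhead is not attributed to the workload.
  cudaDeviceSynchronize();
  size_t before = used_bytes();

  void* buf = nullptr;
  cudaMalloc(&buf, 64 << 20); // "Workload": a 64 MiB device allocation.

  cudaDeviceSynchronize(); // Ensure pending async work has finished allocating.
  size_t after = used_bytes();
  std::printf(
      "delta: %.2f MB\n",
      (after > before ? after - before : 0) / (1024.0 * 1024.0));

  cudaFree(buf);
  return 0;
}

Because the query is device-wide, the delta also picks up allocations from other processes sharing the GPU, which is presumably why the commit calls this "estimation" rather than exact accounting.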

backends/cuda/tests/multimodal_benchmark.cpp (31 additions, 0 deletions)

@@ -17,6 +17,9 @@
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <executorch/runtime/core/portable_type/tensor.h>

+#include <cuda_runtime.h>
+#include <nvml.h>
+
 namespace {

 using executorch::aten::ScalarType;
@@ -201,8 +204,21 @@ TensorPtr create_fallback_text_embedding(const ModelConfig& config) {
 struct MethodTiming {
   double load_ms{0.0};
   double run_ms{0.0};
+  size_t peak_gpu_memory_bytes{0};
 };

+size_t get_gpu_memory_used() {
+  size_t free_bytes = 0;
+  size_t total_bytes = 0;
+  cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
+  if (status != cudaSuccess) {
+    std::cerr << "Warning: cudaMemGetInfo failed: "
+              << cudaGetErrorString(status) << std::endl;
+    return 0;
+  }
+  return total_bytes - free_bytes;
+}
+
 enum class MethodCategory { ENCODER, TOKEN_EMBEDDING, TEXT_DECODER, UNKNOWN };

 MethodCategory categorize_method(const std::string& method_name) {
@@ -306,6 +322,9 @@ Error execute_method(
   std::vector<EValue> inputs = create_inputs_for_method(
       method_name, category, model_type, config, token_output, owned_inputs);

+  cudaDeviceSynchronize();
+  size_t mem_before = get_gpu_memory_used();
+
   const auto run_start = Clock::now();
   ET_LOG(Info, "%s running", method_name.c_str());
   Result<std::vector<EValue>> output_result =
@@ -314,6 +333,11 @@
   const auto run_end = Clock::now();
   timing.run_ms = DurationMs(run_end - run_start).count();

+  cudaDeviceSynchronize();
+  size_t mem_after = get_gpu_memory_used();
+  timing.peak_gpu_memory_bytes =
+      mem_after > mem_before ? (mem_after - mem_before) : 0;
+
   if (output_result.error() != Error::Ok) {
     std::cerr << method_name << " execution failed: error code "
               << static_cast<int>(output_result.error()) << std::endl;
@@ -457,6 +481,13 @@ int main(int argc, char** argv) {
     std::cout << " " << name << ": " << timing.run_ms << std::endl;
   }

+  std::cout << "\nPeak GPU memory usage:" << std::endl;
+  for (const auto& [name, timing] : timings) {
+    double memory_mb = timing.peak_gpu_memory_bytes / (1024.0 * 1024.0);
+    std::cout << " " << name << ": " << memory_mb << " MB ("
+              << timing.peak_gpu_memory_bytes << " bytes)" << std::endl;
+  }
+
   return 0;
 } catch (const std::exception& ex) {
   std::cerr << "Unhandled exception: " << ex.what() << std::endl;
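An aside on the includes: the first hunk adds both <cuda_runtime.h> and <nvml.h>, yet get_gpu_memory_used() only calls the CUDA runtime. For reference, the equivalent device-wide query through NVML would look roughly like this (an illustrative sketch, not code from this commit):

// Hypothetical NVML variant of get_gpu_memory_used(); not part of the diff.
#include <nvml.h>

size_t nvml_gpu_memory_used(unsigned int device_index) {
  if (nvmlInit_v2() != NVML_SUCCESS) {
    return 0;
  }
  size_t used = 0;
  nvmlDevice_t device;
  nvmlMemory_t mem;
  if (nvmlDeviceGetHandleByIndex_v2(device_index, &device) == NVML_SUCCESS &&
      nvmlDeviceGetMemoryInfo(device, &mem) == NVML_SUCCESS) {
    used = mem.used; // Device-wide used bytes, like total - free above.
  }
  nvmlShutdown(); // NVML init/shutdown is reference-counted.
  return used;
}

Linking against NVML typically requires -lnvidia-ml in addition to the -lcudart needed for the runtime calls.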

examples/models/gemma3/e2e_runner.cpp (83 additions, 0 deletions)

@@ -23,6 +23,8 @@
 #include <executorch/runtime/core/error.h>
 #include <executorch/runtime/platform/log.h>

+#include <cuda_runtime.h>
+
 #define STB_IMAGE_IMPLEMENTATION
 #include <stb_image.h>
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
@@ -67,6 +69,20 @@ using ::executorch::extension::llm::make_text_input;
 using ::executorch::extension::llm::MultimodalInput;
 using ::executorch::runtime::EValue;

+size_t get_gpu_memory_used() {
+  size_t free_bytes = 0;
+  size_t total_bytes = 0;
+  cudaError_t status = cudaMemGetInfo(&free_bytes, &total_bytes);
+  if (status != cudaSuccess) {
+    ET_LOG(
+        Error,
+        "Warning: cudaMemGetInfo failed: %s",
+        cudaGetErrorString(status));
+    return 0;
+  }
+  return total_bytes - free_bytes;
+}
+
 bool ends_with(const std::string& str, const std::string& suffix) {
   return str.size() >= suffix.size() &&
       str.compare(str.size() - suffix.size(), suffix.size(), suffix) == 0;
@@ -200,13 +216,29 @@ int32_t main(int32_t argc, char** argv) {
     return 1;
   }

+  // Measure memory before loading
+  cudaDeviceSynchronize();
+  size_t mem_before_load = get_gpu_memory_used();
+  ET_LOG(
+      Info,
+      "GPU memory before loading: %.2f MB",
+      mem_before_load / (1024.0 * 1024.0));
+
   // Load runner
   auto load_error = runner->load();
   if (load_error != ::executorch::runtime::Error::Ok) {
     ET_LOG(Error, "Failed to load multimodal runner");
     return 1;
   }

+  // Measure memory after loading
+  cudaDeviceSynchronize();
+  size_t mem_after_load = get_gpu_memory_used();
+  ET_LOG(
+      Info,
+      "GPU memory after loading: %.2f MB",
+      mem_after_load / (1024.0 * 1024.0));
+
   // Prepare inputs
   std::vector<MultimodalInput> inputs = {
       make_text_input("<start_of_turn>user\n<start_of_image>"),
@@ -230,13 +262,64 @@
     runner->reset();
   }

+  // Measure memory before generation
+  cudaDeviceSynchronize();
+  size_t mem_before_gen = get_gpu_memory_used();
+
   auto error = runner->generate(inputs, config);

   if (error != ::executorch::runtime::Error::Ok) {
     ET_LOG(Error, "Failed to generate with multimodal runner\n");
     return 1;
   }
+
+  // Measure memory after generation
+  cudaDeviceSynchronize();
+  size_t mem_after_gen = get_gpu_memory_used();
+
   ET_LOG(Info, "Generated successfully");

+  // Calculate and print memory usage statistics
+  size_t load_memory = mem_after_load - mem_before_load;
+  size_t gen_memory =
+      mem_after_gen > mem_before_gen ? (mem_after_gen - mem_before_gen) : 0;
+  size_t total_memory = mem_after_gen - mem_before_load;
+  size_t peak_memory = mem_after_gen;
+
+  std::printf("\n=== CUDA Memory Usage Statistics ===\n");
+  std::printf(
+      "Memory before loading: %.2f MB (%zu bytes)\n",
+      mem_before_load / (1024.0 * 1024.0),
+      mem_before_load);
+  std::printf(
+      "Memory after loading: %.2f MB (%zu bytes)\n",
+      mem_after_load / (1024.0 * 1024.0),
+      mem_after_load);
+  std::printf(
+      "Memory consumed by loading: %.2f MB (%zu bytes)\n",
+      load_memory / (1024.0 * 1024.0),
+      load_memory);
+  std::printf(
+      "Memory before generation: %.2f MB (%zu bytes)\n",
+      mem_before_gen / (1024.0 * 1024.0),
+      mem_before_gen);
+  std::printf(
+      "Memory after generation: %.2f MB (%zu bytes)\n",
+      mem_after_gen / (1024.0 * 1024.0),
+      mem_after_gen);
+  std::printf(
+      "Memory consumed by generation: %.2f MB (%zu bytes)\n",
+      gen_memory / (1024.0 * 1024.0),
+      gen_memory);
+  std::printf(
+      "Total memory consumed: %.2f MB (%zu bytes)\n",
+      total_memory / (1024.0 * 1024.0),
+      total_memory);
+  std::printf(
+      "Peak GPU memory used: %.2f MB (%zu bytes)\n",
+      peak_memory / (1024.0 * 1024.0),
+      peak_memory);
+  std::printf("====================================\n\n");
+
   return 0;
 }
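The synchronize/snapshot/subtract sequence now repeats at several sites across the two files. If it were ever factored out, an RAII scope guard is one option; GpuMemDeltaScope below is a hypothetical helper sketched for illustration, not part of this commit:

// Hypothetical refactor: report the device-memory delta across a scope
// using the same cudaMemGetInfo approach as the commit.
#include <cstdio>
#include <cuda_runtime.h>

class GpuMemDeltaScope {
 public:
  explicit GpuMemDeltaScope(const char* label)
      : label_(label), before_(current_used()) {}

  ~GpuMemDeltaScope() {
    const size_t after = current_used();
    const size_t delta = after > before_ ? after - before_ : 0;
    std::printf("%s: %.2f MB\n", label_, delta / (1024.0 * 1024.0));
  }

 private:
  static size_t current_used() {
    cudaDeviceSynchronize(); // Let pending async work allocate/free first.
    size_t free_b = 0, total_b = 0;
    if (cudaMemGetInfo(&free_b, &total_b) != cudaSuccess) {
      return 0;
    }
    return total_b - free_b;
  }

  const char* label_;
  size_t before_;
};

// Usage: the destructor prints the delta when the scope ends.
//   {
//     GpuMemDeltaScope scope("generate");
//     runner->generate(inputs, config);
//   }

Either way, two caveats apply: cudaMemGetInfo is device-wide, so other processes inflate the numbers, and a before/after delta misses transient allocations freed mid-run, so the "peak" figures are really net increases, i.e. estimates of each phase's footprint.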
