-
Notifications
You must be signed in to change notification settings - Fork 352
Quick Model Loading fix #444
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -207,70 +207,101 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path, | |
| model_params.use_mmap = false; | ||
| #endif | ||
|
|
||
| // Detect model size from filename to set appropriate GPU layers BEFORE loading | ||
| // This prevents OOM crashes on mobile devices with limited GPU memory | ||
| // Note: We use filename heuristics here because we can't know param count until after loading | ||
| std::string path_lower = model_path; | ||
| std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower); | ||
|
|
||
| int gpu_layers = -1; // Default: all layers to GPU | ||
|
|
||
| // Check for large model indicators in filename using word boundary detection | ||
| // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid | ||
| // false positives like "/backup7b/" or "/2017beta/" | ||
| auto is_model_size_marker = [&path_lower](const char* marker) { | ||
| size_t pos = path_lower.find(marker); | ||
| while (pos != std::string::npos) { | ||
| // Check for word boundary before (start of string, or non-alphanumeric) | ||
| bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]); | ||
| // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4") | ||
| size_t end_pos = pos + strlen(marker); | ||
| bool valid_end = (end_pos >= path_lower.size()) || | ||
| (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_'); | ||
|
|
||
| if (valid_start && valid_end) { | ||
| return true; | ||
| } | ||
| pos = path_lower.find(marker, pos + 1); | ||
| } | ||
| return false; | ||
| }; | ||
| // If llama_params_fits aborts, use the user provided value | ||
| int user_gpu_layers = -1; // -1 = not set by user | ||
| if (config.contains("gpu_layers")) { | ||
| user_gpu_layers = config["gpu_layers"].get<int>(); | ||
| LOGI("User-provided GPU layers: %d (will apply after fit)", user_gpu_layers); | ||
| } | ||
|
|
||
| // Detect large models (7B+) that may need GPU layer limiting on mobile | ||
| // First check for config-based override (for custom-named models) | ||
| bool is_large_model = false; | ||
| if (config.contains("expected_params_billions")) { | ||
| double expected_params = config["expected_params_billions"].get<double>(); | ||
| is_large_model = (expected_params >= 7.0); | ||
| if (is_large_model) { | ||
| LOGI("Large model detected from config (%.1fB expected params)", expected_params); | ||
| } | ||
| // Set up context params early for llama_params_fit | ||
| llama_context_params ctx_params = llama_context_default_params(); | ||
| ctx_params.n_ctx = 0; | ||
| ctx_params.n_threads = backend_->get_num_threads(); | ||
| ctx_params.n_threads_batch = backend_->get_num_threads(); | ||
| ctx_params.no_perf = true; | ||
|
|
||
| if (user_context_size > 0) { | ||
| ctx_params.n_ctx = user_context_size; | ||
| LOGI("User-provided context size: %d", user_context_size); | ||
| } | ||
|
|
||
| // Fall back to filename heuristics if no config provided | ||
| if (!is_large_model) { | ||
| is_large_model = is_model_size_marker("7b") || | ||
| is_model_size_marker("8b") || | ||
| is_model_size_marker("9b") || | ||
| is_model_size_marker("13b") || | ||
| is_model_size_marker("70b"); | ||
| size_t n_devices = llama_max_devices(); | ||
| size_t n_overrides = llama_max_tensor_buft_overrides(); | ||
|
|
||
| std::vector<float> tensor_split(n_devices, 0.0f); | ||
| std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides); | ||
| std::vector<size_t> margins(n_devices, 0); | ||
|
|
||
| size_t margin_mib = 1024; // Configurable parameter | ||
| if (config.contains("fit_margin_mib")) { | ||
| margin_mib = config["fit_margin_mib"].get<size_t>(); | ||
| } | ||
| for (size_t i = 0; i < n_devices; ++i) { | ||
| margins[i] = margin_mib * 1024 * 1024; | ||
| } | ||
|
|
||
| if (is_large_model) { | ||
| // For 7B+ models on mobile: limit GPU layers to prevent OOM | ||
| // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU | ||
| gpu_layers = 24; | ||
| LOGI("Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers); | ||
| uint32_t n_ctx_min = 2048; // Configurable parameter | ||
|
|
||
| LOGI("Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)", | ||
| margin_mib, n_ctx_min, n_devices); | ||
|
|
||
| llama_params_fit_status fit_status = llama_params_fit( | ||
| model_path.c_str(), | ||
| &model_params, | ||
| &ctx_params, | ||
| tensor_split.data(), | ||
| tensor_buft_overrides.data(), | ||
| margins.data(), | ||
| n_ctx_min, | ||
| GGML_LOG_LEVEL_INFO | ||
| ); | ||
|
|
||
| switch (fit_status) { | ||
| case LLAMA_PARAMS_FIT_STATUS_SUCCESS: | ||
| LOGI("llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u", | ||
| model_params.n_gpu_layers, ctx_params.n_ctx); | ||
| break; | ||
| case LLAMA_PARAMS_FIT_STATUS_FAILURE: | ||
| LOGI("llama_params_fit FAILURE: could not fit model to device memory. " | ||
| "Proceeding with conservative CPU-only defaults."); | ||
| model_params.n_gpu_layers = 0; | ||
| if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) { | ||
| ctx_params.n_ctx = 2048; | ||
| } | ||
| break; | ||
| case LLAMA_PARAMS_FIT_STATUS_ERROR: | ||
| LOGE("llama_params_fit ERROR for model: %s. " | ||
| "Falling back to conservative CPU-only defaults.", model_path.c_str()); | ||
| model_params.n_gpu_layers = 0; | ||
| if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) { | ||
| ctx_params.n_ctx = 2048; | ||
| } | ||
| break; | ||
|
Comment on lines
+265
to
+280
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

**User-provided context size silently overridden on fit failure**

If the user explicitly sets `context_size` in their config (e.g., `user_context_size = 3000`), it is pre-populated into `ctx_params.n_ctx`. However, when `llama_params_fit` returns `FAILURE` or `ERROR`, the fallback guard `ctx_params.n_ctx > 2048` will silently cap it to 2048 without any log message specific to the user's request being overridden. The general failure message doesn't make it obvious that the requested context was discarded.

Consider adding an explicit log when the user-supplied context is reduced:

case LLAMA_PARAMS_FIT_STATUS_FAILURE:
LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
"Proceeding with conservative CPU-only defaults.");
model_params.n_gpu_layers = 0;
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
if (user_context_size > 2048) {
LOGI("Ignoring user-requested context size %d due to fit failure; capping at 2048", user_context_size);
}
ctx_params.n_ctx = 2048;
}
break;

Prompt To Fix With AI

This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 265-280
Comment:
**User-provided context size silently overridden on fit failure**
If the user explicitly sets `context_size` in their config (e.g., `user_context_size = 3000`), it is pre-populated into `ctx_params.n_ctx` at line 225. However, when `llama_params_fit` returns `FAILURE` or `ERROR`, the fallback guard `ctx_params.n_ctx > 2048` will silently cap it to 2048 without any log message specific to the user's request being overridden. The general failure message doesn't make it obvious that the requested context was discarded.
Consider adding an explicit log when the user-supplied context is reduced:
```cpp
case LLAMA_PARAMS_FIT_STATUS_FAILURE:
LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
"Proceeding with conservative CPU-only defaults.");
model_params.n_gpu_layers = 0;
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
if (user_context_size > 2048) {
LOGI("Ignoring user-requested context size %d due to fit failure; capping at 2048", user_context_size);
}
ctx_params.n_ctx = 2048;
}
break;
```
How can I resolve this? If you propose a fix, please make it concise. |
||
| } | ||
|
|
||
| // Allow user override via config | ||
| if (config.contains("gpu_layers")) { | ||
| gpu_layers = config["gpu_layers"].get<int>(); | ||
| LOGI("Using user-provided GPU layers: %d", gpu_layers); | ||
| // Apply user gpu_layers override after fit | ||
| if (user_gpu_layers >= 0) { | ||
| model_params.n_gpu_layers = user_gpu_layers; | ||
| LOGI("Applying user GPU layers override: %d", user_gpu_layers); | ||
| } | ||
|
|
||
| // Currently llama_params_fit does not detect cpu only memory | ||
| // They have an ongoing pr, which when merged, should solve our problem. | ||
| // this should work as a placeholder until then. | ||
| #if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU) | ||
| if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) { | ||
| LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. " | ||
| "Applying conservative CPU defaults."); | ||
| } | ||
| model_params.n_gpu_layers = 0; | ||
| if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) { | ||
| ctx_params.n_ctx = 4096; | ||
| LOGI("CPU-only: capping context to %u", ctx_params.n_ctx); | ||
| } | ||
| #endif | ||
|
Comment on lines
+292
to
+302
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.

**CPU-only block unconditionally overwrites user `gpu_layers` override**

The `user_gpu_layers` override is applied just before the CPU-only preprocessor block, but that block unconditionally sets `model_params.n_gpu_layers = 0`, silently discarding the user's explicit config. The earlier comment states "If llama_params_fits aborts, use the user provided value," but this intent is not honored in CPU-only builds.

Consider moving the user override application to after the CPU-only block, or at minimum logging a warning that the user override was ignored:

#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
"Applying conservative CPU defaults.");
}
model_params.n_gpu_layers = 0;
if (user_gpu_layers >= 0) {
LOGI("CPU-only build: ignoring user-provided gpu_layers=%d (no GPU backend active)", user_gpu_layers);
}
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
ctx_params.n_ctx = 4096;
LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
}
#else
// Apply user gpu_layers override after fit (GPU builds only)
if (user_gpu_layers >= 0) {
model_params.n_gpu_layers = user_gpu_layers;
LOGI("Applying user GPU layers override: %d", user_gpu_layers);
}
#endif

Prompt To Fix With AI

This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 292-302
Comment:
**CPU-only block unconditionally overwrites user `gpu_layers` override**
The `user_gpu_layers` override is applied at lines 283–287, but the CPU-only preprocessor block immediately after (lines 292–302) unconditionally sets `model_params.n_gpu_layers = 0`, silently discarding the user's explicit config. The comment at line 210 states "If llama_params_fits aborts, use the user provided value," but this intent is not honored in CPU-only builds.
Consider moving the user override application to after the CPU-only block, or at minimum logging a warning that the user override was ignored:
```cpp
#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
"Applying conservative CPU defaults.");
}
model_params.n_gpu_layers = 0;
if (user_gpu_layers >= 0) {
LOGI("CPU-only build: ignoring user-provided gpu_layers=%d (no GPU backend active)", user_gpu_layers);
}
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
ctx_params.n_ctx = 4096;
LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
}
#else
// Apply user gpu_layers override after fit (GPU builds only)
if (user_gpu_layers >= 0) {
model_params.n_gpu_layers = user_gpu_layers;
LOGI("Applying user GPU layers override: %d", user_gpu_layers);
}
#endif
```
How can I resolve this? If you propose a fix, please make it concise. |
||
|
|
||
| model_params.n_gpu_layers = gpu_layers; | ||
| LOGI("Loading model with n_gpu_layers=%d", gpu_layers); | ||
| LOGI("Loading model with n_gpu_layers=%d", model_params.n_gpu_layers); | ||
|
|
||
| model_ = llama_model_load_from_file(model_path.c_str(), model_params); | ||
|
|
||
|
|
@@ -280,60 +311,24 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path, | |
| } | ||
|
|
||
| int model_train_ctx = llama_model_n_ctx_train(model_); | ||
| LOGI("Model training context size: %d", model_train_ctx); | ||
|
|
||
| // Get model parameter count to determine appropriate context size | ||
| // Large models (7B+) need smaller context on mobile to fit in memory | ||
| uint64_t n_params = llama_model_n_params(model_); | ||
| double params_billions = static_cast<double>(n_params) / 1e9; | ||
| LOGI("Model parameters: %.2fB", params_billions); | ||
|
|
||
| // Post-load verification: warn if actual param count differs from filename heuristic | ||
| bool actual_is_large = (params_billions >= 7.0); | ||
| if (actual_is_large && !is_large_model) { | ||
| LOGI("WARNING: Model has %.1fB params but filename didn't indicate large model. " | ||
| "Consider using gpu_layers config for optimal performance.", params_billions); | ||
| } else if (!actual_is_large && is_large_model) { | ||
| LOGI("NOTE: Filename suggested large model but actual params are %.1fB. " | ||
| "GPU layer limiting may be conservative.", params_billions); | ||
| } | ||
|
|
||
| // Adaptive context size based on model size for mobile devices | ||
| int adaptive_max_context; | ||
| if (params_billions >= 7.0) { | ||
| // 7B+ models: use 2048 context to fit in ~6GB GPU memory | ||
| adaptive_max_context = 2048; | ||
| LOGI("Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context); | ||
| } else if (params_billions >= 3.0) { | ||
| // 3-7B models: use 4096 context | ||
| adaptive_max_context = 4096; | ||
| LOGI("Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context); | ||
| } else if (params_billions >= 1.0) { | ||
| // 1-3B models: use 2048 context (higher values OOM on mobile, especially with LoRA) | ||
| adaptive_max_context = 2048; | ||
| LOGI("Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context); | ||
| } else { | ||
| // Tiny models (<1B): can use larger context | ||
| adaptive_max_context = max_default_context_; | ||
| } | ||
| LOGI("Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx); | ||
|
|
||
| if (user_context_size > 0) { | ||
| context_size_ = std::min(user_context_size, model_train_ctx); | ||
| LOGI("Using user-provided context size: %d (requested: %d, model max: %d)", context_size_, | ||
| user_context_size, model_train_ctx); | ||
| } else { | ||
| context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context}); | ||
| LOGI("Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_, | ||
| model_train_ctx, max_default_context_, adaptive_max_context); | ||
| if (ctx_params.n_ctx == 0) { | ||
| ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_); | ||
| } | ||
| context_size_ = std::min({(int)ctx_params.n_ctx, model_train_ctx, max_default_context_}); | ||
|
|
||
| LOGI("Final context size: %d (fitted=%u, train=%d, cap=%d)", | ||
| context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_); | ||
|
|
||
| int max_safe_batch = 2048; // Configurable parameter | ||
| int safe_batch_size = std::min(context_size_, max_safe_batch); | ||
|
|
||
| llama_context_params ctx_params = llama_context_default_params(); | ||
| ctx_params.n_ctx = context_size_; | ||
| ctx_params.n_batch = context_size_; // Allow processing full prompt at once | ||
| ctx_params.n_ubatch = context_size_; // Physical batch size must also match | ||
| ctx_params.n_threads = backend_->get_num_threads(); | ||
| ctx_params.n_threads_batch = backend_->get_num_threads(); | ||
| ctx_params.no_perf = true; | ||
| ctx_params.n_batch = safe_batch_size; | ||
| ctx_params.n_ubatch = safe_batch_size; | ||
|
|
||
| context_ = llama_init_from_model(model_, ctx_params); | ||
|
|
||
|
|
@@ -802,11 +797,13 @@ bool LlamaCppTextGeneration::recreate_context() { | |
| context_ = nullptr; | ||
| } | ||
|
|
||
| // Create new context (adapters are now visible to it) | ||
| int max_safe_batch = 2048; // Configurable parameter | ||
| int safe_batch_size = std::min(context_size_, max_safe_batch); | ||
|
|
||
| llama_context_params ctx_params = llama_context_default_params(); | ||
| ctx_params.n_ctx = context_size_; | ||
| ctx_params.n_batch = context_size_; | ||
| ctx_params.n_ubatch = context_size_; | ||
| ctx_params.n_batch = safe_batch_size; | ||
| ctx_params.n_ubatch = safe_batch_size; | ||
| ctx_params.n_threads = backend_->get_num_threads(); | ||
| ctx_params.n_threads_batch = backend_->get_num_threads(); | ||
| ctx_params.no_perf = true; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
**`n_ctx_min` marked configurable but not wired to config**

The comment says `// Configurable parameter`, but unlike `fit_margin_mib` (which has a corresponding `config.contains("fit_margin_mib")` lookup), `n_ctx_min` is always hardcoded to `2048`. If the intent is to allow callers to tune the minimum context threshold, the config read is missing.

Prompt To Fix With AI