From 8508673b81fcece6eb6dabb5884942613d94d3e7 Mon Sep 17 00:00:00 2001
From: VyasGuru <71374747+VyasGuru@users.noreply.github.com>
Date: Wed, 4 Mar 2026 19:35:00 +0530
Subject: [PATCH] Quick Model Loading fix

Fixes model loading issue due to huge buffer allocation.
---
 .../backends/llamacpp/llamacpp_backend.cpp    | 209 +++++++++---------
 1 file changed, 103 insertions(+), 106 deletions(-)
diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
index f7578d58b..f76179710 100644
--- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
+++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
@@ -207,70 +207,101 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     model_params.use_mmap = false;
 #endif
 
-    // Detect model size from filename to set appropriate GPU layers BEFORE loading
-    // This prevents OOM crashes on mobile devices with limited GPU memory
-    // Note: We use filename heuristics here because we can't know param count until after loading
-    std::string path_lower = model_path;
-    std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower);
-
-    int gpu_layers = -1;  // Default: all layers to GPU
-
-    // Check for large model indicators in filename using word boundary detection
-    // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid
-    // false positives like "/backup7b/" or "/2017beta/"
-    auto is_model_size_marker = [&path_lower](const char* marker) {
-        size_t pos = path_lower.find(marker);
-        while (pos != std::string::npos) {
-            // Check for word boundary before (start of string, or non-alphanumeric)
-            bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]);
-            // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4")
-            size_t end_pos = pos + strlen(marker);
-            bool valid_end = (end_pos >= path_lower.size()) ||
-                            (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_');
-
-            if (valid_start && valid_end) {
-                return true;
-            }
-            pos = path_lower.find(marker, pos + 1);
-        }
-        return false;
-    };
+    // If llama_params_fits aborts, use the user provided value
+    int user_gpu_layers = -1;  // -1 = not set by user
+    if (config.contains("gpu_layers")) {
+        user_gpu_layers = config["gpu_layers"].get<int>();
+        LOGI("User-provided GPU layers: %d (will apply after fit)", user_gpu_layers);
+    }
 
-    // Detect large models (7B+) that may need GPU layer limiting on mobile
-    // First check for config-based override (for custom-named models)
-    bool is_large_model = false;
-    if (config.contains("expected_params_billions")) {
-        double expected_params = config["expected_params_billions"].get<double>();
-        is_large_model = (expected_params >= 7.0);
-        if (is_large_model) {
-            LOGI("Large model detected from config (%.1fB expected params)", expected_params);
-        }
+    // Set up context params early for llama_params_fit
+    llama_context_params ctx_params = llama_context_default_params();
+    ctx_params.n_ctx = 0;
+    ctx_params.n_threads = backend_->get_num_threads();
+    ctx_params.n_threads_batch = backend_->get_num_threads();
+    ctx_params.no_perf = true;
+
+    if (user_context_size > 0) {
+        ctx_params.n_ctx = user_context_size;
+        LOGI("User-provided context size: %d", user_context_size);
     }
 
-    // Fall back to filename heuristics if no config provided
-    if (!is_large_model) {
-        is_large_model = is_model_size_marker("7b") ||
-                         is_model_size_marker("8b") ||
-                         is_model_size_marker("9b") ||
-                         is_model_size_marker("13b") ||
-                         is_model_size_marker("70b");
+    size_t n_devices = llama_max_devices();
+    size_t n_overrides = llama_max_tensor_buft_overrides();
+
+    std::vector<float> tensor_split(n_devices, 0.0f);
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides);
+    std::vector<size_t> margins(n_devices, 0);
+
+    size_t margin_mib = 1024; // Configurable parameter
+    if (config.contains("fit_margin_mib")) {
+        margin_mib = config["fit_margin_mib"].get<size_t>();
+    }
+    for (size_t i = 0; i < n_devices; ++i) {
+        margins[i] = margin_mib * 1024 * 1024;
     }
 
-    if (is_large_model) {
-        // For 7B+ models on mobile: limit GPU layers to prevent OOM
-        // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU
-        gpu_layers = 24;
-        LOGI("Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers);
+    uint32_t n_ctx_min = 2048;  // Configurable parameter 
+
+    LOGI("Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
+         margin_mib, n_ctx_min, n_devices);
+
+    llama_params_fit_status fit_status = llama_params_fit(
+        model_path.c_str(),
+        &model_params,
+        &ctx_params,
+        tensor_split.data(),
+        tensor_buft_overrides.data(),
+        margins.data(),
+        n_ctx_min,
+        GGML_LOG_LEVEL_INFO
+    );
+
+    switch (fit_status) {
+        case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
+            LOGI("llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u",
+                 model_params.n_gpu_layers, ctx_params.n_ctx);
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_FAILURE:
+            LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
+                 "Proceeding with conservative CPU-only defaults.");
+            model_params.n_gpu_layers = 0;
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
+        case LLAMA_PARAMS_FIT_STATUS_ERROR:
+            LOGE("llama_params_fit ERROR for model: %s. "
+                 "Falling back to conservative CPU-only defaults.", model_path.c_str());
+            model_params.n_gpu_layers = 0;
+            if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
+                ctx_params.n_ctx = 2048;
+            }
+            break;
     }
 
-    // Allow user override via config
-    if (config.contains("gpu_layers")) {
-        gpu_layers = config["gpu_layers"].get<int>();
-        LOGI("Using user-provided GPU layers: %d", gpu_layers);
+    // Apply user gpu_layers override after fit 
+    if (user_gpu_layers >= 0) {
+        model_params.n_gpu_layers = user_gpu_layers;
+        LOGI("Applying user GPU layers override: %d", user_gpu_layers);
+    }
+
+    // Currently llama_params_fit does not detect cpu only memory
+    // They have an ongoing pr, which when merged, should solve our problem.
+    // this should work as a placeholder until then.
+#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
+    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
+        LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
+             "Applying conservative CPU defaults.");
+    }
+    model_params.n_gpu_layers = 0;
+    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
+        ctx_params.n_ctx = 4096;
+        LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
     }
+#endif
 
-    model_params.n_gpu_layers = gpu_layers;
-    LOGI("Loading model with n_gpu_layers=%d", gpu_layers);
+    LOGI("Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);
 
     model_ = llama_model_load_from_file(model_path.c_str(), model_params);
 
@@ -280,60 +311,24 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
     }
 
     int model_train_ctx = llama_model_n_ctx_train(model_);
-    LOGI("Model training context size: %d", model_train_ctx);
-
-    // Get model parameter count to determine appropriate context size
-    // Large models (7B+) need smaller context on mobile to fit in memory
     uint64_t n_params = llama_model_n_params(model_);
     double params_billions = static_cast<double>(n_params) / 1e9;
-    LOGI("Model parameters: %.2fB", params_billions);
-
-    // Post-load verification: warn if actual param count differs from filename heuristic
-    bool actual_is_large = (params_billions >= 7.0);
-    if (actual_is_large && !is_large_model) {
-        LOGI("WARNING: Model has %.1fB params but filename didn't indicate large model. "
-             "Consider using gpu_layers config for optimal performance.", params_billions);
-    } else if (!actual_is_large && is_large_model) {
-        LOGI("NOTE: Filename suggested large model but actual params are %.1fB. "
-             "GPU layer limiting may be conservative.", params_billions);
-    }
-
-    // Adaptive context size based on model size for mobile devices
-    int adaptive_max_context;
-    if (params_billions >= 7.0) {
-        // 7B+ models: use 2048 context to fit in ~6GB GPU memory
-        adaptive_max_context = 2048;
-        LOGI("Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context);
-    } else if (params_billions >= 3.0) {
-        // 3-7B models: use 4096 context
-        adaptive_max_context = 4096;
-        LOGI("Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else if (params_billions >= 1.0) {
-        // 1-3B models: use 2048 context (higher values OOM on mobile, especially with LoRA)
-        adaptive_max_context = 2048;
-        LOGI("Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
-    } else {
-        // Tiny models (<1B): can use larger context
-        adaptive_max_context = max_default_context_;
-    }
+    LOGI("Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx);
 
-    if (user_context_size > 0) {
-        context_size_ = std::min(user_context_size, model_train_ctx);
-        LOGI("Using user-provided context size: %d (requested: %d, model max: %d)", context_size_,
-             user_context_size, model_train_ctx);
-    } else {
-        context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context});
-        LOGI("Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_,
-             model_train_ctx, max_default_context_, adaptive_max_context);
+    if (ctx_params.n_ctx == 0) {
+        ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_);
     }
+    context_size_ = std::min({(int)ctx_params.n_ctx, model_train_ctx, max_default_context_});
+
+    LOGI("Final context size: %d (fitted=%u, train=%d, cap=%d)",
+         context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_);
+
+    int max_safe_batch = 2048; // Configurable parameter
+    int safe_batch_size = std::min(context_size_, max_safe_batch);
 
-    llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
-    ctx_params.n_batch = context_size_;   // Allow processing full prompt at once
-    ctx_params.n_ubatch = context_size_;  // Physical batch size must also match
-    ctx_params.n_threads = backend_->get_num_threads();
-    ctx_params.n_threads_batch = backend_->get_num_threads();
-    ctx_params.no_perf = true;
+    ctx_params.n_batch = safe_batch_size;
+    ctx_params.n_ubatch = safe_batch_size;
 
     context_ = llama_init_from_model(model_, ctx_params);
 
@@ -802,11 +797,13 @@ bool LlamaCppTextGeneration::recreate_context() {
         context_ = nullptr;
     }
 
-    // Create new context (adapters are now visible to it)
+    int max_safe_batch = 2048; // Configurable parameter
+    int safe_batch_size = std::min(context_size_, max_safe_batch);
+
     llama_context_params ctx_params = llama_context_default_params();
     ctx_params.n_ctx = context_size_;
-    ctx_params.n_batch = context_size_;
-    ctx_params.n_ubatch = context_size_;
+    ctx_params.n_batch = safe_batch_size;
+    ctx_params.n_ubatch = safe_batch_size;
     ctx_params.n_threads = backend_->get_num_threads();
     ctx_params.n_threads_batch = backend_->get_num_threads();
     ctx_params.no_perf = true;