From 8508673b81fcece6eb6dabb5884942613d94d3e7 Mon Sep 17 00:00:00 2001 From: VyasGuru <71374747+VyasGuru@users.noreply.github.com> Date: Wed, 4 Mar 2026 19:35:00 +0530 Subject: [PATCH] Quick Model Loading fix Fixes model loading issue due to huge buffer allocation. --- .../backends/llamacpp/llamacpp_backend.cpp | 209 +++++++++--------- 1 file changed, 103 insertions(+), 106 deletions(-) diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp index f7578d58b..f76179710 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp @@ -207,70 +207,101 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path, model_params.use_mmap = false; #endif - // Detect model size from filename to set appropriate GPU layers BEFORE loading - // This prevents OOM crashes on mobile devices with limited GPU memory - // Note: We use filename heuristics here because we can't know param count until after loading - std::string path_lower = model_path; - std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower); - - int gpu_layers = -1; // Default: all layers to GPU - - // Check for large model indicators in filename using word boundary detection - // Patterns like "7b", "8b", "13b" should match at word boundaries to avoid - // false positives like "/backup7b/" or "/2017beta/" - auto is_model_size_marker = [&path_lower](const char* marker) { - size_t pos = path_lower.find(marker); - while (pos != std::string::npos) { - // Check for word boundary before (start of string, or non-alphanumeric) - bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]); - // Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4") - size_t end_pos = pos + strlen(marker); - bool valid_end = (end_pos >= path_lower.size()) || - (!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_'); - - if (valid_start && valid_end) { - return true; - } - pos = path_lower.find(marker, pos + 1); - } - return false; - }; + // If llama_params_fits aborts, use the user provided value + int user_gpu_layers = -1; // -1 = not set by user + if (config.contains("gpu_layers")) { + user_gpu_layers = config["gpu_layers"].get(); + LOGI("User-provided GPU layers: %d (will apply after fit)", user_gpu_layers); + } - // Detect large models (7B+) that may need GPU layer limiting on mobile - // First check for config-based override (for custom-named models) - bool is_large_model = false; - if (config.contains("expected_params_billions")) { - double expected_params = config["expected_params_billions"].get(); - is_large_model = (expected_params >= 7.0); - if (is_large_model) { - LOGI("Large model detected from config (%.1fB expected params)", expected_params); - } + // Set up context params early for llama_params_fit + llama_context_params ctx_params = llama_context_default_params(); + ctx_params.n_ctx = 0; + ctx_params.n_threads = backend_->get_num_threads(); + ctx_params.n_threads_batch = backend_->get_num_threads(); + ctx_params.no_perf = true; + + if (user_context_size > 0) { + ctx_params.n_ctx = user_context_size; + LOGI("User-provided context size: %d", user_context_size); } - // Fall back to filename heuristics if no config provided - if (!is_large_model) { - is_large_model = is_model_size_marker("7b") || - is_model_size_marker("8b") || - is_model_size_marker("9b") || - is_model_size_marker("13b") || - is_model_size_marker("70b"); + size_t n_devices = llama_max_devices(); + size_t n_overrides = llama_max_tensor_buft_overrides(); + + std::vector tensor_split(n_devices, 0.0f); + std::vector tensor_buft_overrides(n_overrides); + std::vector margins(n_devices, 0); + + size_t margin_mib = 1024; // Configurable parameter + if (config.contains("fit_margin_mib")) { + margin_mib = config["fit_margin_mib"].get(); + } + for (size_t i = 0; i < n_devices; ++i) { + margins[i] = margin_mib * 1024 * 1024; } - if (is_large_model) { - // For 7B+ models on mobile: limit GPU layers to prevent OOM - // Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU - gpu_layers = 24; - LOGI("Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers); + uint32_t n_ctx_min = 2048; // Configurable parameter + + LOGI("Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)", + margin_mib, n_ctx_min, n_devices); + + llama_params_fit_status fit_status = llama_params_fit( + model_path.c_str(), + &model_params, + &ctx_params, + tensor_split.data(), + tensor_buft_overrides.data(), + margins.data(), + n_ctx_min, + GGML_LOG_LEVEL_INFO + ); + + switch (fit_status) { + case LLAMA_PARAMS_FIT_STATUS_SUCCESS: + LOGI("llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u", + model_params.n_gpu_layers, ctx_params.n_ctx); + break; + case LLAMA_PARAMS_FIT_STATUS_FAILURE: + LOGI("llama_params_fit FAILURE: could not fit model to device memory. " + "Proceeding with conservative CPU-only defaults."); + model_params.n_gpu_layers = 0; + if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) { + ctx_params.n_ctx = 2048; + } + break; + case LLAMA_PARAMS_FIT_STATUS_ERROR: + LOGE("llama_params_fit ERROR for model: %s. " + "Falling back to conservative CPU-only defaults.", model_path.c_str()); + model_params.n_gpu_layers = 0; + if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) { + ctx_params.n_ctx = 2048; + } + break; } - // Allow user override via config - if (config.contains("gpu_layers")) { - gpu_layers = config["gpu_layers"].get(); - LOGI("Using user-provided GPU layers: %d", gpu_layers); + // Apply user gpu_layers override after fit + if (user_gpu_layers >= 0) { + model_params.n_gpu_layers = user_gpu_layers; + LOGI("Applying user GPU layers override: %d", user_gpu_layers); + } + + // Currently llama_params_fit does not detect cpu only memory + // They have an ongoing pr, which when merged, should solve our problem. + // this should work as a placeholder until then. +#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU) + if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) { + LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. " + "Applying conservative CPU defaults."); + } + model_params.n_gpu_layers = 0; + if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) { + ctx_params.n_ctx = 4096; + LOGI("CPU-only: capping context to %u", ctx_params.n_ctx); } +#endif - model_params.n_gpu_layers = gpu_layers; - LOGI("Loading model with n_gpu_layers=%d", gpu_layers); + LOGI("Loading model with n_gpu_layers=%d", model_params.n_gpu_layers); model_ = llama_model_load_from_file(model_path.c_str(), model_params); @@ -280,60 +311,24 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path, } int model_train_ctx = llama_model_n_ctx_train(model_); - LOGI("Model training context size: %d", model_train_ctx); - - // Get model parameter count to determine appropriate context size - // Large models (7B+) need smaller context on mobile to fit in memory uint64_t n_params = llama_model_n_params(model_); double params_billions = static_cast(n_params) / 1e9; - LOGI("Model parameters: %.2fB", params_billions); - - // Post-load verification: warn if actual param count differs from filename heuristic - bool actual_is_large = (params_billions >= 7.0); - if (actual_is_large && !is_large_model) { - LOGI("WARNING: Model has %.1fB params but filename didn't indicate large model. " - "Consider using gpu_layers config for optimal performance.", params_billions); - } else if (!actual_is_large && is_large_model) { - LOGI("NOTE: Filename suggested large model but actual params are %.1fB. " - "GPU layer limiting may be conservative.", params_billions); - } - - // Adaptive context size based on model size for mobile devices - int adaptive_max_context; - if (params_billions >= 7.0) { - // 7B+ models: use 2048 context to fit in ~6GB GPU memory - adaptive_max_context = 2048; - LOGI("Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context); - } else if (params_billions >= 3.0) { - // 3-7B models: use 4096 context - adaptive_max_context = 4096; - LOGI("Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context); - } else if (params_billions >= 1.0) { - // 1-3B models: use 2048 context (higher values OOM on mobile, especially with LoRA) - adaptive_max_context = 2048; - LOGI("Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context); - } else { - // Tiny models (<1B): can use larger context - adaptive_max_context = max_default_context_; - } + LOGI("Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx); - if (user_context_size > 0) { - context_size_ = std::min(user_context_size, model_train_ctx); - LOGI("Using user-provided context size: %d (requested: %d, model max: %d)", context_size_, - user_context_size, model_train_ctx); - } else { - context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context}); - LOGI("Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_, - model_train_ctx, max_default_context_, adaptive_max_context); + if (ctx_params.n_ctx == 0) { + ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_); } + context_size_ = std::min({(int)ctx_params.n_ctx, model_train_ctx, max_default_context_}); + + LOGI("Final context size: %d (fitted=%u, train=%d, cap=%d)", + context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_); + + int max_safe_batch = 2048; // Configurable parameter + int safe_batch_size = std::min(context_size_, max_safe_batch); - llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = context_size_; - ctx_params.n_batch = context_size_; // Allow processing full prompt at once - ctx_params.n_ubatch = context_size_; // Physical batch size must also match - ctx_params.n_threads = backend_->get_num_threads(); - ctx_params.n_threads_batch = backend_->get_num_threads(); - ctx_params.no_perf = true; + ctx_params.n_batch = safe_batch_size; + ctx_params.n_ubatch = safe_batch_size; context_ = llama_init_from_model(model_, ctx_params); @@ -802,11 +797,13 @@ bool LlamaCppTextGeneration::recreate_context() { context_ = nullptr; } - // Create new context (adapters are now visible to it) + int max_safe_batch = 2048; // Configurable parameter + int safe_batch_size = std::min(context_size_, max_safe_batch); + llama_context_params ctx_params = llama_context_default_params(); ctx_params.n_ctx = context_size_; - ctx_params.n_batch = context_size_; - ctx_params.n_ubatch = context_size_; + ctx_params.n_batch = safe_batch_size; + ctx_params.n_ubatch = safe_batch_size; ctx_params.n_threads = backend_->get_num_threads(); ctx_params.n_threads_batch = backend_->get_num_threads(); ctx_params.no_perf = true;