Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
200 changes: 94 additions & 106 deletions sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -207,70 +207,101 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
model_params.use_mmap = false;
#endif

// Detect model size from filename to set appropriate GPU layers BEFORE loading
// This prevents OOM crashes on mobile devices with limited GPU memory
// Note: We use filename heuristics here because we can't know param count until after loading
std::string path_lower = model_path;
std::transform(path_lower.begin(), path_lower.end(), path_lower.begin(), ::tolower);

int gpu_layers = -1; // Default: all layers to GPU

// Check for large model indicators in filename using word boundary detection
// Patterns like "7b", "8b", "13b" should match at word boundaries to avoid
// false positives like "/backup7b/" or "/2017beta/"
auto is_model_size_marker = [&path_lower](const char* marker) {
size_t pos = path_lower.find(marker);
while (pos != std::string::npos) {
// Check for word boundary before (start of string, or non-alphanumeric)
bool valid_start = (pos == 0) || !std::isalnum(path_lower[pos - 1]);
// Check for word boundary after (end of string, or non-alphanumeric except digits for patterns like "7b-q4")
size_t end_pos = pos + strlen(marker);
bool valid_end = (end_pos >= path_lower.size()) ||
(!std::isalpha(path_lower[end_pos]) || path_lower[end_pos] == '-' || path_lower[end_pos] == '_');

if (valid_start && valid_end) {
return true;
}
pos = path_lower.find(marker, pos + 1);
}
return false;
};
// If llama_params_fit aborts, use the user-provided value
int user_gpu_layers = -1; // -1 = not set by user
if (config.contains("gpu_layers")) {
user_gpu_layers = config["gpu_layers"].get<int>();
LOGI("User-provided GPU layers: %d (will apply after fit)", user_gpu_layers);
}

// Detect large models (7B+) that may need GPU layer limiting on mobile
// First check for config-based override (for custom-named models)
bool is_large_model = false;
if (config.contains("expected_params_billions")) {
double expected_params = config["expected_params_billions"].get<double>();
is_large_model = (expected_params >= 7.0);
if (is_large_model) {
LOGI("Large model detected from config (%.1fB expected params)", expected_params);
}
// Set up context params early for llama_params_fit
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = 0;
ctx_params.n_threads = backend_->get_num_threads();
ctx_params.n_threads_batch = backend_->get_num_threads();
ctx_params.no_perf = true;

if (user_context_size > 0) {
ctx_params.n_ctx = user_context_size;
LOGI("User-provided context size: %d", user_context_size);
}

// Fall back to filename heuristics if no config provided
if (!is_large_model) {
is_large_model = is_model_size_marker("7b") ||
is_model_size_marker("8b") ||
is_model_size_marker("9b") ||
is_model_size_marker("13b") ||
is_model_size_marker("70b");
size_t n_devices = llama_max_devices();
size_t n_overrides = llama_max_tensor_buft_overrides();

std::vector<float> tensor_split(n_devices, 0.0f);
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides(n_overrides);
std::vector<size_t> margins(n_devices, 0);

size_t margin_mib = 1024; // Configurable parameter
if (config.contains("fit_margin_mib")) {
margin_mib = config["fit_margin_mib"].get<size_t>();
}
for (size_t i = 0; i < n_devices; ++i) {
margins[i] = margin_mib * 1024 * 1024;
}

uint32_t n_ctx_min = 2048; // Configurable parameter
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

n_ctx_min marked configurable but not wired to config

The comment says // Configurable parameter, but unlike fit_margin_mib (which has a corresponding config.contains("fit_margin_mib") lookup), n_ctx_min is always hardcoded to 2048. If the intent is to allow callers to tune the minimum context threshold, the config read is missing:

Suggested change
uint32_t n_ctx_min = 2048; // Configurable parameter
uint32_t n_ctx_min = 2048; // Configurable parameter
if (config.contains("fit_n_ctx_min")) {
n_ctx_min = config["fit_n_ctx_min"].get<uint32_t>();
}
Prompt To Fix With AI
This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 244

Comment:
**`n_ctx_min` marked configurable but not wired to config**

The comment says `// Configurable parameter`, but unlike `fit_margin_mib` (which has a corresponding `config.contains("fit_margin_mib")` lookup), `n_ctx_min` is always hardcoded to `2048`. If the intent is to allow callers to tune the minimum context threshold, the config read is missing:

```suggestion
    uint32_t n_ctx_min = 2048;  // Configurable parameter
    if (config.contains("fit_n_ctx_min")) {
        n_ctx_min = config["fit_n_ctx_min"].get<uint32_t>();
    }
```

How can I resolve this? If you propose a fix, please make it concise.


LOGI("Calling llama_params_fit (margin=%zuMiB, n_ctx_min=%u, n_devices=%zu)",
margin_mib, n_ctx_min, n_devices);

if (is_large_model) {
// For 7B+ models on mobile: limit GPU layers to prevent OOM
// Most 7B models have 32 layers, offload ~24 to GPU, rest to CPU
gpu_layers = 24;
LOGI("Large model detected, limiting GPU layers to %d to prevent OOM", gpu_layers);
llama_params_fit_status fit_status = llama_params_fit(
model_path.c_str(),
&model_params,
&ctx_params,
tensor_split.data(),
tensor_buft_overrides.data(),
margins.data(),
n_ctx_min,
GGML_LOG_LEVEL_INFO
);

switch (fit_status) {
case LLAMA_PARAMS_FIT_STATUS_SUCCESS:
LOGI("llama_params_fit SUCCESS: n_gpu_layers=%d, n_ctx=%u",
model_params.n_gpu_layers, ctx_params.n_ctx);
break;
case LLAMA_PARAMS_FIT_STATUS_FAILURE:
LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
"Proceeding with conservative CPU-only defaults.");
model_params.n_gpu_layers = 0;
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
ctx_params.n_ctx = 2048;
}
break;
case LLAMA_PARAMS_FIT_STATUS_ERROR:
LOGE("llama_params_fit ERROR for model: %s. "
"Falling back to conservative CPU-only defaults.", model_path.c_str());
model_params.n_gpu_layers = 0;
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
ctx_params.n_ctx = 2048;
}
break;
Comment on lines +265 to +280
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

User-provided context size silently overridden on fit failure

If the user explicitly sets context_size in their config (e.g., user_context_size = 3000), it is pre-populated into ctx_params.n_ctx at line 225. However, when llama_params_fit returns FAILURE or ERROR, the fallback guard ctx_params.n_ctx > 2048 will silently cap it to 2048 without any log message specific to the user's request being overridden. The general failure message doesn't make it obvious that the requested context was discarded.

Consider adding an explicit log when the user-supplied context is reduced:

case LLAMA_PARAMS_FIT_STATUS_FAILURE:
    LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
         "Proceeding with conservative CPU-only defaults.");
    model_params.n_gpu_layers = 0;
    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
        if (user_context_size > 2048) {
            LOGI("Ignoring user-requested context size %d due to fit failure; capping at 2048", user_context_size);
        }
        ctx_params.n_ctx = 2048;
    }
    break;
Prompt To Fix With AI
This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 265-280

Comment:
**User-provided context size silently overridden on fit failure**

If the user explicitly sets `context_size` in their config (e.g., `user_context_size = 3000`), it is pre-populated into `ctx_params.n_ctx` at line 225. However, when `llama_params_fit` returns `FAILURE` or `ERROR`, the fallback guard `ctx_params.n_ctx > 2048` will silently cap it to 2048 without any log message specific to the user's request being overridden. The general failure message doesn't make it obvious that the requested context was discarded.

Consider adding an explicit log when the user-supplied context is reduced:

```cpp
case LLAMA_PARAMS_FIT_STATUS_FAILURE:
    LOGI("llama_params_fit FAILURE: could not fit model to device memory. "
         "Proceeding with conservative CPU-only defaults.");
    model_params.n_gpu_layers = 0;
    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 2048) {
        if (user_context_size > 2048) {
            LOGI("Ignoring user-requested context size %d due to fit failure; capping at 2048", user_context_size);
        }
        ctx_params.n_ctx = 2048;
    }
    break;
```

How can I resolve this? If you propose a fix, please make it concise.

}

// Allow user override via config
if (config.contains("gpu_layers")) {
gpu_layers = config["gpu_layers"].get<int>();
LOGI("Using user-provided GPU layers: %d", gpu_layers);
// Apply user gpu_layers override after fit
if (user_gpu_layers >= 0) {
model_params.n_gpu_layers = user_gpu_layers;
LOGI("Applying user GPU layers override: %d", user_gpu_layers);
}

model_params.n_gpu_layers = gpu_layers;
LOGI("Loading model with n_gpu_layers=%d", gpu_layers);
// Currently llama_params_fit does not account for CPU-only memory.
// An upstream PR is in progress that, once merged, should address this;
// this block serves as a placeholder until then.
#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
"Applying conservative CPU defaults.");
}
model_params.n_gpu_layers = 0;
if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
ctx_params.n_ctx = 4096;
LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
}
#endif
Comment on lines +292 to +302
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

CPU-only block unconditionally overwrites user gpu_layers override

The user_gpu_layers override is applied at lines 283–287, but the CPU-only preprocessor block immediately after (lines 292–302) unconditionally sets model_params.n_gpu_layers = 0, silently discarding the user's explicit config. The comment at line 210 states "If llama_params_fits aborts, use the user provided value," but this intent is not honored in CPU-only builds.

Consider moving the user override application to after the CPU-only block, or at minimum logging a warning that the user override was ignored:

#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
        LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
             "Applying conservative CPU defaults.");
    }
    model_params.n_gpu_layers = 0;
    if (user_gpu_layers >= 0) {
        LOGI("CPU-only build: ignoring user-provided gpu_layers=%d (no GPU backend active)", user_gpu_layers);
    }
    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
        ctx_params.n_ctx = 4096;
        LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
    }
#else
    // Apply user gpu_layers override after fit (GPU builds only)
    if (user_gpu_layers >= 0) {
        model_params.n_gpu_layers = user_gpu_layers;
        LOGI("Applying user GPU layers override: %d", user_gpu_layers);
    }
#endif
Prompt To Fix With AI
This is a comment left during a code review.
Path: sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp
Line: 292-302

Comment:
**CPU-only block unconditionally overwrites user `gpu_layers` override**

The `user_gpu_layers` override is applied at lines 283–287, but the CPU-only preprocessor block immediately after (lines 292–302) unconditionally sets `model_params.n_gpu_layers = 0`, silently discarding the user's explicit config. The comment at line 210 states "If llama_params_fits aborts, use the user provided value," but this intent is not honored in CPU-only builds.

Consider moving the user override application to after the CPU-only block, or at minimum logging a warning that the user override was ignored:

```cpp
#if !defined(GGML_USE_METAL) && !defined(GGML_USE_CUDA) && !defined(GGML_USE_WEBGPU)
    if (fit_status == LLAMA_PARAMS_FIT_STATUS_SUCCESS) {
        LOGI("CPU-only build: llama_params_fit fitted to GPU memory but no GPU backend active. "
             "Applying conservative CPU defaults.");
    }
    model_params.n_gpu_layers = 0;
    if (user_gpu_layers >= 0) {
        LOGI("CPU-only build: ignoring user-provided gpu_layers=%d (no GPU backend active)", user_gpu_layers);
    }
    if (ctx_params.n_ctx == 0 || ctx_params.n_ctx > 4096) {
        ctx_params.n_ctx = 4096;
        LOGI("CPU-only: capping context to %u", ctx_params.n_ctx);
    }
#else
    // Apply user gpu_layers override after fit (GPU builds only)
    if (user_gpu_layers >= 0) {
        model_params.n_gpu_layers = user_gpu_layers;
        LOGI("Applying user GPU layers override: %d", user_gpu_layers);
    }
#endif
```

How can I resolve this? If you propose a fix, please make it concise.


LOGI("Loading model with n_gpu_layers=%d", model_params.n_gpu_layers);

model_ = llama_model_load_from_file(model_path.c_str(), model_params);

Expand All @@ -280,67 +311,25 @@ bool LlamaCppTextGeneration::load_model(const std::string& model_path,
}

int model_train_ctx = llama_model_n_ctx_train(model_);
LOGI("Model training context size: %d", model_train_ctx);

// Get model parameter count to determine appropriate context size
// Large models (7B+) need smaller context on mobile to fit in memory
uint64_t n_params = llama_model_n_params(model_);
double params_billions = static_cast<double>(n_params) / 1e9;
LOGI("Model parameters: %.2fB", params_billions);

// Post-load verification: warn if actual param count differs from filename heuristic
bool actual_is_large = (params_billions >= 7.0);
if (actual_is_large && !is_large_model) {
LOGI("WARNING: Model has %.1fB params but filename didn't indicate large model. "
"Consider using gpu_layers config for optimal performance.", params_billions);
} else if (!actual_is_large && is_large_model) {
LOGI("NOTE: Filename suggested large model but actual params are %.1fB. "
"GPU layer limiting may be conservative.", params_billions);
}

// Adaptive context size based on model size for mobile devices
int adaptive_max_context;
if (params_billions >= 7.0) {
// 7B+ models: use 2048 context to fit in ~6GB GPU memory
adaptive_max_context = 2048;
LOGI("Large model detected (%.1fB params), limiting context to %d for memory", params_billions, adaptive_max_context);
} else if (params_billions >= 3.0) {
// 3-7B models: use 4096 context
adaptive_max_context = 4096;
LOGI("Medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
} else if (params_billions >= 1.0) {
// 1-3B models: use 2048 context (higher values OOM on mobile, especially with LoRA)
adaptive_max_context = 2048;
LOGI("Small-medium model detected (%.1fB params), limiting context to %d", params_billions, adaptive_max_context);
} else {
// Tiny models (<1B): can use larger context
adaptive_max_context = max_default_context_;
}
LOGI("Model loaded: %.2fB params, training context=%d", params_billions, model_train_ctx);

if (user_context_size > 0) {
context_size_ = std::min(user_context_size, model_train_ctx);
LOGI("Using user-provided context size: %d (requested: %d, model max: %d)", context_size_,
user_context_size, model_train_ctx);
} else {
context_size_ = std::min({model_train_ctx, max_default_context_, adaptive_max_context});
LOGI("Auto-detected context size: %d (model: %d, cap: %d, adaptive: %d)", context_size_,
model_train_ctx, max_default_context_, adaptive_max_context);
if (ctx_params.n_ctx == 0) {
ctx_params.n_ctx = std::min(model_train_ctx, max_default_context_);
}
context_size_ = std::min({(int)ctx_params.n_ctx, model_train_ctx, max_default_context_});

LOGI("Final context size: %d (fitted=%u, train=%d, cap=%d)",
context_size_, ctx_params.n_ctx, model_train_ctx, max_default_context_);

// Cap batch sizes to avoid exceeding Metal's 4 GB single-buffer limit on iOS.
// n_ctx controls conversation history length; n_batch/n_ubatch control how many
// tokens are processed in one GPU pass. llama.cpp splits larger prompts automatically.
static constexpr int MAX_BATCH_SIZE = 2048;
static constexpr int MAX_UBATCH_SIZE = 512;
batch_size_ = std::min(context_size_, MAX_BATCH_SIZE);

llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = context_size_;
ctx_params.n_batch = batch_size_;
ctx_params.n_ubatch = std::min(batch_size_, MAX_UBATCH_SIZE);
ctx_params.n_threads = backend_->get_num_threads();
ctx_params.n_threads_batch = backend_->get_num_threads();
ctx_params.no_perf = true;

LOGI("Context params: n_ctx=%d, n_batch=%d, n_ubatch=%d",
ctx_params.n_ctx, ctx_params.n_batch, ctx_params.n_ubatch);
Expand Down Expand Up @@ -1137,7 +1126,6 @@ bool LlamaCppTextGeneration::recreate_context() {
context_ = nullptr;
}

// Create new context (adapters are now visible to it)
llama_context_params ctx_params = llama_context_default_params();
ctx_params.n_ctx = context_size_;
ctx_params.n_batch = batch_size_;
Expand Down Expand Up @@ -1309,4 +1297,4 @@ nlohmann::json LlamaCppTextGeneration::get_lora_info() const {
return adapters;
}

} // namespace runanywhere
} // namespace runanywhere
Loading