diff --git a/sdk/runanywhere-commons/CMakeLists.txt b/sdk/runanywhere-commons/CMakeLists.txt index b958a86dd..1311257f4 100644 --- a/sdk/runanywhere-commons/CMakeLists.txt +++ b/sdk/runanywhere-commons/CMakeLists.txt @@ -129,6 +129,7 @@ set(RAC_CORE_SOURCES src/core/rac_core.cpp src/core/rac_error.cpp src/core/rac_time.cpp + src/core/rac_benchmark.cpp src/core/rac_memory.cpp src/core/rac_logger.cpp src/core/rac_audio_utils.cpp diff --git a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h index 41ed7cacf..bce490311 100644 --- a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h +++ b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h @@ -11,6 +11,7 @@ #ifndef RAC_LLM_LLAMACPP_H #define RAC_LLM_LLAMACPP_H +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_error.h" #include "rac/core/rac_types.h" #include "rac/features/llm/rac_llm.h" @@ -163,6 +164,32 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream( rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, rac_llm_llamacpp_stream_callback_fn callback, void* user_data); +/** + * Generates text with streaming callback and benchmark timing. + * + * Same as rac_llm_llamacpp_generate_stream but captures benchmark timing: + * - t2: Before prefill (llama_decode for prompt batch) + * - t3: After prefill completes + * - t5: When decode loop exits (last token) + * + * @param handle Service handle + * @param prompt Input prompt text + * @param options Generation options + * @param callback Callback for each token + * @param user_data User context passed to callback + * @param timing_out Output: Benchmark timing struct, caller-allocated. + * Must remain valid for the duration of the call. + * Caller should initialize via rac_benchmark_timing_init() before passing. + * On success, all t2/t3/t5 fields are populated. + * On failure, status is set but timing fields may be partial. 
+ * Pass NULL to skip timing (zero overhead). + * @return RAC_SUCCESS or error code + */ +RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + rac_llm_llamacpp_stream_callback_fn callback, void* user_data, + rac_benchmark_timing_t* timing_out); + /** * Cancels ongoing generation. * diff --git a/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h b/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h new file mode 100644 index 000000000..d0ac8ec39 --- /dev/null +++ b/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h @@ -0,0 +1,129 @@ +/** + * @file rac_benchmark.h + * @brief RunAnywhere Commons - Benchmark Timing Support + * + * This header provides types and functions for benchmark timing instrumentation. + * The timing struct captures key timestamps during LLM inference for performance + * measurement and analysis. + * + * Design principles: + * - Zero overhead when not benchmarking: timing is opt-in via pointer parameter + * - Monotonic clock: uses steady_clock for accurate cross-platform timing + * - All timestamps are relative to a process-local epoch (not wall-clock) + */ + +#ifndef RAC_BENCHMARK_H +#define RAC_BENCHMARK_H + +#include "rac/core/rac_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ============================================================================= +// BENCHMARK TIMING STRUCT +// ============================================================================= + +/** + * Benchmark timing structure for LLM inference. 
+ * + * Captures timestamps at key points during inference: + * - t0: Request start (component API entry) + * - t2: Prefill start (backend, before llama_decode for prompt) + * - t3: Prefill end (backend, after llama_decode returns) + * - t4: First token (component, first token callback) + * - t5: Last token (backend, decode loop exits) + * - t6: Request end (component, before complete callback) + * + * All timestamps are in milliseconds from a process-local epoch. + * Use rac_monotonic_now_ms() to get comparable timestamps. + * + * Note: t1 is intentionally skipped to match the specification. + */ +typedef struct rac_benchmark_timing { + /** t0: Request start - recorded at component API entry */ + int64_t t0_request_start_ms; + + /** t2: Prefill start - recorded before llama_decode for prompt batch */ + int64_t t2_prefill_start_ms; + + /** t3: Prefill end - recorded after llama_decode returns for prompt */ + int64_t t3_prefill_end_ms; + + /** t4: First token - recorded when first token callback is invoked */ + int64_t t4_first_token_ms; + + /** t5: Last token - recorded when decode loop exits */ + int64_t t5_last_token_ms; + + /** t6: Request end - recorded before complete callback */ + int64_t t6_request_end_ms; + + /** Number of tokens in the prompt */ + int32_t prompt_tokens; + + /** Number of tokens generated */ + int32_t output_tokens; + + /** + * Status of the benchmark request. 
+ * Uses RAC_BENCHMARK_STATUS_* codes: + * - RAC_BENCHMARK_STATUS_SUCCESS (0): Completed successfully + * - RAC_BENCHMARK_STATUS_ERROR (1): Failed + * - RAC_BENCHMARK_STATUS_TIMEOUT (2): Timed out + * - RAC_BENCHMARK_STATUS_CANCELLED (3): Cancelled + */ + int32_t status; + +} rac_benchmark_timing_t; + +// ============================================================================= +// BENCHMARK STATUS CODES +// ============================================================================= + +/** Benchmark request completed successfully */ +#define RAC_BENCHMARK_STATUS_SUCCESS ((int32_t)0) + +/** Benchmark request failed due to error */ +#define RAC_BENCHMARK_STATUS_ERROR ((int32_t)1) + +/** Benchmark request timed out */ +#define RAC_BENCHMARK_STATUS_TIMEOUT ((int32_t)2) + +/** Benchmark request was cancelled */ +#define RAC_BENCHMARK_STATUS_CANCELLED ((int32_t)3) + +// ============================================================================= +// MONOTONIC TIME API +// ============================================================================= + +/** + * Gets the current monotonic time in milliseconds. + * + * Uses std::chrono::steady_clock for accurate, monotonic timing that is not + * affected by system clock changes. The returned value is relative to a + * process-local epoch (the first call to this function). + * + * This function is thread-safe and lock-free on all supported platforms. + * + * @return Current monotonic time in milliseconds from process-local epoch + */ +RAC_API int64_t rac_monotonic_now_ms(void); + +// ============================================================================= +// UTILITY FUNCTIONS +// ============================================================================= + +/** + * Initializes a benchmark timing struct to zero values. 
+ * + * @param timing Pointer to timing struct to initialize + */ +RAC_API void rac_benchmark_timing_init(rac_benchmark_timing_t* timing); + +#ifdef __cplusplus +} +#endif + +#endif /* RAC_BENCHMARK_H */ diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h index 0947e60bb..82ef249d7 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h +++ b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h @@ -13,6 +13,7 @@ #define RAC_LLM_COMPONENT_H #include "rac/core/capabilities/rac_lifecycle.h" +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_error.h" #include "rac/features/llm/rac_llm_types.h" @@ -196,6 +197,42 @@ RAC_API rac_result_t rac_llm_component_generate_stream( rac_llm_component_complete_callback_fn complete_callback, rac_llm_component_error_callback_fn error_callback, void* user_data); +/** + * @brief Generate text with streaming and benchmark timing + * + * Same as rac_llm_component_generate_stream but with optional benchmark timing. + * When timing_out is non-NULL, captures detailed timing information: + * - t0: Request start (set at API entry) + * - t4: First token (set in token callback) + * - t6: Request end (set before complete callback) + * + * Backend timestamps (t2, t3, t5) are captured by the backend if it supports timing. + * + * Zero overhead when timing_out is NULL - behaves exactly like generate_stream. + * + * @param handle Component handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param token_callback Called for each generated token + * @param complete_callback Called when generation completes + * @param error_callback Called on error + * @param user_data User context passed to callbacks + * @param timing_out Output: Benchmark timing struct, caller-allocated. + * Must remain valid for the duration of the call. 
+ * Caller should initialize via rac_benchmark_timing_init() before passing. + * Component fills t0/t4/t6, backend fills t2/t3/t5. + * On success, all timing fields are populated. + * On failure, status is set but timing fields may be partial. + * Pass NULL to skip timing (zero overhead). + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_component_generate_stream_with_timing( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + rac_llm_component_token_callback_fn token_callback, + rac_llm_component_complete_callback_fn complete_callback, + rac_llm_component_error_callback_fn error_callback, void* user_data, + rac_benchmark_timing_t* timing_out); + /** * @brief Get lifecycle state * diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h index 74720f0c5..9b9960db1 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h +++ b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h @@ -10,6 +10,7 @@ #ifndef RAC_LLM_SERVICE_H #define RAC_LLM_SERVICE_H +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_error.h" #include "rac/features/llm/rac_llm_types.h" @@ -38,6 +39,21 @@ typedef struct rac_llm_service_ops { const rac_llm_options_t* options, rac_llm_stream_callback_fn callback, void* user_data); + /** + * Generate text with streaming callback and benchmark timing. + * Optional: backends that don't support timing can leave this NULL. + * If NULL, rac_llm_generate_stream_with_timing falls back to generate_stream. 
+ * + * Backends that implement this should capture: + * - t2: Before prefill (llama_decode for prompt) + * - t3: After prefill completes + * - t5: When decode loop exits (last token) + */ + rac_result_t (*generate_stream_with_timing)(void* impl, const char* prompt, + const rac_llm_options_t* options, + rac_llm_stream_callback_fn callback, void* user_data, + rac_benchmark_timing_t* timing_out); + /** Get service info */ rac_result_t (*get_info)(void* impl, rac_llm_info_t* out_info); @@ -117,6 +133,32 @@ RAC_API rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* pr const rac_llm_options_t* options, rac_llm_stream_callback_fn callback, void* user_data); +/** + * @brief Stream generate text with benchmark timing + * + * Same as rac_llm_generate_stream but with optional benchmark timing. + * If timing_out is non-NULL and the backend supports timing, captures: + * - t2: Before prefill + * - t3: After prefill + * - t5: Last token generated + * + * If the backend doesn't implement generate_stream_with_timing, falls back + * to generate_stream (timing_out will have t2/t3/t5 as zeros). 
+ * + * @param handle Service handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param callback Callback for each token + * @param user_data User context passed to callback + * @param timing_out Output: Benchmark timing (can be NULL for no timing) + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt, + const rac_llm_options_t* options, + rac_llm_stream_callback_fn callback, + void* user_data, + rac_benchmark_timing_t* timing_out); + /** * @brief Get service information * diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp index 27733806b..01da2b34c 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp @@ -557,6 +557,159 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques return !cancel_requested_.load(); } +bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationRequest& request, + TextStreamCallback callback, + int* out_prompt_tokens, + rac_benchmark_timing_t* timing_out) { + std::lock_guard lock(mutex_); + + if (!is_ready()) { + LOGE("Model not ready for generation"); + return false; + } + + cancel_requested_.store(false); + + std::string prompt = build_prompt(request); + LOGI("Generating with timing, prompt length: %zu", prompt.length()); + + const auto tokens_list = common_tokenize(context_, prompt, true, true); + + int n_ctx = llama_n_ctx(context_); + int prompt_tokens = static_cast(tokens_list.size()); + + if (out_prompt_tokens) { + *out_prompt_tokens = prompt_tokens; + } + + int available_tokens = n_ctx - prompt_tokens - 4; + + if (available_tokens <= 0) { + LOGE("Prompt too long: %d tokens, context size: %d", prompt_tokens, n_ctx); + return false; + } + + int 
effective_max_tokens = std::min(request.max_tokens, available_tokens); + if (effective_max_tokens < request.max_tokens) { + LOGI("Capping max_tokens: %d → %d (context=%d, prompt=%d tokens)", request.max_tokens, + effective_max_tokens, n_ctx, prompt_tokens); + } + LOGI("Generation with timing: prompt_tokens=%d, max_tokens=%d, context=%d", prompt_tokens, + effective_max_tokens, n_ctx); + + llama_batch batch = llama_batch_init(n_ctx, 0, 1); + + batch.n_tokens = 0; + for (size_t i = 0; i < tokens_list.size(); i++) { + common_batch_add(batch, tokens_list[i], i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = true; + + // t2: Record prefill start (before llama_decode for prompt) + if (timing_out != nullptr) { + timing_out->t2_prefill_start_ms = rac_monotonic_now_ms(); + } + + if (llama_decode(context_, batch) != 0) { + LOGE("llama_decode failed for prompt"); + if (timing_out != nullptr) { + int64_t now = rac_monotonic_now_ms(); + timing_out->t3_prefill_end_ms = now; + timing_out->t5_last_token_ms = now; + } + llama_batch_free(batch); + return false; + } + + // t3: Record prefill end (after llama_decode returns) + if (timing_out != nullptr) { + timing_out->t3_prefill_end_ms = rac_monotonic_now_ms(); + } + + llama_sampler_reset(sampler_); + + const auto vocab = llama_model_get_vocab(model_); + std::string cached_token_chars; + std::string accumulated_text; + int n_cur = batch.n_tokens; + int tokens_generated = 0; + + while (tokens_generated < effective_max_tokens && !cancel_requested_.load()) { + const llama_token new_token_id = llama_sampler_sample(sampler_, context_, -1); + + llama_sampler_accept(sampler_, new_token_id); + + if (llama_vocab_is_eog(vocab, new_token_id)) { + LOGI("End of generation token received"); + break; + } + + auto new_token_chars = common_token_to_piece(context_, new_token_id); + cached_token_chars += new_token_chars; + accumulated_text += new_token_chars; + + static const std::vector<std::string> stop_sequences = { + "<|im_end|>", + "<|eot_id|>", + "</s>", + 
"<|end|>", + "<|endoftext|>", + "\n\nUser:", + "\n\nHuman:", + }; + + bool hit_stop_sequence = false; + for (const auto& stop_seq : stop_sequences) { + size_t pos = accumulated_text.find(stop_seq); + if (pos != std::string::npos) { + LOGI("Stop sequence detected: %s", stop_seq.c_str()); + hit_stop_sequence = true; + break; + } + } + + if (hit_stop_sequence) { + break; + } + + if (is_valid_utf8(cached_token_chars.c_str())) { + if (!callback(cached_token_chars)) { + LOGI("Generation cancelled by callback"); + cancel_requested_.store(true); + break; + } + cached_token_chars.clear(); + } + + batch.n_tokens = 0; + common_batch_add(batch, new_token_id, n_cur, {0}, true); + + n_cur++; + tokens_generated++; + + if (llama_decode(context_, batch) != 0) { + LOGE("llama_decode failed during generation"); + break; + } + } + + // t5: Record last token time (decode loop exit) + if (timing_out != nullptr) { + timing_out->t5_last_token_ms = rac_monotonic_now_ms(); + } + + if (!cached_token_chars.empty() && is_valid_utf8(cached_token_chars.c_str())) { + callback(cached_token_chars); + } + + llama_memory_clear(llama_get_memory(context_), true); + + llama_batch_free(batch); + + LOGI("Generation with timing complete: %d tokens", tokens_generated); + return !cancel_requested_.load(); +} + void LlamaCppTextGeneration::cancel() { cancel_requested_.store(true); LOGI("Generation cancel requested"); diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h index 2d8deb065..29fa4de20 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h @@ -18,6 +18,8 @@ #include +#include "rac/core/rac_benchmark.h" + namespace runanywhere { // ============================================================================= @@ -117,6 +119,16 @@ class LlamaCppTextGeneration { } bool generate_stream(const TextGenerationRequest& 
request, TextStreamCallback callback, int* out_prompt_tokens); + + /** + * Generate text with streaming and benchmark timing. + * Captures t2 (prefill start), t3 (prefill end), t5 (last token). + * @param timing_out Benchmark timing struct (can be NULL for no timing) + */ + bool generate_stream_with_timing(const TextGenerationRequest& request, + TextStreamCallback callback, int* out_prompt_tokens, + rac_benchmark_timing_t* timing_out); + void cancel(); nlohmann::json get_model_info() const; diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp index 5123ab923..5aeeee662 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp @@ -64,6 +64,18 @@ static rac_result_t llamacpp_vtable_generate_stream(void* impl, const char* prom &adapter); } +// Generate stream with benchmark timing +static rac_result_t llamacpp_vtable_generate_stream_with_timing(void* impl, const char* prompt, + const rac_llm_options_t* options, + rac_llm_stream_callback_fn callback, + void* user_data, + rac_benchmark_timing_t* timing_out) { + StreamAdapter adapter = {callback, user_data}; + return rac_llm_llamacpp_generate_stream_with_timing(impl, prompt, options, + stream_adapter_callback, &adapter, + timing_out); +} + // Get info static rac_result_t llamacpp_vtable_get_info(void* impl, rac_llm_info_t* out_info) { if (!out_info) @@ -114,6 +126,7 @@ static const rac_llm_service_ops_t g_llamacpp_ops = { .initialize = llamacpp_vtable_initialize, .generate = llamacpp_vtable_generate, .generate_stream = llamacpp_vtable_generate_stream, + .generate_stream_with_timing = llamacpp_vtable_generate_stream_with_timing, .get_info = llamacpp_vtable_get_info, .cancel = llamacpp_vtable_cancel, .cleanup = llamacpp_vtable_cleanup, diff --git 
a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp index cde9dc275..a65f40291 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp @@ -225,6 +225,52 @@ rac_result_t rac_llm_llamacpp_generate_stream(rac_handle_t handle, const char* p return success ? RAC_SUCCESS : RAC_ERROR_INFERENCE_FAILED; } +rac_result_t rac_llm_llamacpp_generate_stream_with_timing(rac_handle_t handle, const char* prompt, + const rac_llm_options_t* options, + rac_llm_llamacpp_stream_callback_fn callback, + void* user_data, + rac_benchmark_timing_t* timing_out) { + if (handle == nullptr || prompt == nullptr || callback == nullptr) { + return RAC_ERROR_NULL_POINTER; + } + + auto* h = static_cast(handle); + if (!h->text_gen) { + return RAC_ERROR_INVALID_HANDLE; + } + + runanywhere::TextGenerationRequest request; + request.prompt = prompt; + if (options != nullptr) { + request.max_tokens = options->max_tokens; + request.temperature = options->temperature; + request.top_p = options->top_p; + if (options->stop_sequences != nullptr && options->num_stop_sequences > 0) { + for (int32_t i = 0; i < options->num_stop_sequences; i++) { + if (options->stop_sequences[i]) { + request.stop_sequences.push_back(options->stop_sequences[i]); + } + } + } + } + + // Stream using C++ class with timing + bool success = h->text_gen->generate_stream_with_timing( + request, + [callback, user_data](const std::string& token) -> bool { + return callback(token.c_str(), RAC_FALSE, user_data) == RAC_TRUE; + }, + nullptr, // out_prompt_tokens not needed, timing is captured internally + timing_out // Pass timing struct to backend + ); + + if (success) { + callback("", RAC_TRUE, user_data); // Final token + } + + return success ? 
 RAC_SUCCESS : RAC_ERROR_INFERENCE_FAILED; +} + void rac_llm_llamacpp_cancel(rac_handle_t handle) { if (handle == nullptr) { return; diff --git a/sdk/runanywhere-commons/src/core/rac_benchmark.cpp b/sdk/runanywhere-commons/src/core/rac_benchmark.cpp new file mode 100644 index 000000000..44f5840e1 --- /dev/null +++ b/sdk/runanywhere-commons/src/core/rac_benchmark.cpp @@ -0,0 +1,55 @@ +/** + * @file rac_benchmark.cpp + * @brief RunAnywhere Commons - Benchmark Timing Implementation + * + * Implements monotonic time helper and benchmark timing utilities. + * Uses std::chrono::steady_clock for accurate, cross-platform timing + * that is not affected by system clock adjustments. + */ + +#include "rac/core/rac_benchmark.h" + +#include <chrono> +#include <cstring> + +namespace { + +/** + * Process-local epoch for monotonic timing. + * Initialized on first call to rac_monotonic_now_ms(). + * Using a local epoch keeps timestamp values small and manageable. + */ +class MonotonicEpoch { + public: + static MonotonicEpoch& instance() { + static MonotonicEpoch epoch; + return epoch; + } + + int64_t elapsed_ms() const { + auto now = std::chrono::steady_clock::now(); + auto duration = now - epoch_; + return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(); + } + + private: + MonotonicEpoch() : epoch_(std::chrono::steady_clock::now()) {} + + std::chrono::steady_clock::time_point epoch_; +}; + +} // namespace + +extern "C" { + +int64_t rac_monotonic_now_ms(void) { + return MonotonicEpoch::instance().elapsed_ms(); +} + +void rac_benchmark_timing_init(rac_benchmark_timing_t* timing) { + if (timing != nullptr) { + std::memset(timing, 0, sizeof(rac_benchmark_timing_t)); + } +} + +} // extern "C" diff --git a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp index bbb7b51ac..7628b9459 100644 --- a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp +++ b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp @@ -17,6 +17,7 @@ 
#include "rac/core/capabilities/rac_lifecycle.h" #include "rac/core/rac_analytics_events.h" +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_logger.h" #include "rac/core/rac_platform_adapter.h" #include "rac/core/rac_structured_error.h" @@ -459,6 +460,9 @@ struct llm_stream_context { float temperature; int32_t max_tokens; int32_t token_count; // Track tokens for streaming updates + + // Benchmark timing (optional, NULL when not benchmarking) + rac_benchmark_timing_t* timing_out; }; /** @@ -472,6 +476,11 @@ static rac_bool_t llm_stream_token_callback(const char* token, void* user_data) ctx->first_token_recorded = true; ctx->first_token_time = std::chrono::steady_clock::now(); + // Record t4 (first token) for benchmark timing + if (ctx->timing_out != nullptr) { + ctx->timing_out->t4_first_token_ms = rac_monotonic_now_ms(); + } + // Calculate TTFT auto ttft_duration = std::chrono::duration_cast( ctx->first_token_time - ctx->start_time); @@ -618,6 +627,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream( ctx.temperature = effective_options->temperature; ctx.max_tokens = effective_options->max_tokens; ctx.token_count = 0; + ctx.timing_out = nullptr; // No benchmark timing for regular generate_stream // Perform streaming generation result = rac_llm_generate_stream(service, prompt, effective_options, llm_stream_token_callback, @@ -708,6 +718,231 @@ extern "C" rac_result_t rac_llm_component_generate_stream( return RAC_SUCCESS; } +extern "C" rac_result_t rac_llm_component_generate_stream_with_timing( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + rac_llm_component_token_callback_fn token_callback, + rac_llm_component_complete_callback_fn complete_callback, + rac_llm_component_error_callback_fn error_callback, void* user_data, + rac_benchmark_timing_t* timing_out) { + if (!handle) + return RAC_ERROR_INVALID_HANDLE; + if (!prompt) + return RAC_ERROR_INVALID_ARGUMENT; + + auto* component = reinterpret_cast(handle); + 
std::lock_guard lock(component->mtx); + + // Initialize timing if provided + if (timing_out != nullptr) { + rac_benchmark_timing_init(timing_out); + // Record t0 (request start) - first thing after validation + timing_out->t0_request_start_ms = rac_monotonic_now_ms(); + } + + // Generate unique ID for this generation + std::string generation_id = generate_unique_id(); + const char* model_id = rac_lifecycle_get_model_id(component->lifecycle); + const char* model_name = rac_lifecycle_get_model_name(component->lifecycle); + + // Get service from lifecycle manager + rac_handle_t service = nullptr; + rac_result_t result = rac_lifecycle_require_service(component->lifecycle, &service); + if (result != RAC_SUCCESS) { + log_error("LLM.Component", "No model loaded - cannot generate stream"); + + // Emit generation failed event + rac_analytics_event_data_t event = {}; + event.type = RAC_EVENT_LLM_GENERATION_FAILED; + event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT; + event.data.llm_generation.generation_id = generation_id.c_str(); + event.data.llm_generation.model_id = model_id; + event.data.llm_generation.model_name = model_name; + event.data.llm_generation.error_code = result; + event.data.llm_generation.error_message = "No model loaded"; + rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_FAILED, &event); + + if (timing_out != nullptr) { + timing_out->status = RAC_BENCHMARK_STATUS_ERROR; + } + + if (error_callback) { + error_callback(result, "No model loaded", user_data); + } + return result; + } + + // Check if streaming is supported + rac_llm_info_t info; + result = rac_llm_get_info(service, &info); + if (result != RAC_SUCCESS || (info.supports_streaming == 0)) { + log_error("LLM.Component", "Streaming not supported"); + + // Emit generation failed event + rac_analytics_event_data_t event = {}; + event.type = RAC_EVENT_LLM_GENERATION_FAILED; + event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT; + event.data.llm_generation.generation_id = 
generation_id.c_str(); + event.data.llm_generation.model_id = model_id; + event.data.llm_generation.model_name = model_name; + event.data.llm_generation.error_code = RAC_ERROR_NOT_SUPPORTED; + event.data.llm_generation.error_message = "Streaming not supported"; + rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_FAILED, &event); + + if (timing_out != nullptr) { + timing_out->status = RAC_BENCHMARK_STATUS_ERROR; + } + + if (error_callback) { + error_callback(RAC_ERROR_NOT_SUPPORTED, "Streaming not supported", user_data); + } + return RAC_ERROR_NOT_SUPPORTED; + } + + log_info("LLM.Component", "Starting streaming generation with timing"); + + // Get context_length from service info + int32_t context_length = info.context_length; + + // Use provided options or defaults + const rac_llm_options_t* effective_options = options ? options : &component->default_options; + + // Emit generation started event + { + rac_analytics_event_data_t event = {}; + event.type = RAC_EVENT_LLM_GENERATION_STARTED; + event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT; + event.data.llm_generation.generation_id = generation_id.c_str(); + event.data.llm_generation.model_id = model_id; + event.data.llm_generation.model_name = model_name; + event.data.llm_generation.is_streaming = RAC_TRUE; + event.data.llm_generation.framework = + static_cast(component->config.preferred_framework); + event.data.llm_generation.temperature = effective_options->temperature; + event.data.llm_generation.max_tokens = effective_options->max_tokens; + event.data.llm_generation.context_length = context_length; + rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_STARTED, &event); + } + + // Setup streaming context + llm_stream_context ctx; + ctx.token_callback = token_callback; + ctx.complete_callback = complete_callback; + ctx.error_callback = error_callback; + ctx.user_data = user_data; + ctx.start_time = std::chrono::steady_clock::now(); + ctx.first_token_recorded = false; + ctx.prompt_tokens = 
estimate_tokens(prompt);
+    ctx.generation_id = generation_id;
+    ctx.model_id = model_id;
+    ctx.model_name = model_name;
+    ctx.framework = static_cast(component->config.preferred_framework);
+    ctx.temperature = effective_options->temperature;
+    ctx.max_tokens = effective_options->max_tokens;
+    ctx.token_count = 0;
+    ctx.timing_out = timing_out;  // Pass timing for t4 capture in callback
+
+    // Perform streaming generation with timing
+    // Note: Backend timing (t2, t3, t5) will be captured if backend supports it
+    result = rac_llm_generate_stream_with_timing(service, prompt, effective_options,
+                                                 llm_stream_token_callback, &ctx, timing_out);
+
+    if (result != RAC_SUCCESS) {
+        log_error("LLM.Component", "Streaming generation failed");
+        rac_lifecycle_track_error(component->lifecycle, result, "generateStream");
+
+        // Emit generation failed event
+        rac_analytics_event_data_t event = {};
+        event.type = RAC_EVENT_LLM_GENERATION_FAILED;
+        event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT;
+        event.data.llm_generation.generation_id = generation_id.c_str();
+        event.data.llm_generation.model_id = model_id;
+        event.data.llm_generation.model_name = model_name;
+        event.data.llm_generation.error_code = result;
+        event.data.llm_generation.error_message = "Streaming generation failed";
+        rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_FAILED, &event);
+
+        if (timing_out != nullptr) {
+            timing_out->status = RAC_BENCHMARK_STATUS_ERROR;
+        }
+
+        if (error_callback) {
+            error_callback(result, "Streaming generation failed", user_data);
+        }
+        return result;
+    }
+
+    // Build final result for completion callback
+    auto end_time = std::chrono::steady_clock::now();
+    auto total_duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end_time - ctx.start_time);
+    int64_t total_time_ms = total_duration.count();
+
+    rac_llm_result_t final_result = {};
+    final_result.text = strdup(ctx.full_text.c_str());
+    final_result.prompt_tokens = ctx.prompt_tokens;
+    final_result.completion_tokens = estimate_tokens(ctx.full_text.c_str());
+    final_result.total_tokens = final_result.prompt_tokens + final_result.completion_tokens;
+    final_result.total_time_ms = total_time_ms;
+
+    double ttft_ms = 0.0;
+    // Calculate TTFT
+    if (ctx.first_token_recorded) {
+        auto ttft_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+            ctx.first_token_time - ctx.start_time);
+        final_result.time_to_first_token_ms = ttft_duration.count();
+        ttft_ms = static_cast<double>(ttft_duration.count());
+    }
+
+    // Calculate tokens per second
+    double tokens_per_second = 0.0;
+    if (final_result.total_time_ms > 0) {
+        tokens_per_second = static_cast<double>(final_result.completion_tokens) /
+                            (static_cast<double>(final_result.total_time_ms) / 1000.0);
+        final_result.tokens_per_second = static_cast(tokens_per_second);
+    }
+
+    // Record t6 (request end) before complete callback
+    if (timing_out != nullptr) {
+        timing_out->t6_request_end_ms = rac_monotonic_now_ms();
+        timing_out->prompt_tokens = final_result.prompt_tokens;
+        timing_out->output_tokens = final_result.completion_tokens;
+        timing_out->status = RAC_BENCHMARK_STATUS_SUCCESS;
+    }
+
+    if (complete_callback) {
+        complete_callback(&final_result, user_data);
+    }
+
+    // Emit generation completed event
+    {
+        rac_analytics_event_data_t event = {};
+        event.type = RAC_EVENT_LLM_GENERATION_COMPLETED;
+        event.data.llm_generation.generation_id = generation_id.c_str();
+        event.data.llm_generation.model_id = model_id;
+        event.data.llm_generation.model_name = model_name;
+        event.data.llm_generation.input_tokens = final_result.prompt_tokens;
+        event.data.llm_generation.output_tokens = final_result.completion_tokens;
+        event.data.llm_generation.duration_ms = static_cast(total_time_ms);
+        event.data.llm_generation.tokens_per_second = tokens_per_second;
+        event.data.llm_generation.is_streaming = RAC_TRUE;
+        event.data.llm_generation.time_to_first_token_ms = ttft_ms;
+        event.data.llm_generation.framework =
+            static_cast(component->config.preferred_framework);
+        event.data.llm_generation.temperature = effective_options->temperature;
+        event.data.llm_generation.max_tokens = effective_options->max_tokens;
+        event.data.llm_generation.context_length = context_length;
+        event.data.llm_generation.error_code = RAC_SUCCESS;
+        rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_COMPLETED, &event);
+    }
+
+    // Free the duplicated text
+    free(final_result.text);
+
+    log_info("LLM.Component", "Streaming generation with timing completed");
+
+    return RAC_SUCCESS;
+}
+
 extern "C" rac_result_t rac_llm_component_cancel(rac_handle_t handle) {
     if (!handle)
         return RAC_ERROR_INVALID_HANDLE;
diff --git a/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp b/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
index 14fe472b3..6867a4151 100644
--- a/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
@@ -122,6 +122,33 @@ rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* prompt,
     return service->ops->generate_stream(service->impl, prompt, options, callback, user_data);
 }
 
+rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt,
+                                                 const rac_llm_options_t* options,
+                                                 rac_llm_stream_callback_fn callback,
+                                                 void* user_data,
+                                                 rac_benchmark_timing_t* timing_out) {
+    if (!handle || !prompt || !callback)
+        return RAC_ERROR_NULL_POINTER;
+
+    auto* service = static_cast(handle);
+    if (!service->ops) {
+        return RAC_ERROR_NOT_SUPPORTED;
+    }
+
+    // If backend implements timing-aware streaming, use it
+    if (service->ops->generate_stream_with_timing) {
+        return service->ops->generate_stream_with_timing(service->impl, prompt, options, callback,
+                                                         user_data, timing_out);
+    }
+
+    // Fallback to regular streaming (timing_out won't have t2/t3/t5)
+    if (service->ops->generate_stream) {
+        return service->ops->generate_stream(service->impl, prompt, options, callback, user_data);
+    }
+
+    return RAC_ERROR_NOT_SUPPORTED;
+}
+
 rac_result_t rac_llm_get_info(rac_handle_t handle, rac_llm_info_t* out_info) {
     if (!handle || !out_info)
         return RAC_ERROR_NULL_POINTER;
diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
index da2e337a4..a3b37d431 100644
--- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
+++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
@@ -24,6 +24,7 @@
 // Include runanywhere-commons C API headers
 #include "rac/core/rac_analytics_events.h"
 #include "rac/core/rac_audio_utils.h"
+#include "rac/core/rac_benchmark.h"
 #include "rac/core/rac_core.h"
 #include "rac/core/rac_error.h"
 #include "rac/core/rac_logger.h"
@@ -981,6 +982,135 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
     return env->NewStringUTF(json.c_str());
 }
 
+// ========================================================================
+// STREAMING WITH KOTLIN CALLBACK AND BENCHMARK TIMING
+// ========================================================================
+
+JNIEXPORT jstring JNICALL
+Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerateStreamWithTiming(
+    JNIEnv* env, jclass clazz, jlong handle, jstring prompt, jstring configJson,
+    jobject tokenCallback) {
+    LOGi("racLlmComponentGenerateStreamWithTiming called with handle=%lld", (long long)handle);
+
+    if (handle == 0) {
+        LOGe("racLlmComponentGenerateStreamWithTiming: invalid handle");
+        return nullptr;
+    }
+
+    if (!tokenCallback) {
+        LOGe("racLlmComponentGenerateStreamWithTiming: null callback");
+        return nullptr;
+    }
+
+    std::string promptStr = getCString(env, prompt);
+    LOGi("racLlmComponentGenerateStreamWithTiming prompt length=%zu", promptStr.length());
+
+    std::string configStorage;
+    const char* config = getNullableCString(env, configJson, configStorage);
+
+    // Get JVM and callback method
+    JavaVM* jvm = nullptr;
+    env->GetJavaVM(&jvm);
+
+    jclass callbackClass = env->GetObjectClass(tokenCallback);
+    jmethodID onTokenMethod = env->GetMethodID(callbackClass, "onToken", "(Ljava/lang/String;)Z");
+
+    if (!onTokenMethod) {
+        LOGe("racLlmComponentGenerateStreamWithTiming: could not find onToken method");
+        return nullptr;
+    }
+
+    // Create global ref to callback to ensure it survives across threads
+    jobject globalCallback = env->NewGlobalRef(tokenCallback);
+
+    // Parse config for options
+    rac_llm_options_t options = {};
+    options.max_tokens = 512;
+    options.temperature = 0.7f;
+    options.top_p = 1.0f;
+    options.streaming_enabled = RAC_TRUE;
+
+    // Create streaming callback context
+    LLMStreamCallbackContext ctx;
+    ctx.jvm = jvm;
+    ctx.callback = globalCallback;
+    ctx.onTokenMethod = onTokenMethod;
+
+    // Initialize benchmark timing struct
+    rac_benchmark_timing_t timing = {};
+    rac_benchmark_timing_init(&timing);
+
+    LOGi("racLlmComponentGenerateStreamWithTiming calling rac_llm_component_generate_stream_with_timing...");
+
+    rac_result_t status = rac_llm_component_generate_stream_with_timing(
+        reinterpret_cast<rac_handle_t>(handle), promptStr.c_str(), &options,
+        llm_stream_callback_token, llm_stream_callback_complete, llm_stream_callback_error, &ctx,
+        &timing);
+
+    // Clean up global ref
+    env->DeleteGlobalRef(globalCallback);
+
+    if (status != RAC_SUCCESS) {
+        LOGe("rac_llm_component_generate_stream_with_timing failed with status=%d", status);
+        return nullptr;
+    }
+
+    if (ctx.has_error) {
+        LOGe("Streaming with timing failed: %s", ctx.error_message.c_str());
+        return nullptr;
+    }
+
+    LOGi("racLlmComponentGenerateStreamWithTiming result text length=%zu, tokens=%d",
+         ctx.accumulated_text.length(), ctx.token_count);
+
+    // Build JSON result with timing
+    std::string json = "{";
+    json += "\"text\":\"";
+    for (char c : ctx.accumulated_text) {
+        switch (c) {
+            case '"':
+                json += "\\\"";
+                break;
+            case '\\':
+                json += "\\\\";
+                break;
+            case '\n':
+                json += "\\n";
+                break;
+            case '\r':
+                json += "\\r";
+                break;
+            case '\t':
+                json += "\\t";
+                break;
+            default:
+                json += c;
+                break;
+        }
+    }
+    json += "\",";
+    json += "\"tokens_generated\":" + std::to_string(ctx.final_result.completion_tokens) + ",";
+    json += "\"tokens_evaluated\":" + std::to_string(ctx.final_result.prompt_tokens) + ",";
+    json += "\"stop_reason\":" + std::to_string(0) + ",";
+    json += "\"total_time_ms\":" + std::to_string(ctx.final_result.total_time_ms) + ",";
+    json += "\"tokens_per_second\":" + std::to_string(ctx.final_result.tokens_per_second) + ",";
+    // Add benchmark timing fields
+    json += "\"t0_request_start_ms\":" + std::to_string(timing.t0_request_start_ms) + ",";
+    json += "\"t2_prefill_start_ms\":" + std::to_string(timing.t2_prefill_start_ms) + ",";
+    json += "\"t3_prefill_end_ms\":" + std::to_string(timing.t3_prefill_end_ms) + ",";
+    json += "\"t4_first_token_ms\":" + std::to_string(timing.t4_first_token_ms) + ",";
+    json += "\"t5_last_token_ms\":" + std::to_string(timing.t5_last_token_ms) + ",";
+    json += "\"t6_request_end_ms\":" + std::to_string(timing.t6_request_end_ms) + ",";
+    json += "\"prompt_tokens\":" + std::to_string(timing.prompt_tokens) + ",";
+    json += "\"output_tokens\":" + std::to_string(timing.output_tokens) + ",";
+    json += "\"benchmark_status\":" + std::to_string(timing.status);
+    json += "}";
+
+    LOGi("racLlmComponentGenerateStreamWithTiming returning JSON: %zu bytes", json.length());
+
+    return env->NewStringUTF(json.c_str());
+}
+
 JNIEXPORT void JNICALL
 Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentCancel(JNIEnv* env,
                                                                                jclass clazz,