RunanywhereAI · abhisekupadhyaya · Feb 5, 2026 · Feb 5, 2026
diff --git a/sdk/runanywhere-commons/CMakeLists.txt b/sdk/runanywhere-commons/CMakeLists.txt
@@ -129,6 +129,7 @@ set(RAC_CORE_SOURCES
     src/core/rac_core.cpp
     src/core/rac_error.cpp
     src/core/rac_time.cpp
+    src/core/rac_benchmark.cpp
     src/core/rac_memory.cpp
     src/core/rac_logger.cpp
     src/core/rac_audio_utils.cpp

diff --git a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h
@@ -11,6 +11,7 @@
 #ifndef RAC_LLM_LLAMACPP_H
 #define RAC_LLM_LLAMACPP_H
 
+#include "rac/core/rac_benchmark.h"
 #include "rac/core/rac_error.h"
 #include "rac/core/rac_types.h"
 #include "rac/features/llm/rac_llm.h"
@@ -163,6 +164,27 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream(
     rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
     rac_llm_llamacpp_stream_callback_fn callback, void* user_data);
 
+/**
+ * Generates text with a streaming callback and records backend benchmark timing.
+ *
+ * Same as rac_llm_llamacpp_generate_stream but captures benchmark timing in timing_out:
+ * - t2 (t2_prefill_start_ms): Before prefill (llama_decode for prompt batch)
+ * - t3 (t3_prefill_end_ms): After prefill completes
+ * - t5 (t5_last_token_ms): When decode loop exits (last token)
+ *
+ * @param handle Service handle
+ * @param prompt Input prompt text
+ * @param options Generation options
+ * @param callback Callback invoked for each token
+ * @param user_data User context passed to callback
+ * @param timing_out Output: benchmark timing to fill (can be NULL for no timing)
+ * @return RAC_SUCCESS or error code
+ */
+RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing(
+    rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
+    rac_llm_llamacpp_stream_callback_fn callback, void* user_data,
+    rac_benchmark_timing_t* timing_out);
+
 /**
  * Cancels ongoing generation.
  *

diff --git a/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h b/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h
@@ -0,0 +1,126 @@
+/**
+ * @file rac_benchmark.h
+ * @brief RunAnywhere Commons - Benchmark Timing Support
+ *
+ * This header provides types and functions for benchmark timing instrumentation.
+ * The timing struct captures key timestamps during LLM inference for performance
+ * measurement and analysis.
+ *
+ * Design principles:
+ * - Zero overhead when not benchmarking: timing is opt-in via pointer parameter
+ * - Monotonic clock: uses steady_clock for accurate cross-platform timing
+ * - All timestamps are relative to a process-local epoch (not wall-clock)
+ */
+
+#ifndef RAC_BENCHMARK_H
+#define RAC_BENCHMARK_H
+
+#include "rac/core/rac_types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// =============================================================================
+// BENCHMARK TIMING STRUCT
+// =============================================================================
+
+/**
+ * Benchmark timing structure for LLM inference.
+ *
+ * Captures timestamps at key points during inference:
+ * - t0: Request start (component API entry)
+ * - t2: Prefill start (backend, before llama_decode for prompt)
+ * - t3: Prefill end (backend, after llama_decode returns)
+ * - t4: First token (component, first token callback)
+ * - t5: Last token (backend, decode loop exits)
+ * - t6: Request end (component, before complete callback)
+ *
+ * All timestamps are in milliseconds from a process-local epoch.
+ * Use rac_monotonic_now_ms() to get comparable timestamps.
+ *
+ * Note: t1 is intentionally skipped to match the specification.
+ */
+typedef struct rac_benchmark_timing {
+    /** t0: Request start (ms) - recorded by the component at API entry */
+    int64_t t0_request_start_ms;
+
+    /** t2: Prefill start (ms) - recorded by the backend before llama_decode for prompt batch */
+    int64_t t2_prefill_start_ms;
+
+    /** t3: Prefill end (ms) - recorded by the backend after llama_decode returns for prompt */
+    int64_t t3_prefill_end_ms;
+
+    /** t4: First token (ms) - recorded by the component when first token callback is invoked */
+    int64_t t4_first_token_ms;
+
+    /** t5: Last token (ms) - recorded by the backend when decode loop exits */
+    int64_t t5_last_token_ms;
+
+    /** t6: Request end (ms) - recorded by the component before complete callback */
+    int64_t t6_request_end_ms;
+
+    /** Number of tokens in the prompt */
+    int32_t prompt_tokens;
+
+    /** Number of tokens generated */
+    int32_t output_tokens;
+
+    /**
+     * Status of the request: 0 on success, non-zero error code (from rac_result_t).
+     * NOTE(review): this header also defines RAC_BENCHMARK_STATUS_* (0-3);
+     * confirm which code space writers of this field actually use.
+     */
+    int32_t status;
+
+} rac_benchmark_timing_t;
+
+// =============================================================================
+// BENCHMARK STATUS CODES
+// =============================================================================
+
+/** Benchmark request completed successfully (the 0 value of rac_benchmark_timing_t.status) */
+#define RAC_BENCHMARK_STATUS_SUCCESS ((int32_t)0)
+
+/** Benchmark request failed due to an error */
+#define RAC_BENCHMARK_STATUS_ERROR ((int32_t)1)
+
+/** Benchmark request timed out before completing */
+#define RAC_BENCHMARK_STATUS_TIMEOUT ((int32_t)2)
+
+/** Benchmark request was cancelled before completing */
+#define RAC_BENCHMARK_STATUS_CANCELLED ((int32_t)3)
+
+// =============================================================================
+// MONOTONIC TIME API
+// =============================================================================
+
+/**
+ * Gets the current monotonic time in milliseconds.
+ *
+ * Uses std::chrono::steady_clock for accurate, monotonic timing that is not
+ * affected by system clock changes. The returned value is relative to a
+ * process-local epoch (the first call to this function), not to wall-clock time.
+ *
+ * This function is thread-safe and lock-free on all supported platforms.
+ *
+ * @return Milliseconds since the process-local epoch (comparable only within this process)
+ */
+RAC_API int64_t rac_monotonic_now_ms(void);
+
+// =============================================================================
+// UTILITY FUNCTIONS
+// =============================================================================
+
+/**
+ * Initializes a benchmark timing struct to zero values.
+ *
+ * @param timing Pointer to timing struct to initialize (NULL handling undocumented - TODO confirm)
+ */
+RAC_API void rac_benchmark_timing_init(rac_benchmark_timing_t* timing);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* RAC_BENCHMARK_H */
diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h
@@ -13,6 +13,7 @@
 #define RAC_LLM_COMPONENT_H
 
 #include "rac/core/capabilities/rac_lifecycle.h"
+#include "rac/core/rac_benchmark.h"
 #include "rac/core/rac_error.h"
 #include "rac/features/llm/rac_llm_types.h"
 
@@ -196,6 +197,36 @@ RAC_API rac_result_t rac_llm_component_generate_stream(
     rac_llm_component_complete_callback_fn complete_callback,
     rac_llm_component_error_callback_fn error_callback, void* user_data);
 
+/**
+ * @brief Generate text with streaming and optional benchmark timing
+ *
+ * Same as rac_llm_component_generate_stream but with optional benchmark timing.
+ * When timing_out is non-NULL, the component captures these timestamps:
+ * - t0: Request start (set at API entry)
+ * - t4: First token (set in token callback)
+ * - t6: Request end (set before complete callback)
+ *
+ * Backend timestamps (t2, t3, t5) are captured by the backend if it supports timing.
+ *
+ * Zero overhead when timing_out is NULL - behaves exactly like generate_stream.
+ *
+ * @param handle Component handle
+ * @param prompt Input prompt
+ * @param options Generation options (can be NULL for defaults)
+ * @param token_callback Called for each generated token
+ * @param complete_callback Called when generation completes
+ * @param error_callback Called on error
+ * @param user_data User context passed to callbacks
+ * @param timing_out Output: benchmark timing to fill (can be NULL for no timing)
+ * @return RAC_SUCCESS or error code
+ */
+RAC_API rac_result_t rac_llm_component_generate_stream_with_timing(
+    rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
+    rac_llm_component_token_callback_fn token_callback,
+    rac_llm_component_complete_callback_fn complete_callback,
+    rac_llm_component_error_callback_fn error_callback, void* user_data,
+    rac_benchmark_timing_t* timing_out);
+
 /**
  * @brief Get lifecycle state
  *

diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h
@@ -10,6 +10,7 @@
 #ifndef RAC_LLM_SERVICE_H
 #define RAC_LLM_SERVICE_H
 
+#include "rac/core/rac_benchmark.h"
 #include "rac/core/rac_error.h"
 #include "rac/features/llm/rac_llm_types.h"
 
@@ -38,6 +39,21 @@ typedef struct rac_llm_service_ops {
                                     const rac_llm_options_t* options,
                                     rac_llm_stream_callback_fn callback, void* user_data);
 
+    /**
+     * Generate text with streaming callback and benchmark timing.
+     * Optional: backends that don't support timing can leave this NULL.
+     * If NULL, rac_llm_generate_stream_with_timing falls back to generate_stream.
+     *
+     * Backends that implement this should capture:
+     * - t2: Before prefill (llama_decode for prompt)
+     * - t3: After prefill completes
+     * - t5: When decode loop exits (last token)
+     */
+    rac_result_t (*generate_stream_with_timing)(void* impl, const char* prompt,
+                                                const rac_llm_options_t* options,
+                                                rac_llm_stream_callback_fn callback, void* user_data,
+                                                rac_benchmark_timing_t* timing_out);
+
     /** Get service info */
     rac_result_t (*get_info)(void* impl, rac_llm_info_t* out_info);
 
@@ -117,6 +133,32 @@ RAC_API rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* pr
                                              const rac_llm_options_t* options,
                                              rac_llm_stream_callback_fn callback, void* user_data);
 
+/**
+ * @brief Stream generate text with optional benchmark timing
+ *
+ * Same as rac_llm_generate_stream but with optional benchmark timing.
+ * If timing_out is non-NULL and the backend supports timing, captures:
+ * - t2: Before prefill
+ * - t3: After prefill
+ * - t5: Last token generated
+ *
+ * If the backend doesn't implement generate_stream_with_timing, falls back
+ * to generate_stream (timing_out will have t2/t3/t5 as zeros).
+ *
+ * @param handle Service handle
+ * @param prompt Input prompt
+ * @param options Generation options (can be NULL for defaults)
+ * @param callback Callback invoked for each token
+ * @param user_data User context passed to callback
+ * @param timing_out Output: benchmark timing to fill (can be NULL for no timing)
+ * @return RAC_SUCCESS or error code
+ */
+RAC_API rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt,
+                                                         const rac_llm_options_t* options,
+                                                         rac_llm_stream_callback_fn callback,
+                                                         void* user_data,
+                                                         rac_benchmark_timing_t* timing_out);
+
 /**
  * @brief Get service information
  *