Skip to content
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions sdk/runanywhere-commons/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ set(RAC_CORE_SOURCES
src/core/rac_core.cpp
src/core/rac_error.cpp
src/core/rac_time.cpp
src/core/rac_benchmark.cpp
src/core/rac_memory.cpp
src/core/rac_logger.cpp
src/core/rac_audio_utils.cpp
Expand Down
22 changes: 22 additions & 0 deletions sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#ifndef RAC_LLM_LLAMACPP_H
#define RAC_LLM_LLAMACPP_H

#include "rac/core/rac_benchmark.h"
#include "rac/core/rac_error.h"
#include "rac/core/rac_types.h"
#include "rac/features/llm/rac_llm.h"
Expand Down Expand Up @@ -163,6 +164,27 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream(
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
rac_llm_llamacpp_stream_callback_fn callback, void* user_data);

/**
 * Generates text with a streaming callback and records backend benchmark timing.
 *
 * Same as rac_llm_llamacpp_generate_stream, but additionally captures the
 * backend-side timestamps of rac_benchmark_timing_t:
 * - t2 (t2_prefill_start_ms): Before prefill (llama_decode for prompt batch)
 * - t3 (t3_prefill_end_ms):   After prefill completes
 * - t5 (t5_last_token_ms):    When decode loop exits (last token)
 *
 * Timestamps are in milliseconds from a process-local epoch and are comparable
 * with values returned by rac_monotonic_now_ms().
 *
 * NOTE(review): rac_benchmark.h documents t0, t4 and t6 as recorded by the
 * component layer; presumably this function leaves those fields untouched -
 * confirm against the implementation.
 *
 * @param handle Service handle
 * @param prompt Input prompt text
 * @param options Generation options
 * @param callback Callback for each token
 * @param user_data User context passed to callback
 * @param timing_out Output: Benchmark timing (can be NULL for no timing)
 * @return RAC_SUCCESS or error code
 */
RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing(
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
rac_llm_llamacpp_stream_callback_fn callback, void* user_data,
rac_benchmark_timing_t* timing_out);

/**
* Cancels ongoing generation.
*
Expand Down
126 changes: 126 additions & 0 deletions sdk/runanywhere-commons/include/rac/core/rac_benchmark.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/**
* @file rac_benchmark.h
* @brief RunAnywhere Commons - Benchmark Timing Support
*
* This header provides types and functions for benchmark timing instrumentation.
* The timing struct captures key timestamps during LLM inference for performance
* measurement and analysis.
*
* Design principles:
* - Zero overhead when not benchmarking: timing is opt-in via pointer parameter
* - Monotonic clock: uses steady_clock for accurate cross-platform timing
* - All timestamps are relative to a process-local epoch (not wall-clock)
*/

#ifndef RAC_BENCHMARK_H
#define RAC_BENCHMARK_H

#include "rac/core/rac_types.h"

#ifdef __cplusplus
extern "C" {
#endif

// =============================================================================
// BENCHMARK TIMING STRUCT
// =============================================================================

/**
 * Per-request timing record for LLM inference benchmarks.
 *
 * Recorded points, with the layer that records each one:
 * - t0 (component): request start, at component API entry
 * - t2 (backend):   prefill start, before llama_decode for the prompt
 * - t3 (backend):   prefill end, after llama_decode returns
 * - t4 (component): first token, at the first token callback
 * - t5 (backend):   last token, when the decode loop exits
 * - t6 (component): request end, before the complete callback
 *
 * There is no t1: the numbering intentionally skips it to match the
 * specification.
 *
 * Every *_ms field is a millisecond value measured from a process-local
 * epoch. Such values are comparable with rac_monotonic_now_ms() results,
 * but they are not wall-clock times.
 */
typedef struct rac_benchmark_timing {
    /* Timestamps (milliseconds from the process-local epoch). */
    int64_t t0_request_start_ms; /**< t0: request start, at component API entry */
    int64_t t2_prefill_start_ms; /**< t2: prefill start, before llama_decode for prompt batch */
    int64_t t3_prefill_end_ms;   /**< t3: prefill end, after llama_decode returns for prompt */
    int64_t t4_first_token_ms;   /**< t4: first token, when first token callback is invoked */
    int64_t t5_last_token_ms;    /**< t5: last token, when decode loop exits */
    int64_t t6_request_end_ms;   /**< t6: request end, before complete callback */

    /* Token counters. */
    int32_t prompt_tokens; /**< Number of tokens in the prompt */
    int32_t output_tokens; /**< Number of tokens generated */

    /**
     * Outcome of the request: 0 means success, any non-zero value is an
     * error code (from rac_result_t).
     *
     * NOTE(review): this header also defines RAC_BENCHMARK_STATUS_* constants
     * (0..3); confirm which set of values producers actually store here.
     */
    int32_t status;
} rac_benchmark_timing_t;

// =============================================================================
// BENCHMARK STATUS CODES
// =============================================================================

/** Status value: the benchmarked request finished without error. */
#define RAC_BENCHMARK_STATUS_SUCCESS   ((int32_t)0)

/** Status value: the benchmarked request failed with an error. */
#define RAC_BENCHMARK_STATUS_ERROR     ((int32_t)1)

/** Status value: the benchmarked request did not finish in time. */
#define RAC_BENCHMARK_STATUS_TIMEOUT   ((int32_t)2)

/** Status value: the benchmarked request was cancelled before completion. */
#define RAC_BENCHMARK_STATUS_CANCELLED ((int32_t)3)

// =============================================================================
// MONOTONIC TIME API
// =============================================================================

/**
 * Gets the current monotonic time in milliseconds.
 *
 * Uses std::chrono::steady_clock for accurate, monotonic timing that is not
 * affected by system clock changes. The returned value is relative to a
 * process-local epoch (the first call to this function), so it is not a
 * wall-clock time; use it for differences between timestamps taken in the
 * same process, such as the *_ms fields of rac_benchmark_timing_t.
 *
 * This function is thread-safe and lock-free on all supported platforms.
 *
 * @return Current monotonic time in milliseconds from process-local epoch
 */
RAC_API int64_t rac_monotonic_now_ms(void);

// =============================================================================
// UTILITY FUNCTIONS
// =============================================================================

/**
 * Initializes a benchmark timing struct to zero values.
 *
 * NOTE(review): behavior when timing is NULL is not documented here; confirm
 * against the implementation before passing NULL.
 *
 * @param timing Pointer to timing struct to initialize
 */
RAC_API void rac_benchmark_timing_init(rac_benchmark_timing_t* timing);

#ifdef __cplusplus
}
#endif

#endif /* RAC_BENCHMARK_H */
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#define RAC_LLM_COMPONENT_H

#include "rac/core/capabilities/rac_lifecycle.h"
#include "rac/core/rac_benchmark.h"
#include "rac/core/rac_error.h"
#include "rac/features/llm/rac_llm_types.h"

Expand Down Expand Up @@ -196,6 +197,36 @@ RAC_API rac_result_t rac_llm_component_generate_stream(
rac_llm_component_complete_callback_fn complete_callback,
rac_llm_component_error_callback_fn error_callback, void* user_data);

/**
 * @brief Generate text with streaming and benchmark timing
 *
 * Same as rac_llm_component_generate_stream but with optional benchmark timing.
 * When timing_out is non-NULL, captures detailed timing information at the
 * component level:
 * - t0 (t0_request_start_ms): Request start (set at API entry)
 * - t4 (t4_first_token_ms):   First token (set in token callback)
 * - t6 (t6_request_end_ms):   Request end (set before complete callback)
 *
 * Backend timestamps (t2, t3, t5) are captured by the backend if it supports timing.
 *
 * Zero overhead when timing_out is NULL - behaves exactly like generate_stream.
 *
 * @param handle Component handle
 * @param prompt Input prompt
 * @param options Generation options (can be NULL for defaults)
 * @param token_callback Called for each generated token
 * @param complete_callback Called when generation completes
 * @param error_callback Called on error
 * @param user_data User context passed to callbacks
 * @param timing_out Output: Benchmark timing (can be NULL for no timing)
 * @return RAC_SUCCESS or error code
 */
RAC_API rac_result_t rac_llm_component_generate_stream_with_timing(
rac_handle_t handle, const char* prompt, const rac_llm_options_t* options,
rac_llm_component_token_callback_fn token_callback,
rac_llm_component_complete_callback_fn complete_callback,
rac_llm_component_error_callback_fn error_callback, void* user_data,
rac_benchmark_timing_t* timing_out);

/**
* @brief Get lifecycle state
*
Expand Down
42 changes: 42 additions & 0 deletions sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#ifndef RAC_LLM_SERVICE_H
#define RAC_LLM_SERVICE_H

#include "rac/core/rac_benchmark.h"
#include "rac/core/rac_error.h"
#include "rac/features/llm/rac_llm_types.h"

Expand Down Expand Up @@ -38,6 +39,21 @@ typedef struct rac_llm_service_ops {
const rac_llm_options_t* options,
rac_llm_stream_callback_fn callback, void* user_data);

/**
 * Generate text with streaming callback and benchmark timing.
 * Optional: backends that don't support timing can leave this NULL.
 * If NULL, rac_llm_generate_stream_with_timing falls back to generate_stream.
 *
 * Backends that implement this should capture, into timing_out:
 * - t2 (t2_prefill_start_ms): Before prefill (llama_decode for prompt)
 * - t3 (t3_prefill_end_ms):   After prefill completes
 * - t5 (t5_last_token_ms):    When decode loop exits (last token)
 *
 * NOTE(review): the public wrapper documents timing_out as nullable;
 * presumably implementations must tolerate a NULL timing_out - confirm.
 */
rac_result_t (*generate_stream_with_timing)(void* impl, const char* prompt,
const rac_llm_options_t* options,
rac_llm_stream_callback_fn callback, void* user_data,
rac_benchmark_timing_t* timing_out);

/** Get service info */
rac_result_t (*get_info)(void* impl, rac_llm_info_t* out_info);

Expand Down Expand Up @@ -117,6 +133,32 @@ RAC_API rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* pr
const rac_llm_options_t* options,
rac_llm_stream_callback_fn callback, void* user_data);

/**
 * @brief Stream generate text with benchmark timing
 *
 * Same as rac_llm_generate_stream but with optional benchmark timing.
 * If timing_out is non-NULL and the backend supports timing, captures:
 * - t2 (t2_prefill_start_ms): Before prefill
 * - t3 (t3_prefill_end_ms):   After prefill
 * - t5 (t5_last_token_ms):    Last token generated
 *
 * If the backend doesn't implement generate_stream_with_timing, falls back
 * to generate_stream (timing_out will have t2/t3/t5 as zeros).
 *
 * NOTE(review): presumably the zeros in the fallback case come from the caller
 * having initialized the struct (e.g. with rac_benchmark_timing_init) - confirm
 * whether this function zeroes those fields itself.
 *
 * @param handle Service handle
 * @param prompt Input prompt
 * @param options Generation options (can be NULL for defaults)
 * @param callback Callback for each token
 * @param user_data User context passed to callback
 * @param timing_out Output: Benchmark timing (can be NULL for no timing)
 * @return RAC_SUCCESS or error code
 */
RAC_API rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt,
const rac_llm_options_t* options,
rac_llm_stream_callback_fn callback,
void* user_data,
rac_benchmark_timing_t* timing_out);

/**
* @brief Get service information
*
Expand Down
Loading
Loading