diff --git a/sdk/runanywhere-commons/CMakeLists.txt b/sdk/runanywhere-commons/CMakeLists.txt index b958a86dd..1311257f4 100644 --- a/sdk/runanywhere-commons/CMakeLists.txt +++ b/sdk/runanywhere-commons/CMakeLists.txt @@ -129,6 +129,7 @@ set(RAC_CORE_SOURCES src/core/rac_core.cpp src/core/rac_error.cpp src/core/rac_time.cpp + src/core/rac_benchmark.cpp src/core/rac_memory.cpp src/core/rac_logger.cpp src/core/rac_audio_utils.cpp diff --git a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h index 41ed7cacf..bce490311 100644 --- a/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h +++ b/sdk/runanywhere-commons/include/rac/backends/rac_llm_llamacpp.h @@ -11,6 +11,7 @@ #ifndef RAC_LLM_LLAMACPP_H #define RAC_LLM_LLAMACPP_H +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_error.h" #include "rac/core/rac_types.h" #include "rac/features/llm/rac_llm.h" @@ -163,6 +164,32 @@ RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream( rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, rac_llm_llamacpp_stream_callback_fn callback, void* user_data); +/** + * Generates text with streaming callback and benchmark timing. + * + * Same as rac_llm_llamacpp_generate_stream but captures benchmark timing: + * - t2: Before prefill (llama_decode for prompt batch) + * - t3: After prefill completes + * - t5: When decode loop exits (last token) + * + * @param handle Service handle + * @param prompt Input prompt text + * @param options Generation options + * @param callback Callback for each token + * @param user_data User context passed to callback + * @param timing_out Output: Benchmark timing struct, caller-allocated. + * Must remain valid for the duration of the call. + * Caller should initialize via rac_benchmark_timing_init() before passing. + * On success, all t2/t3/t5 fields are populated. + * On failure, status is set but timing fields may be partial. 
+ * Pass NULL to skip timing (zero overhead). + * @return RAC_SUCCESS or error code + */ +RAC_LLAMACPP_API rac_result_t rac_llm_llamacpp_generate_stream_with_timing( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + rac_llm_llamacpp_stream_callback_fn callback, void* user_data, + rac_benchmark_timing_t* timing_out); + /** * Cancels ongoing generation. * diff --git a/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h b/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h new file mode 100644 index 000000000..d0ac8ec39 --- /dev/null +++ b/sdk/runanywhere-commons/include/rac/core/rac_benchmark.h @@ -0,0 +1,129 @@ +/** + * @file rac_benchmark.h + * @brief RunAnywhere Commons - Benchmark Timing Support + * + * This header provides types and functions for benchmark timing instrumentation. + * The timing struct captures key timestamps during LLM inference for performance + * measurement and analysis. + * + * Design principles: + * - Zero overhead when not benchmarking: timing is opt-in via pointer parameter + * - Monotonic clock: uses steady_clock for accurate cross-platform timing + * - All timestamps are relative to a process-local epoch (not wall-clock) + */ + +#ifndef RAC_BENCHMARK_H +#define RAC_BENCHMARK_H + +#include "rac/core/rac_types.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// ============================================================================= +// BENCHMARK TIMING STRUCT +// ============================================================================= + +/** + * Benchmark timing structure for LLM inference. 
+ * + * Captures timestamps at key points during inference: + * - t0: Request start (component API entry) + * - t2: Prefill start (backend, before llama_decode for prompt) + * - t3: Prefill end (backend, after llama_decode returns) + * - t4: First token (component, first token callback) + * - t5: Last token (backend, decode loop exits) + * - t6: Request end (component, before complete callback) + * + * All timestamps are in milliseconds from a process-local epoch. + * Use rac_monotonic_now_ms() to get comparable timestamps. + * + * Note: t1 is intentionally skipped to match the specification. + */ +typedef struct rac_benchmark_timing { + /** t0: Request start - recorded at component API entry */ + int64_t t0_request_start_ms; + + /** t2: Prefill start - recorded before llama_decode for prompt batch */ + int64_t t2_prefill_start_ms; + + /** t3: Prefill end - recorded after llama_decode returns for prompt */ + int64_t t3_prefill_end_ms; + + /** t4: First token - recorded when first token callback is invoked */ + int64_t t4_first_token_ms; + + /** t5: Last token - recorded when decode loop exits */ + int64_t t5_last_token_ms; + + /** t6: Request end - recorded before complete callback */ + int64_t t6_request_end_ms; + + /** Number of tokens in the prompt */ + int32_t prompt_tokens; + + /** Number of tokens generated */ + int32_t output_tokens; + + /** + * Status of the benchmark request. 
+ * Uses RAC_BENCHMARK_STATUS_* codes: + * - RAC_BENCHMARK_STATUS_SUCCESS (0): Completed successfully + * - RAC_BENCHMARK_STATUS_ERROR (1): Failed + * - RAC_BENCHMARK_STATUS_TIMEOUT (2): Timed out + * - RAC_BENCHMARK_STATUS_CANCELLED (3): Cancelled + */ + int32_t status; + +} rac_benchmark_timing_t; + +// ============================================================================= +// BENCHMARK STATUS CODES +// ============================================================================= + +/** Benchmark request completed successfully */ +#define RAC_BENCHMARK_STATUS_SUCCESS ((int32_t)0) + +/** Benchmark request failed due to error */ +#define RAC_BENCHMARK_STATUS_ERROR ((int32_t)1) + +/** Benchmark request timed out */ +#define RAC_BENCHMARK_STATUS_TIMEOUT ((int32_t)2) + +/** Benchmark request was cancelled */ +#define RAC_BENCHMARK_STATUS_CANCELLED ((int32_t)3) + +// ============================================================================= +// MONOTONIC TIME API +// ============================================================================= + +/** + * Gets the current monotonic time in milliseconds. + * + * Uses std::chrono::steady_clock for accurate, monotonic timing that is not + * affected by system clock changes. The returned value is relative to a + * process-local epoch (the first call to this function). + * + * This function is thread-safe and lock-free on all supported platforms. + * + * @return Current monotonic time in milliseconds from process-local epoch + */ +RAC_API int64_t rac_monotonic_now_ms(void); + +// ============================================================================= +// UTILITY FUNCTIONS +// ============================================================================= + +/** + * Initializes a benchmark timing struct to zero values. 
+ * + * @param timing Pointer to timing struct to initialize + */ +RAC_API void rac_benchmark_timing_init(rac_benchmark_timing_t* timing); + +#ifdef __cplusplus +} +#endif + +#endif /* RAC_BENCHMARK_H */ diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h index 0947e60bb..82ef249d7 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h +++ b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_component.h @@ -13,6 +13,7 @@ #define RAC_LLM_COMPONENT_H #include "rac/core/capabilities/rac_lifecycle.h" +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_error.h" #include "rac/features/llm/rac_llm_types.h" @@ -196,6 +197,42 @@ RAC_API rac_result_t rac_llm_component_generate_stream( rac_llm_component_complete_callback_fn complete_callback, rac_llm_component_error_callback_fn error_callback, void* user_data); +/** + * @brief Generate text with streaming and benchmark timing + * + * Same as rac_llm_component_generate_stream but with optional benchmark timing. + * When timing_out is non-NULL, captures detailed timing information: + * - t0: Request start (set at API entry) + * - t4: First token (set in token callback) + * - t6: Request end (set before complete callback) + * + * Backend timestamps (t2, t3, t5) are captured by the backend if it supports timing. + * + * Zero overhead when timing_out is NULL - behaves exactly like generate_stream. + * + * @param handle Component handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param token_callback Called for each generated token + * @param complete_callback Called when generation completes + * @param error_callback Called on error + * @param user_data User context passed to callbacks + * @param timing_out Output: Benchmark timing struct, caller-allocated. + * Must remain valid for the duration of the call. 
+ * Caller should initialize via rac_benchmark_timing_init() before passing. + * Component fills t0/t4/t6, backend fills t2/t3/t5. + * On success, all timing fields are populated. + * On failure, status is set but timing fields may be partial. + * Pass NULL to skip timing (zero overhead). + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_component_generate_stream_with_timing( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + rac_llm_component_token_callback_fn token_callback, + rac_llm_component_complete_callback_fn complete_callback, + rac_llm_component_error_callback_fn error_callback, void* user_data, + rac_benchmark_timing_t* timing_out); + /** * @brief Get lifecycle state * diff --git a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h index 74720f0c5..9b9960db1 100644 --- a/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h +++ b/sdk/runanywhere-commons/include/rac/features/llm/rac_llm_service.h @@ -10,6 +10,7 @@ #ifndef RAC_LLM_SERVICE_H #define RAC_LLM_SERVICE_H +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_error.h" #include "rac/features/llm/rac_llm_types.h" @@ -38,6 +39,21 @@ typedef struct rac_llm_service_ops { const rac_llm_options_t* options, rac_llm_stream_callback_fn callback, void* user_data); + /** + * Generate text with streaming callback and benchmark timing. + * Optional: backends that don't support timing can leave this NULL. + * If NULL, rac_llm_generate_stream_with_timing falls back to generate_stream. 
+ * + * Backends that implement this should capture: + * - t2: Before prefill (llama_decode for prompt) + * - t3: After prefill completes + * - t5: When decode loop exits (last token) + */ + rac_result_t (*generate_stream_with_timing)(void* impl, const char* prompt, + const rac_llm_options_t* options, + rac_llm_stream_callback_fn callback, void* user_data, + rac_benchmark_timing_t* timing_out); + /** Get service info */ rac_result_t (*get_info)(void* impl, rac_llm_info_t* out_info); @@ -117,6 +133,32 @@ RAC_API rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* pr const rac_llm_options_t* options, rac_llm_stream_callback_fn callback, void* user_data); +/** + * @brief Stream generate text with benchmark timing + * + * Same as rac_llm_generate_stream but with optional benchmark timing. + * If timing_out is non-NULL and the backend supports timing, captures: + * - t2: Before prefill + * - t3: After prefill + * - t5: Last token generated + * + * If the backend doesn't implement generate_stream_with_timing, falls back + * to generate_stream (timing_out will have t2/t3/t5 as zeros). 
+ * + * @param handle Service handle + * @param prompt Input prompt + * @param options Generation options (can be NULL for defaults) + * @param callback Callback for each token + * @param user_data User context passed to callback + * @param timing_out Output: Benchmark timing (can be NULL for no timing) + * @return RAC_SUCCESS or error code + */ +RAC_API rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt, + const rac_llm_options_t* options, + rac_llm_stream_callback_fn callback, + void* user_data, + rac_benchmark_timing_t* timing_out); + /** * @brief Get service information * diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp index 27733806b..01da2b34c 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.cpp @@ -557,6 +557,159 @@ bool LlamaCppTextGeneration::generate_stream(const TextGenerationRequest& reques return !cancel_requested_.load(); } +bool LlamaCppTextGeneration::generate_stream_with_timing(const TextGenerationRequest& request, + TextStreamCallback callback, + int* out_prompt_tokens, + rac_benchmark_timing_t* timing_out) { + std::lock_guard lock(mutex_); + + if (!is_ready()) { + LOGE("Model not ready for generation"); + return false; + } + + cancel_requested_.store(false); + + std::string prompt = build_prompt(request); + LOGI("Generating with timing, prompt length: %zu", prompt.length()); + + const auto tokens_list = common_tokenize(context_, prompt, true, true); + + int n_ctx = llama_n_ctx(context_); + int prompt_tokens = static_cast(tokens_list.size()); + + if (out_prompt_tokens) { + *out_prompt_tokens = prompt_tokens; + } + + int available_tokens = n_ctx - prompt_tokens - 4; + + if (available_tokens <= 0) { + LOGE("Prompt too long: %d tokens, context size: %d", prompt_tokens, n_ctx); + return false; + } + + int 
effective_max_tokens = std::min(request.max_tokens, available_tokens); + if (effective_max_tokens < request.max_tokens) { + LOGI("Capping max_tokens: %d → %d (context=%d, prompt=%d tokens)", request.max_tokens, + effective_max_tokens, n_ctx, prompt_tokens); + } + LOGI("Generation with timing: prompt_tokens=%d, max_tokens=%d, context=%d", prompt_tokens, + effective_max_tokens, n_ctx); + + llama_batch batch = llama_batch_init(n_ctx, 0, 1); + + batch.n_tokens = 0; + for (size_t i = 0; i < tokens_list.size(); i++) { + common_batch_add(batch, tokens_list[i], i, {0}, false); + } + batch.logits[batch.n_tokens - 1] = true; + + // t2: Record prefill start (before llama_decode for prompt) + if (timing_out != nullptr) { + timing_out->t2_prefill_start_ms = rac_monotonic_now_ms(); + } + + if (llama_decode(context_, batch) != 0) { + LOGE("llama_decode failed for prompt"); + if (timing_out != nullptr) { + int64_t now = rac_monotonic_now_ms(); + timing_out->t3_prefill_end_ms = now; + timing_out->t5_last_token_ms = now; + } + llama_batch_free(batch); + return false; + } + + // t3: Record prefill end (after llama_decode returns) + if (timing_out != nullptr) { + timing_out->t3_prefill_end_ms = rac_monotonic_now_ms(); + } + + llama_sampler_reset(sampler_); + + const auto vocab = llama_model_get_vocab(model_); + std::string cached_token_chars; + std::string accumulated_text; + int n_cur = batch.n_tokens; + int tokens_generated = 0; + + while (tokens_generated < effective_max_tokens && !cancel_requested_.load()) { + const llama_token new_token_id = llama_sampler_sample(sampler_, context_, -1); + + llama_sampler_accept(sampler_, new_token_id); + + if (llama_vocab_is_eog(vocab, new_token_id)) { + LOGI("End of generation token received"); + break; + } + + auto new_token_chars = common_token_to_piece(context_, new_token_id); + cached_token_chars += new_token_chars; + accumulated_text += new_token_chars; + + static const std::vector<std::string> stop_sequences = { + "<|im_end|>", + "<|eot_id|>", + "</s>", + 
"<|end|>", + "<|endoftext|>", + "\n\nUser:", + "\n\nHuman:", + }; + + bool hit_stop_sequence = false; + for (const auto& stop_seq : stop_sequences) { + size_t pos = accumulated_text.find(stop_seq); + if (pos != std::string::npos) { + LOGI("Stop sequence detected: %s", stop_seq.c_str()); + hit_stop_sequence = true; + break; + } + } + + if (hit_stop_sequence) { + break; + } + + if (is_valid_utf8(cached_token_chars.c_str())) { + if (!callback(cached_token_chars)) { + LOGI("Generation cancelled by callback"); + cancel_requested_.store(true); + break; + } + cached_token_chars.clear(); + } + + batch.n_tokens = 0; + common_batch_add(batch, new_token_id, n_cur, {0}, true); + + n_cur++; + tokens_generated++; + + if (llama_decode(context_, batch) != 0) { + LOGE("llama_decode failed during generation"); + break; + } + } + + // t5: Record last token time (decode loop exit) + if (timing_out != nullptr) { + timing_out->t5_last_token_ms = rac_monotonic_now_ms(); + } + + if (!cached_token_chars.empty() && is_valid_utf8(cached_token_chars.c_str())) { + callback(cached_token_chars); + } + + llama_memory_clear(llama_get_memory(context_), true); + + llama_batch_free(batch); + + LOGI("Generation with timing complete: %d tokens", tokens_generated); + return !cancel_requested_.load(); +} + void LlamaCppTextGeneration::cancel() { cancel_requested_.store(true); LOGI("Generation cancel requested"); diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h index 2d8deb065..29fa4de20 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h +++ b/sdk/runanywhere-commons/src/backends/llamacpp/llamacpp_backend.h @@ -18,6 +18,8 @@ #include +#include "rac/core/rac_benchmark.h" + namespace runanywhere { // ============================================================================= @@ -117,6 +119,16 @@ class LlamaCppTextGeneration { } bool generate_stream(const TextGenerationRequest& 
request, TextStreamCallback callback, int* out_prompt_tokens); + + /** + * Generate text with streaming and benchmark timing. + * Captures t2 (prefill start), t3 (prefill end), t5 (last token). + * @param timing_out Benchmark timing struct (can be NULL for no timing) + */ + bool generate_stream_with_timing(const TextGenerationRequest& request, + TextStreamCallback callback, int* out_prompt_tokens, + rac_benchmark_timing_t* timing_out); + void cancel(); nlohmann::json get_model_info() const; diff --git a/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp index 5123ab923..5aeeee662 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_backend_llamacpp_register.cpp @@ -64,6 +64,18 @@ static rac_result_t llamacpp_vtable_generate_stream(void* impl, const char* prom &adapter); } +// Generate stream with benchmark timing +static rac_result_t llamacpp_vtable_generate_stream_with_timing(void* impl, const char* prompt, + const rac_llm_options_t* options, + rac_llm_stream_callback_fn callback, + void* user_data, + rac_benchmark_timing_t* timing_out) { + StreamAdapter adapter = {callback, user_data}; + return rac_llm_llamacpp_generate_stream_with_timing(impl, prompt, options, + stream_adapter_callback, &adapter, + timing_out); +} + // Get info static rac_result_t llamacpp_vtable_get_info(void* impl, rac_llm_info_t* out_info) { if (!out_info) @@ -114,6 +126,7 @@ static const rac_llm_service_ops_t g_llamacpp_ops = { .initialize = llamacpp_vtable_initialize, .generate = llamacpp_vtable_generate, .generate_stream = llamacpp_vtable_generate_stream, + .generate_stream_with_timing = llamacpp_vtable_generate_stream_with_timing, .get_info = llamacpp_vtable_get_info, .cancel = llamacpp_vtable_cancel, .cleanup = llamacpp_vtable_cleanup, diff --git 
a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp index cde9dc275..a65f40291 100644 --- a/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp +++ b/sdk/runanywhere-commons/src/backends/llamacpp/rac_llm_llamacpp.cpp @@ -225,6 +225,52 @@ rac_result_t rac_llm_llamacpp_generate_stream(rac_handle_t handle, const char* p return success ? RAC_SUCCESS : RAC_ERROR_INFERENCE_FAILED; } +rac_result_t rac_llm_llamacpp_generate_stream_with_timing(rac_handle_t handle, const char* prompt, + const rac_llm_options_t* options, + rac_llm_llamacpp_stream_callback_fn callback, + void* user_data, + rac_benchmark_timing_t* timing_out) { + if (handle == nullptr || prompt == nullptr || callback == nullptr) { + return RAC_ERROR_NULL_POINTER; + } + + auto* h = static_cast(handle); + if (!h->text_gen) { + return RAC_ERROR_INVALID_HANDLE; + } + + runanywhere::TextGenerationRequest request; + request.prompt = prompt; + if (options != nullptr) { + request.max_tokens = options->max_tokens; + request.temperature = options->temperature; + request.top_p = options->top_p; + if (options->stop_sequences != nullptr && options->num_stop_sequences > 0) { + for (int32_t i = 0; i < options->num_stop_sequences; i++) { + if (options->stop_sequences[i]) { + request.stop_sequences.push_back(options->stop_sequences[i]); + } + } + } + } + + // Stream using C++ class with timing + bool success = h->text_gen->generate_stream_with_timing( + request, + [callback, user_data](const std::string& token) -> bool { + return callback(token.c_str(), RAC_FALSE, user_data) == RAC_TRUE; + }, + nullptr, // out_prompt_tokens not needed, timing is captured internally + timing_out // Pass timing struct to backend + ); + + if (success) { + callback("", RAC_TRUE, user_data); // Final token + } + + return success ? 
 RAC_SUCCESS : RAC_ERROR_INFERENCE_FAILED; +} + void rac_llm_llamacpp_cancel(rac_handle_t handle) { if (handle == nullptr) { return; diff --git a/sdk/runanywhere-commons/src/core/rac_benchmark.cpp b/sdk/runanywhere-commons/src/core/rac_benchmark.cpp new file mode 100644 index 000000000..44f5840e1 --- /dev/null +++ b/sdk/runanywhere-commons/src/core/rac_benchmark.cpp @@ -0,0 +1,55 @@ +/** + * @file rac_benchmark.cpp + * @brief RunAnywhere Commons - Benchmark Timing Implementation + * + * Implements monotonic time helper and benchmark timing utilities. + * Uses std::chrono::steady_clock for accurate, cross-platform timing + * that is not affected by system clock adjustments. + */ + +#include "rac/core/rac_benchmark.h" + +#include <chrono> +#include <cstring> + +namespace { + +/** + * Process-local epoch for monotonic timing. + * Initialized on first call to rac_monotonic_now_ms(). + * Using a local epoch keeps timestamp values small and manageable. + */ +class MonotonicEpoch { + public: + static MonotonicEpoch& instance() { + static MonotonicEpoch epoch; + return epoch; + } + + int64_t elapsed_ms() const { + auto now = std::chrono::steady_clock::now(); + auto duration = now - epoch_; + return std::chrono::duration_cast<std::chrono::milliseconds>(duration).count(); + } + + private: + MonotonicEpoch() : epoch_(std::chrono::steady_clock::now()) {} + + std::chrono::steady_clock::time_point epoch_; +}; + +} // namespace + +extern "C" { + +int64_t rac_monotonic_now_ms(void) { + return MonotonicEpoch::instance().elapsed_ms(); +} + +void rac_benchmark_timing_init(rac_benchmark_timing_t* timing) { + if (timing != nullptr) { + std::memset(timing, 0, sizeof(rac_benchmark_timing_t)); + } +} + +} // extern "C" diff --git a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp index bbb7b51ac..7628b9459 100644 --- a/sdk/runanywhere-commons/src/features/llm/llm_component.cpp +++ b/sdk/runanywhere-commons/src/features/llm/llm_component.cpp @@ -17,6 +17,7 @@ 
#include "rac/core/capabilities/rac_lifecycle.h" #include "rac/core/rac_analytics_events.h" +#include "rac/core/rac_benchmark.h" #include "rac/core/rac_logger.h" #include "rac/core/rac_platform_adapter.h" #include "rac/core/rac_structured_error.h" @@ -459,6 +460,9 @@ struct llm_stream_context { float temperature; int32_t max_tokens; int32_t token_count; // Track tokens for streaming updates + + // Benchmark timing (optional, NULL when not benchmarking) + rac_benchmark_timing_t* timing_out; }; /** @@ -472,6 +476,11 @@ static rac_bool_t llm_stream_token_callback(const char* token, void* user_data) ctx->first_token_recorded = true; ctx->first_token_time = std::chrono::steady_clock::now(); + // Record t4 (first token) for benchmark timing + if (ctx->timing_out != nullptr) { + ctx->timing_out->t4_first_token_ms = rac_monotonic_now_ms(); + } + // Calculate TTFT auto ttft_duration = std::chrono::duration_cast( ctx->first_token_time - ctx->start_time); @@ -618,6 +627,7 @@ extern "C" rac_result_t rac_llm_component_generate_stream( ctx.temperature = effective_options->temperature; ctx.max_tokens = effective_options->max_tokens; ctx.token_count = 0; + ctx.timing_out = nullptr; // No benchmark timing for regular generate_stream // Perform streaming generation result = rac_llm_generate_stream(service, prompt, effective_options, llm_stream_token_callback, @@ -708,6 +718,231 @@ extern "C" rac_result_t rac_llm_component_generate_stream( return RAC_SUCCESS; } +extern "C" rac_result_t rac_llm_component_generate_stream_with_timing( + rac_handle_t handle, const char* prompt, const rac_llm_options_t* options, + rac_llm_component_token_callback_fn token_callback, + rac_llm_component_complete_callback_fn complete_callback, + rac_llm_component_error_callback_fn error_callback, void* user_data, + rac_benchmark_timing_t* timing_out) { + if (!handle) + return RAC_ERROR_INVALID_HANDLE; + if (!prompt) + return RAC_ERROR_INVALID_ARGUMENT; + + auto* component = reinterpret_cast(handle); + 
std::lock_guard lock(component->mtx); + + // Initialize timing if provided + if (timing_out != nullptr) { + rac_benchmark_timing_init(timing_out); + // Record t0 (request start) - first thing after validation + timing_out->t0_request_start_ms = rac_monotonic_now_ms(); + } + + // Generate unique ID for this generation + std::string generation_id = generate_unique_id(); + const char* model_id = rac_lifecycle_get_model_id(component->lifecycle); + const char* model_name = rac_lifecycle_get_model_name(component->lifecycle); + + // Get service from lifecycle manager + rac_handle_t service = nullptr; + rac_result_t result = rac_lifecycle_require_service(component->lifecycle, &service); + if (result != RAC_SUCCESS) { + log_error("LLM.Component", "No model loaded - cannot generate stream"); + + // Emit generation failed event + rac_analytics_event_data_t event = {}; + event.type = RAC_EVENT_LLM_GENERATION_FAILED; + event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT; + event.data.llm_generation.generation_id = generation_id.c_str(); + event.data.llm_generation.model_id = model_id; + event.data.llm_generation.model_name = model_name; + event.data.llm_generation.error_code = result; + event.data.llm_generation.error_message = "No model loaded"; + rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_FAILED, &event); + + if (timing_out != nullptr) { + timing_out->status = RAC_BENCHMARK_STATUS_ERROR; + } + + if (error_callback) { + error_callback(result, "No model loaded", user_data); + } + return result; + } + + // Check if streaming is supported + rac_llm_info_t info; + result = rac_llm_get_info(service, &info); + if (result != RAC_SUCCESS || (info.supports_streaming == 0)) { + log_error("LLM.Component", "Streaming not supported"); + + // Emit generation failed event + rac_analytics_event_data_t event = {}; + event.type = RAC_EVENT_LLM_GENERATION_FAILED; + event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT; + event.data.llm_generation.generation_id = 
generation_id.c_str(); + event.data.llm_generation.model_id = model_id; + event.data.llm_generation.model_name = model_name; + event.data.llm_generation.error_code = RAC_ERROR_NOT_SUPPORTED; + event.data.llm_generation.error_message = "Streaming not supported"; + rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_FAILED, &event); + + if (timing_out != nullptr) { + timing_out->status = RAC_BENCHMARK_STATUS_ERROR; + } + + if (error_callback) { + error_callback(RAC_ERROR_NOT_SUPPORTED, "Streaming not supported", user_data); + } + return RAC_ERROR_NOT_SUPPORTED; + } + + log_info("LLM.Component", "Starting streaming generation with timing"); + + // Get context_length from service info + int32_t context_length = info.context_length; + + // Use provided options or defaults + const rac_llm_options_t* effective_options = options ? options : &component->default_options; + + // Emit generation started event + { + rac_analytics_event_data_t event = {}; + event.type = RAC_EVENT_LLM_GENERATION_STARTED; + event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT; + event.data.llm_generation.generation_id = generation_id.c_str(); + event.data.llm_generation.model_id = model_id; + event.data.llm_generation.model_name = model_name; + event.data.llm_generation.is_streaming = RAC_TRUE; + event.data.llm_generation.framework = + static_cast(component->config.preferred_framework); + event.data.llm_generation.temperature = effective_options->temperature; + event.data.llm_generation.max_tokens = effective_options->max_tokens; + event.data.llm_generation.context_length = context_length; + rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_STARTED, &event); + } + + // Setup streaming context + llm_stream_context ctx; + ctx.token_callback = token_callback; + ctx.complete_callback = complete_callback; + ctx.error_callback = error_callback; + ctx.user_data = user_data; + ctx.start_time = std::chrono::steady_clock::now(); + ctx.first_token_recorded = false; + ctx.prompt_tokens = 
estimate_tokens(prompt);
+    ctx.generation_id = generation_id;
+    ctx.model_id = model_id;
+    ctx.model_name = model_name;
+    ctx.framework = static_cast(component->config.preferred_framework);
+    ctx.temperature = effective_options->temperature;
+    ctx.max_tokens = effective_options->max_tokens;
+    ctx.token_count = 0;
+    ctx.timing_out = timing_out;  // Pass timing for t4 capture in callback
+
+    // Perform streaming generation with timing
+    // Note: Backend timing (t2, t3, t5) will be captured if backend supports it
+    result = rac_llm_generate_stream_with_timing(service, prompt, effective_options,
+                                                 llm_stream_token_callback, &ctx, timing_out);
+
+    if (result != RAC_SUCCESS) {
+        log_error("LLM.Component", "Streaming generation failed");
+        rac_lifecycle_track_error(component->lifecycle, result, "generateStream");
+
+        // Emit generation failed event
+        rac_analytics_event_data_t event = {};
+        event.type = RAC_EVENT_LLM_GENERATION_FAILED;
+        event.data.llm_generation = RAC_ANALYTICS_LLM_GENERATION_DEFAULT;
+        event.data.llm_generation.generation_id = generation_id.c_str();
+        event.data.llm_generation.model_id = model_id;
+        event.data.llm_generation.model_name = model_name;
+        event.data.llm_generation.error_code = result;
+        event.data.llm_generation.error_message = "Streaming generation failed";
+        rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_FAILED, &event);
+
+        if (timing_out != nullptr) {
+            timing_out->status = RAC_BENCHMARK_STATUS_ERROR;
+        }
+
+        if (error_callback) {
+            error_callback(result, "Streaming generation failed", user_data);
+        }
+        return result;
+    }
+
+    // Build final result for completion callback
+    auto end_time = std::chrono::steady_clock::now();
+    auto total_duration =
+        std::chrono::duration_cast<std::chrono::milliseconds>(end_time - ctx.start_time);
+    int64_t total_time_ms = total_duration.count();
+
+    rac_llm_result_t final_result = {};
+    final_result.text = strdup(ctx.full_text.c_str());
+    final_result.prompt_tokens = ctx.prompt_tokens;
+    final_result.completion_tokens = estimate_tokens(ctx.full_text.c_str());
+    final_result.total_tokens = final_result.prompt_tokens + final_result.completion_tokens;
+    final_result.total_time_ms = total_time_ms;
+
+    double ttft_ms = 0.0;
+    // Calculate TTFT
+    if (ctx.first_token_recorded) {
+        auto ttft_duration = std::chrono::duration_cast<std::chrono::milliseconds>(
+            ctx.first_token_time - ctx.start_time);
+        final_result.time_to_first_token_ms = ttft_duration.count();
+        ttft_ms = static_cast<double>(ttft_duration.count());
+    }
+
+    // Calculate tokens per second
+    double tokens_per_second = 0.0;
+    if (final_result.total_time_ms > 0) {
+        tokens_per_second = static_cast<double>(final_result.completion_tokens) /
+                            (static_cast<double>(final_result.total_time_ms) / 1000.0);
+        final_result.tokens_per_second = static_cast(tokens_per_second);
+    }
+
+    // Record t6 (request end) before complete callback
+    if (timing_out != nullptr) {
+        timing_out->t6_request_end_ms = rac_monotonic_now_ms();
+        timing_out->prompt_tokens = final_result.prompt_tokens;
+        timing_out->output_tokens = final_result.completion_tokens;
+        timing_out->status = RAC_BENCHMARK_STATUS_SUCCESS;
+    }
+
+    if (complete_callback) {
+        complete_callback(&final_result, user_data);
+    }
+
+    // Emit generation completed event
+    {
+        rac_analytics_event_data_t event = {};
+        event.type = RAC_EVENT_LLM_GENERATION_COMPLETED;
+        event.data.llm_generation.generation_id = generation_id.c_str();
+        event.data.llm_generation.model_id = model_id;
+        event.data.llm_generation.model_name = model_name;
+        event.data.llm_generation.input_tokens = final_result.prompt_tokens;
+        event.data.llm_generation.output_tokens = final_result.completion_tokens;
+        event.data.llm_generation.duration_ms = static_cast(total_time_ms);
+        event.data.llm_generation.tokens_per_second = tokens_per_second;
+        event.data.llm_generation.is_streaming = RAC_TRUE;
+        event.data.llm_generation.time_to_first_token_ms = ttft_ms;
+        event.data.llm_generation.framework =
+            static_cast(component->config.preferred_framework);
+        event.data.llm_generation.temperature = effective_options->temperature;
+        event.data.llm_generation.max_tokens = effective_options->max_tokens;
+        event.data.llm_generation.context_length = context_length;
+        event.data.llm_generation.error_code = RAC_SUCCESS;
+        rac_analytics_event_emit(RAC_EVENT_LLM_GENERATION_COMPLETED, &event);
+    }
+
+    // Free the duplicated text
+    free(final_result.text);
+
+    log_info("LLM.Component", "Streaming generation with timing completed");
+
+    return RAC_SUCCESS;
+}
+
 extern "C" rac_result_t rac_llm_component_cancel(rac_handle_t handle) {
     if (!handle)
         return RAC_ERROR_INVALID_HANDLE;
diff --git a/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp b/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
index 14fe472b3..6867a4151 100644
--- a/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
+++ b/sdk/runanywhere-commons/src/features/llm/rac_llm_service.cpp
@@ -122,6 +122,33 @@ rac_result_t rac_llm_generate_stream(rac_handle_t handle, const char* prompt,
     return service->ops->generate_stream(service->impl, prompt, options, callback, user_data);
 }
 
+rac_result_t rac_llm_generate_stream_with_timing(rac_handle_t handle, const char* prompt,
+                                                 const rac_llm_options_t* options,
+                                                 rac_llm_stream_callback_fn callback,
+                                                 void* user_data,
+                                                 rac_benchmark_timing_t* timing_out) {
+    if (!handle || !prompt || !callback)
+        return RAC_ERROR_NULL_POINTER;
+
+    auto* service = static_cast(handle);
+    if (!service->ops) {
+        return RAC_ERROR_NOT_SUPPORTED;
+    }
+
+    // If backend implements timing-aware streaming, use it
+    if (service->ops->generate_stream_with_timing) {
+        return service->ops->generate_stream_with_timing(service->impl, prompt, options, callback,
+                                                         user_data, timing_out);
+    }
+
+    // Fallback to regular streaming (timing_out won't have t2/t3/t5)
+    if (service->ops->generate_stream) {
+        return service->ops->generate_stream(service->impl, prompt, options, callback, user_data);
+    }
+
+    return RAC_ERROR_NOT_SUPPORTED;
+}
+
 rac_result_t rac_llm_get_info(rac_handle_t handle, rac_llm_info_t* out_info) {
     if (!handle || !out_info)
         return RAC_ERROR_NULL_POINTER;
diff --git a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
index da2e337a4..a3b37d431 100644
--- a/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
+++ b/sdk/runanywhere-commons/src/jni/runanywhere_commons_jni.cpp
@@ -24,6 +24,7 @@
 // Include runanywhere-commons C API headers
 #include "rac/core/rac_analytics_events.h"
 #include "rac/core/rac_audio_utils.h"
+#include "rac/core/rac_benchmark.h"
 #include "rac/core/rac_core.h"
 #include "rac/core/rac_error.h"
 #include "rac/core/rac_logger.h"
@@ -981,6 +982,135 @@ Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerate
     return env->NewStringUTF(json.c_str());
 }
 
+// ========================================================================
+// STREAMING WITH KOTLIN CALLBACK AND BENCHMARK TIMING
+// ========================================================================
+
+JNIEXPORT jstring JNICALL
+Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentGenerateStreamWithTiming(
+    JNIEnv* env, jclass clazz, jlong handle, jstring prompt, jstring configJson,
+    jobject tokenCallback) {
+    LOGi("racLlmComponentGenerateStreamWithTiming called with handle=%lld", (long long)handle);
+
+    if (handle == 0) {
+        LOGe("racLlmComponentGenerateStreamWithTiming: invalid handle");
+        return nullptr;
+    }
+
+    if (!tokenCallback) {
+        LOGe("racLlmComponentGenerateStreamWithTiming: null callback");
+        return nullptr;
+    }
+
+    std::string promptStr = getCString(env, prompt);
+    LOGi("racLlmComponentGenerateStreamWithTiming prompt length=%zu", promptStr.length());
+
+    std::string configStorage;
+    const char* config = getNullableCString(env, configJson, configStorage);
+
+    // Get JVM and callback method
+    JavaVM* jvm = nullptr;
+    env->GetJavaVM(&jvm);
+
+    jclass callbackClass = env->GetObjectClass(tokenCallback);
+    jmethodID onTokenMethod = env->GetMethodID(callbackClass, "onToken", "(Ljava/lang/String;)Z");
+
+    if (!onTokenMethod) {
+        LOGe("racLlmComponentGenerateStreamWithTiming: could not find onToken method");
+        return nullptr;
+    }
+
+    // Create global ref to callback to ensure it survives across threads
+    jobject globalCallback = env->NewGlobalRef(tokenCallback);
+
+    // Parse config for options
+    rac_llm_options_t options = {};
+    options.max_tokens = 512;
+    options.temperature = 0.7f;
+    options.top_p = 1.0f;
+    options.streaming_enabled = RAC_TRUE;
+
+    // Create streaming callback context
+    LLMStreamCallbackContext ctx;
+    ctx.jvm = jvm;
+    ctx.callback = globalCallback;
+    ctx.onTokenMethod = onTokenMethod;
+
+    // Initialize benchmark timing struct
+    rac_benchmark_timing_t timing = {};
+    rac_benchmark_timing_init(&timing);
+
+    LOGi("racLlmComponentGenerateStreamWithTiming calling rac_llm_component_generate_stream_with_timing...");
+
+    rac_result_t status = rac_llm_component_generate_stream_with_timing(
+        reinterpret_cast<rac_handle_t>(handle), promptStr.c_str(), &options,
+        llm_stream_callback_token, llm_stream_callback_complete, llm_stream_callback_error, &ctx,
+        &timing);
+
+    // Clean up global ref
+    env->DeleteGlobalRef(globalCallback);
+
+    if (status != RAC_SUCCESS) {
+        LOGe("rac_llm_component_generate_stream_with_timing failed with status=%d", status);
+        return nullptr;
+    }
+
+    if (ctx.has_error) {
+        LOGe("Streaming with timing failed: %s", ctx.error_message.c_str());
+        return nullptr;
+    }
+
+    LOGi("racLlmComponentGenerateStreamWithTiming result text length=%zu, tokens=%d",
+         ctx.accumulated_text.length(), ctx.token_count);
+
+    // Build JSON result with timing
+    std::string json = "{";
+    json += "\"text\":\"";
+    for (char c : ctx.accumulated_text) {
+        switch (c) {
+            case '"':
+                json += "\\\"";
+                break;
+            case '\\':
+                json += "\\\\";
+                break;
+            case '\n':
+                json += "\\n";
+                break;
+            case '\r':
+                json += "\\r";
+                break;
+            case '\t':
+                json += "\\t";
+                break;
+            default:
+                json += c;
+                break;
+        }
+    }
+    json += "\",";
+    json += "\"tokens_generated\":" + std::to_string(ctx.final_result.completion_tokens) + ",";
+    json += "\"tokens_evaluated\":" + std::to_string(ctx.final_result.prompt_tokens) + ",";
+    json += "\"stop_reason\":" + std::to_string(0) + ",";
+    json += "\"total_time_ms\":" + std::to_string(ctx.final_result.total_time_ms) + ",";
+    json += "\"tokens_per_second\":" + std::to_string(ctx.final_result.tokens_per_second) + ",";
+    // Add benchmark timing fields
+    json += "\"t0_request_start_ms\":" + std::to_string(timing.t0_request_start_ms) + ",";
+    json += "\"t2_prefill_start_ms\":" + std::to_string(timing.t2_prefill_start_ms) + ",";
+    json += "\"t3_prefill_end_ms\":" + std::to_string(timing.t3_prefill_end_ms) + ",";
+    json += "\"t4_first_token_ms\":" + std::to_string(timing.t4_first_token_ms) + ",";
+    json += "\"t5_last_token_ms\":" + std::to_string(timing.t5_last_token_ms) + ",";
+    json += "\"t6_request_end_ms\":" + std::to_string(timing.t6_request_end_ms) + ",";
+    json += "\"prompt_tokens\":" + std::to_string(timing.prompt_tokens) + ",";
+    json += "\"output_tokens\":" + std::to_string(timing.output_tokens) + ",";
+    json += "\"benchmark_status\":" + std::to_string(timing.status);
+    json += "}";
+
+    LOGi("racLlmComponentGenerateStreamWithTiming returning JSON: %zu bytes", json.length());
+
+    return env->NewStringUTF(json.c_str());
+}
+
 JNIEXPORT void JNICALL
 Java_com_runanywhere_sdk_native_bridge_RunAnywhereBridge_racLlmComponentCancel(JNIEnv* env,
                                                                                jclass clazz,