 DEFAULT_ENABLE_CACHE_STATISTICS = False
 
 
+def _get_audio_transcription_from_session(
+    invocation_context: InvocationContext,
+) -> list[types.Content]:
+  """Gets audio and transcription content from session events.
+
+  Collects audio file references and transcription text from session events
+  to reconstruct the conversation history, including multimodal content.
+
+  Args:
+    invocation_context: The invocation context containing session data.
+
+  Returns:
+    A list of Content objects containing audio files and transcriptions.
+  """
+  contents = []
+
+  for event in invocation_context.session.events:
+    # Collect transcription text from events, preserving speaker roles.
+    if hasattr(event, 'input_transcription') and event.input_transcription:
+      contents.append(
+          types.Content(
+              role='user',
+              parts=[types.Part.from_text(text=event.input_transcription.text)],
+          )
+      )
+
+    if hasattr(event, 'output_transcription') and event.output_transcription:
+      contents.append(
+          types.Content(
+              role='model',
+              parts=[
+                  types.Part.from_text(text=event.output_transcription.text)
+              ],
+          )
+      )
+  return contents
+
+
 class BaseLlmFlow(ABC):
   """A basic flow that calls the LLM in a loop until a final response is generated.
 
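The new `_get_audio_transcription_from_session` helper only requires events that expose `input_transcription` / `output_transcription` objects with a `.text` field. A minimal usage sketch with hypothetical stand-in objects (real events come from `invocation_context.session.events`; none of the `Fake*` types below exist in the codebase):

```python
# Sketch only: dataclass stand-ins for the session objects the helper
# reads. Only the attributes it touches are modeled.
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class FakeTranscription:
  text: str


@dataclass
class FakeEvent:
  input_transcription: Optional[FakeTranscription] = None
  output_transcription: Optional[FakeTranscription] = None


@dataclass
class FakeSession:
  events: list = field(default_factory=list)


@dataclass
class FakeContext:
  session: FakeSession = field(default_factory=FakeSession)


ctx = FakeContext(
    session=FakeSession(
        events=[
            FakeEvent(input_transcription=FakeTranscription('What is ADK?')),
            FakeEvent(output_transcription=FakeTranscription('An agent kit.')),
        ]
    )
)

# One user Content for the input transcription, one model Content for
# the output transcription, in event order.
contents = _get_audio_transcription_from_session(ctx)
assert [c.role for c in contents] == ['user', 'model']
```

Events carrying neither transcription are skipped, so the returned list preserves only the spoken-turn history.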
@@ -129,25 +165,12 @@ async def run_live(
       if llm_request.contents:
         # Sends the conversation history to the model.
         with tracer.start_as_current_span('send_data'):
-          if invocation_context.transcription_cache:
-            from . import audio_transcriber
-
-            audio_transcriber = audio_transcriber.AudioTranscriber(
-                init_client=True
-                if invocation_context.run_config.input_audio_transcription
-                is None
-                else False
-            )
-            contents = audio_transcriber.transcribe_file(invocation_context)
-            logger.debug('Sending history to model: %s', contents)
-            await llm_connection.send_history(contents)
-            invocation_context.transcription_cache = None
-            trace_send_data(invocation_context, event_id, contents)
-          else:
-            await llm_connection.send_history(llm_request.contents)
-            trace_send_data(
-                invocation_context, event_id, llm_request.contents
-            )
+          # llm_request.contents already carries the audio/transcription
+          # history reconstructed from the session, so send it as-is.
+          logger.debug('Sending history to model: %s', llm_request.contents)
+          await llm_connection.send_history(llm_request.contents)
+          trace_send_data(
+              invocation_context, event_id, llm_request.contents
+          )
 
       send_task = asyncio.create_task(
           self._send_to_model(llm_connection, invocation_context)
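With the `transcription_cache` branch gone, the initial send no longer transcribes audio on the fly; it trusts `llm_request.contents`. If a caller needed to fold the session transcriptions in itself, a hedged sketch (assuming the request processors have not already done this merge) could look like:

```python
# Sketch only: merge reconstructed transcription history into the
# request contents before the initial send_history call.
contents = list(llm_request.contents or [])
contents.extend(_get_audio_transcription_from_session(invocation_context))
await llm_connection.send_history(contents)
```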
@@ -324,22 +347,6 @@ def get_author_for_event(llm_response):
           author=get_author_for_event(llm_response),
       )
 
-      # Handle transcription events ONCE per llm_response, outside the event loop
-      if llm_response.input_transcription:
-        await self.transcription_manager.handle_input_transcription(
-            invocation_context, llm_response.input_transcription
-        )
-
-      if llm_response.output_transcription:
-        await self.transcription_manager.handle_output_transcription(
-            invocation_context, llm_response.output_transcription
-        )
-
-      # Flush audio caches based on control events using configurable settings
-      await self._handle_control_event_flush(
-          invocation_context, llm_response
-      )
-
       async with Aclosing(
           self._postprocess_live(
               invocation_context,
@@ -349,28 +356,11 @@ def get_author_for_event(llm_response):
           )
       ) as agen:
         async for event in agen:
-          if (
-              event.content
-              and event.content.parts
-              and event.content.parts[0].inline_data is None
-              and not event.partial
-          ):
-            # This can be either user data or transcription data.
-            # when output transcription enabled, it will contain model's
-            # transcription.
-            # when input transcription enabled, it will contain user
-            # transcription.
-            if not invocation_context.transcription_cache:
-              invocation_context.transcription_cache = []
-            invocation_context.transcription_cache.append(
-                TranscriptionEntry(
-                    role=event.content.role, data=event.content
-                )
-            )
           # Cache output audio chunks from model responses
           # TODO: support video data
           if (
-              event.content
+              invocation_context.run_config.save_live_audio
+              and event.content
               and event.content.parts
               and event.content.parts[0].inline_data
               and event.content.parts[0].inline_data.mime_type.startswith(
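Inline audio caching is now opt-in via the run config. A sketch of enabling it (keyword construction is an assumption about `RunConfig`'s interface; the diff only confirms that the flow reads `run_config.save_live_audio`):

```python
# Assumed RunConfig interface; only the save_live_audio attribute is
# confirmed by this change.
run_config = RunConfig(save_live_audio=True)
```

When the flag is unset, model audio chunks pass through without being cached, and the cache-flush path below is skipped entirely.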
@@ -578,6 +568,36 @@ async def _postprocess_live(
     ):
       return
 
+    # Handle transcription events once per llm_response, outside the event loop.
+    if llm_response.input_transcription:
+      input_transcription_event = (
+          await self.transcription_manager.handle_input_transcription(
+              invocation_context, llm_response.input_transcription
+          )
+      )
+      yield input_transcription_event
+      return
+
+    if llm_response.output_transcription:
+      output_transcription_event = (
+          await self.transcription_manager.handle_output_transcription(
+              invocation_context, llm_response.output_transcription
+          )
+      )
+      yield output_transcription_event
+      return
+
+    # Flush audio caches based on control events, using configurable settings.
+    if invocation_context.run_config.save_live_audio:
+      flush_event = await self._handle_control_event_flush(
+          invocation_context, llm_response
+      )
+      if flush_event:
+        yield flush_event
+        return
+
     # Builds the event.
     model_response_event = self._finalize_model_response_event(
         llm_request, llm_response, model_response_event
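Note the control flow above: each transcription response yields exactly one event and then returns, so `_finalize_model_response_event` never runs for transcription-only responses. A stripped-down sketch of that short-circuit (abbreviated signature; names mirror the diff):

```python
# Sketch of the yield-then-return short-circuit for transcription
# responses inside an async generator.
async def _postprocess_live_sketch(self, invocation_context, llm_response):
  if llm_response.input_transcription:
    yield await self.transcription_manager.handle_input_transcription(
        invocation_context, llm_response.input_transcription
    )
    return  # stop: no model-response event for a transcription chunk
  # ...otherwise fall through to normal model-response finalization.
```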
@@ -877,33 +897,34 @@ async def _handle_control_event_flush(
       invocation_context: The invocation context containing audio caches.
       llm_response: The LLM response containing control event information.
+
+    Returns:
+      The event produced by flushing the caches, if any; otherwise None.
     """
+
+    # Log cache statistics if enabled.
+    if DEFAULT_ENABLE_CACHE_STATISTICS:
+      stats = self.audio_cache_manager.get_cache_stats(invocation_context)
+      logger.debug('Audio cache stats: %s', stats)
+
     if llm_response.interrupted:
       # The user interrupted, so the model will stop; flush the model audio.
-      await self.audio_cache_manager.flush_caches(
+      return await self.audio_cache_manager.flush_caches(
           invocation_context,
           flush_user_audio=False,
           flush_model_audio=True,
       )
     elif llm_response.turn_complete:
       # The turn is complete, so flush both user and model audio.
-      await self.audio_cache_manager.flush_caches(
+      return await self.audio_cache_manager.flush_caches(
           invocation_context,
           flush_user_audio=True,
           flush_model_audio=True,
       )
     elif getattr(llm_response, 'generation_complete', False):
       # Model generation is complete, so flush the model audio.
-      await self.audio_cache_manager.flush_caches(
+      return await self.audio_cache_manager.flush_caches(
           invocation_context,
           flush_user_audio=False,
           flush_model_audio=True,
       )
 
-    # Log cache statistics if enabled
-    if DEFAULT_ENABLE_CACHE_STATISTICS:
-      stats = self.audio_cache_manager.get_cache_stats(invocation_context)
-      logger.debug('Audio cache stats: %s', stats)
-
   async def _run_and_handle_error(
       self,
       response_generator: AsyncGenerator[LlmResponse, None],
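For reference, the flush policy the branches above implement, condensed into a lookup constant (a reading aid only, not part of the change; the branches are tested in this order):

```python
# Reading aid: (flush_user_audio, flush_model_audio) per control event.
FLUSH_POLICY = (
    ('interrupted', (False, True)),          # barge-in: flush cached model audio
    ('turn_complete', (True, True)),         # turn done: flush both sides
    ('generation_complete', (False, True)),  # model finished: flush model audio
)
```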