From e8864b1dd698461616789757d4d18cce663cf846 Mon Sep 17 00:00:00 2001 From: Krzysztof Czuszynski Date: Sun, 7 Dec 2025 22:58:33 +0100 Subject: [PATCH 1/6] add fix to ensure finished transcript has whitespaces --- .../adk/models/gemini_llm_connection.py | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/google/adk/models/gemini_llm_connection.py b/src/google/adk/models/gemini_llm_connection.py index 55d4b62e96..ddf543a230 100644 --- a/src/google/adk/models/gemini_llm_connection.py +++ b/src/google/adk/models/gemini_llm_connection.py @@ -181,13 +181,16 @@ async def receive(self) -> AsyncGenerator[LlmResponse, None]: # generation_complete, causing transcription to appear after # tool_call in the session log. if message.server_content.input_transcription: - if message.server_content.input_transcription.text: - self._input_transcription_text += ( - message.server_content.input_transcription.text + if ( + new_input_transcription_chunk := message.server_content.input_transcription.text + ): + self._input_transcription_text = ( + f'{self._input_transcription_text} {new_input_transcription_chunk.strip()}' + .strip() ) yield LlmResponse( input_transcription=types.Transcription( - text=message.server_content.input_transcription.text, + text=new_input_transcription_chunk, finished=False, ), partial=True, @@ -204,13 +207,16 @@ async def receive(self) -> AsyncGenerator[LlmResponse, None]: ) self._input_transcription_text = '' if message.server_content.output_transcription: - if message.server_content.output_transcription.text: - self._output_transcription_text += ( - message.server_content.output_transcription.text + if ( + new_output_transcription_chunk := message.server_content.output_transcription.text + ): + self._output_transcription_text = ( + f'{self._output_transcription_text} {new_output_transcription_chunk.strip()}' + .strip() ) yield LlmResponse( output_transcription=types.Transcription( - 
text=message.server_content.output_transcription.text, + text=new_output_transcription_chunk, finished=False, ), partial=True, From 5901190c0fe206a6ed640f1e2bb3cd10ddc0532c Mon Sep 17 00:00:00 2001 From: Krzysztof Czuszynski Date: Sun, 7 Dec 2025 23:00:19 +0100 Subject: [PATCH 2/6] add unittest for final transcription whitespaces appearance --- .../models/test_gemini_llm_connection.py | 98 +++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/tests/unittests/models/test_gemini_llm_connection.py b/tests/unittests/models/test_gemini_llm_connection.py index 190007603c..5d93d7edb5 100644 --- a/tests/unittests/models/test_gemini_llm_connection.py +++ b/tests/unittests/models/test_gemini_llm_connection.py @@ -593,3 +593,101 @@ async def mock_receive_generator(): assert responses[2].output_transcription.text == 'How can I help?' assert responses[2].output_transcription.finished is True assert responses[2].partial is False + + +@pytest.mark.asyncio +@pytest.mark.parametrize('tx_direction', ['input', 'output']) +@pytest.mark.parametrize( + 'fragments', + [ + ('Hello', 'world'), + ('Hello', ' world'), + ('Hello ', 'world'), + ], +) +async def test_receive_final_transcription_space_between_fragments( + gemini_connection, mock_gemini_session, tx_direction, fragments +): + """Test receive final transcription fragments are joined with a space between words.""" + fragment1, fragment2 = fragments + + message1 = mock.Mock() + message1.usage_metadata = None + message1.server_content = mock.Mock() + message1.server_content.model_turn = None + message1.server_content.interrupted = False + message1.server_content.turn_complete = False + message1.server_content.generation_complete = False + message1.tool_call = None + message1.session_resumption_update = None + message1.server_content.input_transcription = ( + types.Transcription(text=fragment1, finished=False) + if tx_direction == 'input' + else None + ) + message1.server_content.output_transcription = ( + 
types.Transcription(text=fragment1, finished=False) + if tx_direction == 'output' + else None + ) + + message2 = mock.Mock() + message2.usage_metadata = None + message2.server_content = mock.Mock() + message2.server_content.model_turn = None + message2.server_content.interrupted = False + message2.server_content.turn_complete = False + message2.server_content.generation_complete = False + message2.tool_call = None + message2.session_resumption_update = None + message2.server_content.input_transcription = ( + types.Transcription(text=fragment2, finished=False) + if tx_direction == 'input' + else None + ) + message2.server_content.output_transcription = ( + types.Transcription(text=fragment2, finished=False) + if tx_direction == 'output' + else None + ) + + message3 = mock.Mock() + message3.usage_metadata = None + message3.server_content = mock.Mock() + message3.server_content.model_turn = None + message3.server_content.interrupted = False + message3.server_content.turn_complete = False + message3.server_content.generation_complete = False + message3.tool_call = None + message3.session_resumption_update = None + message3.server_content.input_transcription = ( + types.Transcription(text=None, finished=True) + if tx_direction == 'input' + else None + ) + message3.server_content.output_transcription = ( + types.Transcription(text=None, finished=True) + if tx_direction == 'output' + else None + ) + + async def mock_receive_generator(): + yield message1 + yield message2 + yield message3 + + receive_mock = mock.Mock(return_value=mock_receive_generator()) + mock_gemini_session.receive = receive_mock + + responses = [resp async for resp in gemini_connection.receive()] + + # find the finished transcription response + attr_name = f'{tx_direction}_transcription' + finished_resps = [ + r + for r in responses + if getattr(r, attr_name) and getattr(r, attr_name).finished + ] + assert finished_resps, 'Expected finished transcription response' + transcription = 
getattr(finished_resps[0], attr_name) + assert transcription.text == 'Hello world' From ce4e26914eaa172880516e52320b8b54d1782e7b Mon Sep 17 00:00:00 2001 From: Krzysztof Czuszynski Date: Mon, 8 Dec 2025 23:51:49 +0100 Subject: [PATCH 3/6] update test cases --- tests/unittests/models/test_gemini_llm_connection.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/unittests/models/test_gemini_llm_connection.py b/tests/unittests/models/test_gemini_llm_connection.py index 5d93d7edb5..83144f8cac 100644 --- a/tests/unittests/models/test_gemini_llm_connection.py +++ b/tests/unittests/models/test_gemini_llm_connection.py @@ -600,9 +600,11 @@ async def mock_receive_generator(): @pytest.mark.parametrize( 'fragments', [ - ('Hello', 'world'), - ('Hello', ' world'), - ('Hello ', 'world'), + ('That', "'s great"), + ("That'", 's great'), + ("That's", 'great'), + ("That's", ' great'), + ("That's ", 'great'), ], ) async def test_receive_final_transcription_space_between_fragments( @@ -690,4 +692,4 @@ async def mock_receive_generator(): ] assert finished_resps, 'Expected finished transcription response' transcription = getattr(finished_resps[0], attr_name) - assert transcription.text == 'Hello world' + assert transcription.text == "That's great" From 38cc1c26a3f8b61ea7137923cb609be26d6c5d10 Mon Sep 17 00:00:00 2001 From: Krzysztof Czuszynski Date: Tue, 9 Dec 2025 00:01:05 +0100 Subject: [PATCH 4/6] fix fragments stitch respecting punctuation marks --- .../adk/models/gemini_llm_connection.py | 34 +++++++++++++++---- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/src/google/adk/models/gemini_llm_connection.py b/src/google/adk/models/gemini_llm_connection.py index ddf543a230..065b359f4c 100644 --- a/src/google/adk/models/gemini_llm_connection.py +++ b/src/google/adk/models/gemini_llm_connection.py @@ -30,6 +30,8 @@ RealtimeInput = Union[types.Blob, types.ActivityStart, types.ActivityEnd] from typing import TYPE_CHECKING +PUNCTUATION_CHARS = 
{'.', ',', '!', '?', ';', ':', "'", '"', ')', ']', '}', '(', '[', '{'} + if TYPE_CHECKING: from google.genai import live @@ -184,10 +186,20 @@ async def receive(self) -> AsyncGenerator[LlmResponse, None]: if ( new_input_transcription_chunk := message.server_content.input_transcription.text ): - self._input_transcription_text = ( - f'{self._input_transcription_text} {new_input_transcription_chunk.strip()}' - .strip() + existing = self._input_transcription_text + # Insert a space only when there is existing text and neither + # the new chunk starts with punctuation nor the existing text + # ends with punctuation. + conditional_space = ( + ' ' + if existing + and not ( + new_input_transcription_chunk[0] in PUNCTUATION_CHARS + or existing[-1] in PUNCTUATION_CHARS + ) + else '' ) + self._input_transcription_text = f'{existing}{conditional_space}{new_input_transcription_chunk.strip()}'.strip() yield LlmResponse( input_transcription=types.Transcription( text=new_input_transcription_chunk, @@ -210,10 +222,20 @@ async def receive(self) -> AsyncGenerator[LlmResponse, None]: if ( new_output_transcription_chunk := message.server_content.output_transcription.text ): - self._output_transcription_text = ( - f'{self._output_transcription_text} {new_output_transcription_chunk.strip()}' - .strip() + existing = self._output_transcription_text + # Insert a space only when there is existing text and neither + # the new chunk starts with punctuation nor the existing text + # ends with punctuation. 
+ conditional_space = ( + ' ' + if existing + and not ( + new_output_transcription_chunk[0] in PUNCTUATION_CHARS + or existing[-1] in PUNCTUATION_CHARS + ) + else '' ) + self._output_transcription_text = f'{existing}{conditional_space}{new_output_transcription_chunk.strip()}'.strip() yield LlmResponse( output_transcription=types.Transcription( text=new_output_transcription_chunk, From c0f9d623e3519f97f5bc537bf7af0d60881d9b54 Mon Sep 17 00:00:00 2001 From: Krzysztof Czuszynski Date: Tue, 9 Dec 2025 00:30:56 +0100 Subject: [PATCH 5/6] add more test cases --- .../models/test_gemini_llm_connection.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/tests/unittests/models/test_gemini_llm_connection.py b/tests/unittests/models/test_gemini_llm_connection.py index 83144f8cac..1a6460b3a5 100644 --- a/tests/unittests/models/test_gemini_llm_connection.py +++ b/tests/unittests/models/test_gemini_llm_connection.py @@ -600,18 +600,24 @@ async def mock_receive_generator(): @pytest.mark.parametrize( 'fragments', [ - ('That', "'s great"), - ("That'", 's great'), - ("That's", 'great'), - ("That's", ' great'), - ("That's ", 'great'), + ('That', "'s great", "That's great"), + ("That'", 's great', "That's great"), + ("That's", 'great', "That's great"), + ("That's", ' great', "That's great"), + ("That's ", 'great', "That's great"), + ("Great", '! Good to hear', 'Great! Good to hear'), + ("Great!", 'Good to hear', 'Great! Good to hear'), + ("Great! ", 'Good to hear', 'Great! Good to hear'), + ("Great! Good", 'to hear', 'Great! Good to hear'), + ("Great! Good ", 'to hear', 'Great! Good to hear'), + ("Great! Good", ' to hear', 'Great! 
Good to hear'), ], ) async def test_receive_final_transcription_space_between_fragments( gemini_connection, mock_gemini_session, tx_direction, fragments ): """Test receive final transcription fragments are joined with a space between words.""" - fragment1, fragment2 = fragments + fragment1, fragment2, expected = fragments message1 = mock.Mock() message1.usage_metadata = None @@ -692,4 +698,4 @@ async def mock_receive_generator(): ] assert finished_resps, 'Expected finished transcription response' transcription = getattr(finished_resps[0], attr_name) - assert transcription.text == "That's great" + assert transcription.text == expected From ed863a3e3a44248d23fd27e458f710c3ad1032c1 Mon Sep 17 00:00:00 2001 From: Krzysztof Czuszynski Date: Tue, 9 Dec 2025 00:31:11 +0100 Subject: [PATCH 6/6] update logic of conditional_space --- .../adk/models/gemini_llm_connection.py | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/src/google/adk/models/gemini_llm_connection.py b/src/google/adk/models/gemini_llm_connection.py index 065b359f4c..da0b3cc895 100644 --- a/src/google/adk/models/gemini_llm_connection.py +++ b/src/google/adk/models/gemini_llm_connection.py @@ -30,7 +30,7 @@ RealtimeInput = Union[types.Blob, types.ActivityStart, types.ActivityEnd] from typing import TYPE_CHECKING -PUNCTUATION_CHARS = {'.', ',', '!', '?', ';', ':', "'", '"', ')', ']', '}', '(', '[', '{'} +PUNCTUATION_CHARS = {'.', '!', '?', ';', ':', "'"} if TYPE_CHECKING: from google.genai import live @@ -187,17 +187,18 @@ async def receive(self) -> AsyncGenerator[LlmResponse, None]: new_input_transcription_chunk := message.server_content.input_transcription.text ): existing = self._input_transcription_text - # Insert a space only when there is existing text and neither - # the new chunk starts with punctuation nor the existing text - # ends with punctuation. 
+ # Insert a space when joining fragments except when the new + # chunk starts with a punctuation character that should attach + # to the previous token, or the existing text ends with an + # apostrophe. conditional_space = ( - ' ' - if existing - and not ( - new_input_transcription_chunk[0] in PUNCTUATION_CHARS - or existing[-1] in PUNCTUATION_CHARS - ) - else '' + ' ' + if existing + and not ( + new_input_transcription_chunk[0] in PUNCTUATION_CHARS + or existing.endswith("'") + ) + else '' ) self._input_transcription_text = f'{existing}{conditional_space}{new_input_transcription_chunk.strip()}'.strip() yield LlmResponse( @@ -223,17 +224,18 @@ async def receive(self) -> AsyncGenerator[LlmResponse, None]: new_output_transcription_chunk := message.server_content.output_transcription.text ): existing = self._output_transcription_text - # Insert a space only when there is existing text and neither - # the new chunk starts with punctuation nor the existing text - # ends with punctuation. + # Insert a space when joining fragments except when the new + # chunk starts with a punctuation character that should attach + # to the previous token, or the existing text ends with an + # apostrophe. conditional_space = ( - ' ' - if existing - and not ( - new_output_transcription_chunk[0] in PUNCTUATION_CHARS - or existing[-1] in PUNCTUATION_CHARS - ) - else '' + ' ' + if existing + and not ( + new_output_transcription_chunk[0] in PUNCTUATION_CHARS + or existing.endswith("'") + ) + else '' ) self._output_transcription_text = f'{existing}{conditional_space}{new_output_transcription_chunk.strip()}'.strip() yield LlmResponse(