     LLMConfig,
     LLMConfigValue,
     LLMError,
+    LLMResponse,
     Message,
     MessageRole,
     OutOfTokensOrSymbolsError,
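The `LLMResponse` type added to this import block is not defined anywhere in the hunks below. A minimal sketch of the shape the rest of the diff appears to rely on, assuming it is a pydantic model; the field defaults are a guess:

# Assumed shape of the imported LLMResponse (not shown in this diff):
# a pydantic model pairing answer text with optional reasoning text.
# Field optionality is an assumption based on how the streaming path uses it.
import pydantic


class LLMResponse(pydantic.BaseModel):
    content: str | None = None
    reasoning_content: str | None = None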
@@ -76,6 +77,7 @@ class ChatCompletionsRequest(pydantic.BaseModel):
 class OneStreamingChoiceDelta(pydantic.BaseModel):
     role: typing.Literal[MessageRole.assistant] | None = None
     content: str | None = None
+    reasoning_content: str | None = None


 class OneStreamingChoice(pydantic.BaseModel):
@@ -89,6 +91,7 @@ class ChatCompletionsStreamingEvent(pydantic.BaseModel):
 class OneNotStreamingChoiceMessage(pydantic.BaseModel):
     role: MessageRole
     content: str
+    reasoning_content: str | None = None


 class OneNotStreamingChoice(pydantic.BaseModel):
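Both the streaming delta and the non-streaming message now carry an optional `reasoning_content` field alongside `content`. A small hypothetical construction showing how the updated non-streaming message model holds both fields; the example values are made up:

# Hypothetical example; only the field names come from the models in this diff.
message = OneNotStreamingChoiceMessage(
    role=MessageRole.assistant,
    content="The sky appears blue because of Rayleigh scattering.",
    reasoning_content="The user asks about sky color; recall how short wavelengths scatter...",
)
assert message.reasoning_content is not None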
@@ -143,14 +146,16 @@ def _make_user_assistant_alternate_messages(
         else:
             if current_message_content_chunks:
                 yield ChatCompletionsInputMessage(
-                    role=current_message_role, content=_merge_content_chunks(current_message_content_chunks)
+                    role=current_message_role,
+                    content=_merge_content_chunks(current_message_content_chunks),
                 )
             current_message_content_chunks = [one_message.content]
             current_message_role = one_message.role

     if current_message_content_chunks:
         yield ChatCompletionsInputMessage(
-            role=current_message_role, content=_merge_content_chunks(current_message_content_chunks)
+            role=current_message_role,
+            content=_merge_content_chunks(current_message_content_chunks),
         )

@@ -195,7 +200,12 @@ def _prepare_messages(self, messages: str | list[Message]) -> list[ChatCompletio
         )

     def _prepare_payload(
-        self, *, messages: str | list[Message], temperature: float, stream: bool, extra: dict[str, typing.Any] | None
+        self,
+        *,
+        messages: str | list[Message],
+        temperature: float,
+        stream: bool,
+        extra: dict[str, typing.Any] | None,
     ) -> dict[str, typing.Any]:
         return ChatCompletionsRequest(
             stream=stream,
@@ -211,9 +221,12 @@ async def request_llm_message(
         *,
         temperature: float = LLMConfigValue(attr="temperature"),
         extra: dict[str, typing.Any] | None = None,
-    ) -> str:
+    ) -> LLMResponse:
         payload: typing.Final = self._prepare_payload(
-            messages=messages, temperature=temperature, stream=False, extra=extra
+            messages=messages,
+            temperature=temperature,
+            stream=False,
+            extra=extra,
         )
         try:
             response: typing.Final = await make_http_request(
@@ -224,18 +237,27 @@ async def request_llm_message(
         except httpx.HTTPStatusError as exception:
             _handle_status_error(status_code=exception.response.status_code, content=exception.response.content)
         try:
-            return ChatCompletionsNotStreamingResponse.model_validate_json(response.content).choices[0].message.content
+            validated_message_model: typing.Final = (
+                ChatCompletionsNotStreamingResponse.model_validate_json(response.content).choices[0].message
+            )
+            return LLMResponse(
+                content=validated_message_model.content,
+                reasoning_content=validated_message_model.reasoning_content,
+            )
         finally:
             await response.aclose()

-    async def _iter_response_chunks(self, response: httpx.Response) -> typing.AsyncIterable[str]:
+    async def _iter_response_chunks(self, response: httpx.Response) -> typing.AsyncIterable[LLMResponse]:
         async for event in httpx_sse.EventSource(response).aiter_sse():
             if event.data == "[DONE]":
                 break
             validated_response = ChatCompletionsStreamingEvent.model_validate_json(event.data)
-            if not (one_chunk := validated_response.choices[0].delta.content):
+            if not (
+                (validated_delta := validated_response.choices[0].delta)
+                and (validated_delta.content or validated_delta.reasoning_content)
+            ):
                 continue
-            yield one_chunk
+            yield LLMResponse(content=validated_delta.content, reasoning_content=validated_delta.reasoning_content)

     @contextlib.asynccontextmanager
     async def stream_llm_message_chunks(
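With these hunks, `request_llm_message` returns an `LLMResponse` instead of a bare string, and `_iter_response_chunks` keeps deltas that carry only reasoning text. A hedged sketch of how a caller might adapt, assuming `client` is an instance of this client class (the class and example arguments are not named in the diff):

# Hypothetical caller update: the result is no longer a str, so read its fields.
response = await client.request_llm_message("Why is the sky blue?", temperature=0.3)
if response.reasoning_content:
    print("reasoning:", response.reasoning_content)
print("answer:", response.content)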
@@ -244,9 +266,12 @@ async def stream_llm_message_chunks(
         *,
         temperature: float = LLMConfigValue(attr="temperature"),
         extra: dict[str, typing.Any] | None = None,
-    ) -> typing.AsyncIterator[typing.AsyncIterable[str]]:
+    ) -> typing.AsyncIterator[typing.AsyncIterable[LLMResponse]]:
         payload: typing.Final = self._prepare_payload(
-            messages=messages, temperature=temperature, stream=True, extra=extra
+            messages=messages,
+            temperature=temperature,
+            stream=True,
+            extra=extra,
         )
         try:
             async with make_streaming_http_request(
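The streaming variant changes in the same way: each yielded chunk is now an `LLMResponse`, so reasoning deltas and answer deltas arrive as separate fields. A hypothetical consumer, again assuming `client` is an instance of this class:

# Hypothetical streaming consumer; checks both fields because a delta may carry
# only reasoning_content or only content (deltas with neither are filtered out upstream).
async with client.stream_llm_message_chunks("Why is the sky blue?") as chunks:
    async for chunk in chunks:
        if chunk.reasoning_content:
            print(chunk.reasoning_content, end="")
        if chunk.content:
            print(chunk.content, end="")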