diff --git a/comps/llms/text-generation/tgi/llm.py b/comps/llms/text-generation/tgi/llm.py
index a825a5d2a0..dcd926f612 100644
--- a/comps/llms/text-generation/tgi/llm.py
+++ b/comps/llms/text-generation/tgi/llm.py
@@ -112,11 +112,12 @@ async def stream_generator():
                 chat_response = ""
                 async for text in text_generation:
                     stream_gen_time.append(time.time() - start)
-                    chat_response += text
-                    chunk_repr = repr(text.encode("utf-8"))
-                    if logflag:
-                        logger.info(f"[ SearchedDoc ] chunk:{chunk_repr}")
-                    yield f"data: {chunk_repr}\n\n"
+                    if text not in ["<|im_end|>", "<|endoftext|>"]:
+                        chat_response += text
+                        chunk_repr = repr(text.encode("utf-8"))
+                        if logflag:
+                            logger.info(f"[ SearchedDoc ] chunk:{chunk_repr}")
+                        yield f"data: {chunk_repr}\n\n"
                 if logflag:
                     logger.info(f"[ SearchedDoc ] stream response: {chat_response}")
                 statistics_dict["opea_service@llm_tgi"].append_latency(stream_gen_time[-1], stream_gen_time[0])
@@ -162,11 +163,12 @@ async def stream_generator():
                 chat_response = ""
                 async for text in text_generation:
                     stream_gen_time.append(time.time() - start)
-                    chat_response += text
-                    chunk_repr = repr(text.encode("utf-8"))
-                    if logflag:
-                        logger.info(f"[ LLMParamsDoc ] chunk:{chunk_repr}")
-                    yield f"data: {chunk_repr}\n\n"
+                    if text not in ["<|im_end|>", "<|endoftext|>"]:
+                        chat_response += text
+                        chunk_repr = repr(text.encode("utf-8"))
+                        if logflag:
+                            logger.info(f"[ LLMParamsDoc ] chunk:{chunk_repr}")
+                        yield f"data: {chunk_repr}\n\n"
                 if logflag:
                     logger.info(f"[ LLMParamsDoc ] stream response: {chat_response}")
                 statistics_dict["opea_service@llm_tgi"].append_latency(stream_gen_time[-1], stream_gen_time[0])
@@ -271,7 +273,9 @@ def stream_generator():
                 for c in chat_completion:
                     if logflag:
                         logger.info(c)
-                    yield f"data: {c.model_dump_json()}\n\n"
+                    chunk = c.model_dump_json()
+                    if chunk not in ["<|im_end|>", "<|endoftext|>"]:
+                        yield f"data: {chunk}\n\n"
                 yield "data: [DONE]\n\n"
 
             return StreamingResponse(stream_generator(), media_type="text/event-stream")
diff --git a/comps/llms/text-generation/vllm/langchain/llm.py b/comps/llms/text-generation/vllm/langchain/llm.py
index bd434696fe..ccedec4513 100644
--- a/comps/llms/text-generation/vllm/langchain/llm.py
+++ b/comps/llms/text-generation/vllm/langchain/llm.py
@@ -124,11 +124,12 @@ async def llm_generate(input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
             async def stream_generator():
                 chat_response = ""
                 async for text in llm.astream(new_input.query, **parameters):
-                    chat_response += text
-                    chunk_repr = repr(text.encode("utf-8"))
-                    if logflag:
-                        logger.info(f"[ SearchedDoc ] chunk: {chunk_repr}")
-                    yield f"data: {chunk_repr}\n\n"
+                    if text not in ["<|im_end|>", "<|endoftext|>"]:
+                        chat_response += text
+                        chunk_repr = repr(text.encode("utf-8"))
+                        if logflag:
+                            logger.info(f"[ SearchedDoc ] chunk: {chunk_repr}")
+                        yield f"data: {chunk_repr}\n\n"
                 if logflag:
                     logger.info(f"[ SearchedDoc ] stream response: {chat_response}")
                 yield "data: [DONE]\n\n"
@@ -175,11 +176,12 @@ async def stream_generator():
             async def stream_generator():
                 chat_response = ""
                 async for text in llm.astream(prompt, **parameters):
-                    chat_response += text
-                    chunk_repr = repr(text.encode("utf-8"))
-                    if logflag:
-                        logger.info(f"[ LLMParamsDoc ] chunk: {chunk_repr}")
-                    yield f"data: {chunk_repr}\n\n"
+                    if text not in ["<|im_end|>", "<|endoftext|>"]:
+                        chat_response += text
+                        chunk_repr = repr(text.encode("utf-8"))
+                        if logflag:
+                            logger.info(f"[ LLMParamsDoc ] chunk: {chunk_repr}")
+                        yield f"data: {chunk_repr}\n\n"
                 if logflag:
                     logger.info(f"[ LLMParamsDoc ] stream response: {chat_response}")
                 yield "data: [DONE]\n\n"
diff --git a/comps/llms/text-generation/vllm/llama_index/llm.py b/comps/llms/text-generation/vllm/llama_index/llm.py
index 76afa24a98..335f406295 100644
--- a/comps/llms/text-generation/vllm/llama_index/llm.py
+++ b/comps/llms/text-generation/vllm/llama_index/llm.py
@@ -58,8 +58,9 @@ async def llm_generate(input: LLMParamsDoc):
 
         async def stream_generator():
            async for text in llm.astream_complete(input.query):
-                output = text.text
-                yield f"data: {output}\n\n"
+                if text.text not in ["<|im_end|>", "<|endoftext|>"]:
+                    output = text.text
+                    yield f"data: {output}\n\n"
             if logflag:
                 logger.info(f"[llm - chat_stream] stream response: {output}")
             yield "data: [DONE]\n\n"