diff --git a/comps/cores/proto/api_protocol.py b/comps/cores/proto/api_protocol.py
index 20f36697a7..98b2b56669 100644
--- a/comps/cores/proto/api_protocol.py
+++ b/comps/cores/proto/api_protocol.py
@@ -1016,5 +1016,42 @@ class FineTuningJobCheckpoint(BaseModel):
     """The step number that the checkpoint was created at."""


+# Args allowed in openai-like chat completions API calls in OpeaTextGenService
+ALLOWED_CHATCOMPLETION_ARGS = (
+    "model",
+    "messages",
+    "frequency_penalty",
+    "max_tokens",
+    "n",
+    "presence_penalty",
+    "response_format",
+    "seed",
+    "stop",
+    "stream",
+    "stream_options",
+    "temperature",
+    "top_p",
+    "user",
+)
+
+# Args allowed in openai-like regular completion API calls in OpeaTextGenService
+ALLOWED_COMPLETION_ARGS = (
+    "model",
+    "prompt",
+    "echo",
+    "frequency_penalty",
+    "max_tokens",
+    "n",
+    "presence_penalty",
+    "seed",
+    "stop",
+    "stream",
+    "suffix",
+    "temperature",
+    "top_p",
+    "user",
+)
+
+
 class RouteEndpointDoc(BaseModel):
     url: str = Field(..., description="URL of the chosen inference endpoint")
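The two tuples above act as whitelists for the keyword arguments that `OpeaTextGenService` forwards to an OpenAI-compatible backend (they are consumed by the `_filter_api_params` helper added later in this patch). As a quick orientation, the snippet below is illustrative only and computes the difference between the two whitelists directly from the tuples:

```python
from comps.cores.proto.api_protocol import ALLOWED_CHATCOMPLETION_ARGS, ALLOWED_COMPLETION_ARGS

# Fields accepted only on the chat completions path: messages, response_format, stream_options
print(set(ALLOWED_CHATCOMPLETION_ARGS) - set(ALLOWED_COMPLETION_ARGS))

# Fields accepted only on the legacy completions path: prompt, echo, suffix
print(set(ALLOWED_COMPLETION_ARGS) - set(ALLOWED_CHATCOMPLETION_ARGS))
```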
"google/gemma-3-1b-it:free" +export LLM_ENDPOINT="" # e.g., "http://localhost:8000" (for local vLLM) or "https://openrouter.ai/api" (please make sure to omit /v1 suffix) +export OPENAI_API_KEY="" +``` + +## 3 Run the Textgen Service + +``` +export service_name="textgen-service-endpoint-openai" +docker compose -f comps/llms/deployment/docker_compose/compose_text-generation.yaml up ${service_name} -d +``` + +To observe logs: + +``` +docker logs textgen-service-endpoint-openai +``` + +## 4 Test the service + +You can first test the remote/local endpoint with `curl`. If you're using a service like OpenRouter, you can test it directly first: + +``` +curl https://openrouter.ai/api/v1/chat/completions \ + -H "Content-Type: application/json" \ + -H "Authorization: Bearer $OPENAI_API_KEY" \ + -d '{ + "model": "'${LLM_MODEL_ID}'", + "messages": [ + { + "role": "user", + "content": "Tell me a joke?" + } + ] +}' +``` + +Then you can test the OPEA text generation service that wrapped the endpoint, with the following: + +``` +curl http://localhost:9000/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{"model":"'${LLM_MODEL_ID}'","messages":[{"role":"user","content":"Tell me a joke?"}]}' +``` diff --git a/comps/llms/src/text-generation/integrations/service.py b/comps/llms/src/text-generation/integrations/service.py index 485571dc07..4979e107af 100644 --- a/comps/llms/src/text-generation/integrations/service.py +++ b/comps/llms/src/text-generation/integrations/service.py @@ -2,7 +2,9 @@ # SPDX-License-Identified: Apache-2.0 import asyncio +import logging import os +from pprint import pformat from typing import Union from fastapi.responses import StreamingResponse @@ -11,12 +13,18 @@ from comps import CustomLogger, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, SearchedDoc, ServiceType from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs -from comps.cores.proto.api_protocol import ChatCompletionRequest +from comps.cores.proto.api_protocol import ALLOWED_CHATCOMPLETION_ARGS, ALLOWED_COMPLETION_ARGS, ChatCompletionRequest from .template import ChatTemplate logger = CustomLogger("opea_llm") -logflag = os.getenv("LOGFLAG", False) + +# Configure advanced logging based on LOGFLAG environment variable +logflag = os.getenv("LOGFLAG", "False").lower() in ("true", "1", "yes") +if logflag: + logger.logger.setLevel(logging.DEBUG) +else: + logger.logger.setLevel(logging.INFO) # Environment variables MODEL_NAME = os.getenv("LLM_MODEL_ID") @@ -96,27 +104,30 @@ async def send_simple_request(): def align_input( self, input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc], prompt_template, input_variables ): + """Aligns different input types to a standardized chat completion format. 
diff --git a/comps/llms/src/text-generation/README_endpoint_openai.md b/comps/llms/src/text-generation/README_endpoint_openai.md
new file mode 100644
index 0000000000..1773bd6ff1
--- /dev/null
+++ b/comps/llms/src/text-generation/README_endpoint_openai.md
@@ -0,0 +1,70 @@
+# Introduction
+
+This OPEA text generation service can connect to any OpenAI-compatible API endpoint, including local deployments (like vLLM or TGI) and remote services (like OpenRouter.ai).
+
+## 1 Prepare the TextGen Docker Image
+
+```bash
+# Build the microservice docker image
+
+git clone https://github.com/opea-project/GenAIComps
+cd GenAIComps
+
+docker build \
+  --no-cache \
+  --build-arg https_proxy=$https_proxy \
+  --build-arg http_proxy=$http_proxy \
+  -t opea/llm-textgen:latest \
+  -f comps/llms/src/text-generation/Dockerfile .
+```
+
+## 2 Set Up Environment Variables
+
+The key environment variable is `LLM_ENDPOINT`, which specifies the URL of the OpenAI-compatible API. This can be a local address (e.g., for vLLM or TGI) or a remote address.
+
+```bash
+export host_ip=$(hostname -I | awk '{print $1}')
+export LLM_MODEL_ID="" # e.g., "google/gemma-3-1b-it:free"
+export LLM_ENDPOINT="" # e.g., "http://localhost:8000" (for local vLLM) or "https://openrouter.ai/api" (make sure to omit the /v1 suffix)
+export OPENAI_API_KEY=""
+```
+
+## 3 Run the TextGen Service
+
+```bash
+export service_name="textgen-service-endpoint-openai"
+docker compose -f comps/llms/deployment/docker_compose/compose_text-generation.yaml up ${service_name} -d
+```
+
+To observe the logs:
+
+```bash
+docker logs textgen-service-endpoint-openai
+```
+
+## 4 Test the Service
+
+First, verify the remote or local endpoint itself with `curl`. For example, if you are using a service like OpenRouter:
+
+```bash
+curl https://openrouter.ai/api/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer $OPENAI_API_KEY" \
+  -d '{
+  "model": "'${LLM_MODEL_ID}'",
+  "messages": [
+    {
+      "role": "user",
+      "content": "Tell me a joke?"
+    }
+  ]
+}'
+```
+
+Then test the OPEA text generation service that wraps the endpoint:
+
+```bash
+curl http://localhost:9000/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{"model":"'${LLM_MODEL_ID}'","messages":[{"role":"user","content":"Tell me a joke?"}]}'
+```
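The same smoke test can be driven from Python with the standard `openai` client pointed at the OPEA wrapper. This is a minimal sketch, assuming the service is reachable on port 9000 as in the README above; the `api_key` value here is a placeholder, since the upstream key is configured on the service side via `OPENAI_API_KEY`:

```python
# pip install openai
import os

from openai import OpenAI

# Point the stock OpenAI client at the OPEA textgen wrapper instead of api.openai.com.
client = OpenAI(base_url="http://localhost:9000/v1", api_key="not-used")

resp = client.chat.completions.create(
    model=os.environ.get("LLM_MODEL_ID", "google/gemma-3-1b-it:free"),
    messages=[{"role": "user", "content": "Tell me a joke?"}],
)
print(resp.choices[0].message.content)
```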
diff --git a/comps/llms/src/text-generation/integrations/service.py b/comps/llms/src/text-generation/integrations/service.py
index 485571dc07..4979e107af 100644
--- a/comps/llms/src/text-generation/integrations/service.py
+++ b/comps/llms/src/text-generation/integrations/service.py
@@ -2,7 +2,9 @@
 # SPDX-License-Identified: Apache-2.0

 import asyncio
+import logging
 import os
+from pprint import pformat
 from typing import Union

 from fastapi.responses import StreamingResponse
@@ -11,12 +13,18 @@

 from comps import CustomLogger, LLMParamsDoc, OpeaComponent, OpeaComponentRegistry, SearchedDoc, ServiceType
 from comps.cores.mega.utils import ConfigError, get_access_token, load_model_configs
-from comps.cores.proto.api_protocol import ChatCompletionRequest
+from comps.cores.proto.api_protocol import ALLOWED_CHATCOMPLETION_ARGS, ALLOWED_COMPLETION_ARGS, ChatCompletionRequest

 from .template import ChatTemplate

 logger = CustomLogger("opea_llm")
-logflag = os.getenv("LOGFLAG", False)
+
+# Configure advanced logging based on LOGFLAG environment variable
+logflag = os.getenv("LOGFLAG", "False").lower() in ("true", "1", "yes")
+if logflag:
+    logger.logger.setLevel(logging.DEBUG)
+else:
+    logger.logger.setLevel(logging.INFO)

 # Environment variables
 MODEL_NAME = os.getenv("LLM_MODEL_ID")
@@ -96,27 +104,30 @@ async def send_simple_request():
     def align_input(
         self, input: Union[LLMParamsDoc, ChatCompletionRequest, SearchedDoc], prompt_template, input_variables
     ):
+        """Aligns different input types to a standardized chat completion format.
+
+        Args:
+            input: SearchedDoc, LLMParamsDoc, or ChatCompletionRequest
+            prompt_template: Optional template for formatting prompts
+            input_variables: Variables expected by the prompt template
+        """
         if isinstance(input, SearchedDoc):
-            if logflag:
-                logger.info("[ SearchedDoc ] input from retriever microservice")
+            logger.debug(f"Processing SearchedDoc input from retriever microservice:\n{pformat(vars(input), indent=2)}")
             prompt = input.initial_query
             if input.retrieved_docs:
                 docs = [doc.text for doc in input.retrieved_docs]
-                if logflag:
-                    logger.info(f"[ SearchedDoc ] combined retrieved docs: {docs}")
+                logger.debug(f"Retrieved documents:\n{pformat(docs, indent=2)}")
                 prompt = ChatTemplate.generate_rag_prompt(input.initial_query, docs, MODEL_NAME)
+                logger.debug(f"Generated RAG prompt:\n{prompt}")

-            ## use default ChatCompletionRequest parameters
-            new_input = ChatCompletionRequest(messages=prompt)
+            # Convert to ChatCompletionRequest with default parameters
+            new_input = ChatCompletionRequest(messages=prompt)
+            logger.debug(f"Final converted input:\n{pformat(vars(new_input), indent=2)}")

-            if logflag:
-                logger.info(f"[ SearchedDoc ] final input: {new_input}")
-
-            return prompt, new_input
+            return prompt, new_input

         elif isinstance(input, LLMParamsDoc):
-            if logflag:
-                logger.info("[ LLMParamsDoc ] input from rerank microservice")
+            logger.debug(f"Processing LLMParamsDoc input from rerank microservice:\n{pformat(vars(input), indent=2)}")
             prompt = input.query
             if prompt_template:
                 if sorted(input_variables) == ["context", "question"]:
@@ -124,8 +135,8 @@ def align_input(
                 elif input_variables == ["question"]:
                     prompt = prompt_template.format(question=input.query)
                 else:
-                    logger.info(
-                        f"[ LLMParamsDoc ] {prompt_template} not used, we only support 2 input variables ['question', 'context']"
+                    logger.warning(
+                        f"Prompt template not used - unsupported variables.\nTemplate: {prompt_template}\nOnly ['question', 'context'] or ['question'] are supported"
                     )
             else:
                 if input.documents:
@@ -145,8 +156,7 @@ def align_input(
             return prompt, new_input

         else:
-            if logflag:
-                logger.info("[ ChatCompletionRequest ] input in opea format")
+            logger.debug(f"Processing ChatCompletionRequest input:\n{pformat(vars(input), indent=2)}")
             prompt = input.messages

             if prompt_template:
@@ -179,8 +189,7 @@ async def invoke(self, input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
             input_variables = prompt_template.input_variables

         if isinstance(input, ChatCompletionRequest) and not isinstance(input.messages, str):
-            if logflag:
-                logger.info("[ ChatCompletionRequest ] input in opea format")
+            logger.debug("[ ChatCompletionRequest ] input in opea format")

             if input.messages[0]["role"] == "system":
                 if "{context}" in input.messages[0]["content"]:
@@ -200,22 +209,11 @@ async def invoke(self, input: Union[LLMParamsDoc, ChatCompletionRequest, Searche

                 input.messages.insert(0, {"role": "system", "content": system_prompt})

-            chat_completion = await self.client.chat.completions.create(
-                model=MODEL_NAME,
-                messages=input.messages,
-                frequency_penalty=input.frequency_penalty,
-                max_tokens=input.max_tokens,
-                n=input.n,
-                presence_penalty=input.presence_penalty,
-                response_format=input.response_format,
-                seed=input.seed,
-                stop=input.stop,
-                stream=input.stream,
-                stream_options=input.stream_options,
-                temperature=input.temperature,
-                top_p=input.top_p,
-                user=input.user,
-            )
+            # Create input params directly from input object attributes
+            input_params = {**vars(input), "model": MODEL_NAME}
+            filtered_params = self._filter_api_params(input_params, ALLOWED_CHATCOMPLETION_ARGS)
+            logger.debug(f"Filtered chat completion parameters:\n{pformat(filtered_params, indent=2)}")
+            chat_completion = await self.client.chat.completions.create(**filtered_params)
             """TODO need validate following parameters for vllm
                 logit_bias=input.logit_bias,
                 logprobs=input.logprobs,
@@ -226,22 +224,10 @@ async def invoke(self, input: Union[LLMParamsDoc, ChatCompletionRequest, Searche
                 parallel_tool_calls=input.parallel_tool_calls,"""
         else:
             prompt, input = self.align_input(input, prompt_template, input_variables)
-            chat_completion = await self.client.completions.create(
-                model=MODEL_NAME,
-                prompt=prompt,
-                echo=input.echo,
-                frequency_penalty=input.frequency_penalty,
-                max_tokens=input.max_tokens,
-                n=input.n,
-                presence_penalty=input.presence_penalty,
-                seed=input.seed,
-                stop=input.stop,
-                stream=input.stream,
-                suffix=input.suffix,
-                temperature=input.temperature,
-                top_p=input.top_p,
-                user=input.user,
-            )
+            input_params = {**vars(input), "model": MODEL_NAME, "prompt": prompt}
+            filtered_params = self._filter_api_params(input_params, ALLOWED_COMPLETION_ARGS)
+            logger.debug(f"Filtered completion parameters:\n{pformat(filtered_params, indent=2)}")
+            chat_completion = await self.client.completions.create(**filtered_params)
             """TODO need validate following parameters for vllm
                 best_of=input.best_of,
                 logit_bias=input.logit_bias,
@@ -251,8 +237,7 @@ async def invoke(self, input: Union[LLMParamsDoc, ChatCompletionRequest, Searche

             async def stream_generator():
                 async for c in chat_completion:
-                    if logflag:
-                        logger.info(c)
+                    logger.debug(c)
                     chunk = c.model_dump_json()
                     if chunk not in ["<|im_end|>", "<|endoftext|>"]:
                         yield f"data: {chunk}\n\n"
@@ -260,6 +245,21 @@ async def stream_generator():

             return StreamingResponse(stream_generator(), media_type="text/event-stream")
         else:
-            if logflag:
-                logger.info(chat_completion)
+            logger.debug(chat_completion)
             return chat_completion
+
+    def _filter_api_params(self, input_params: dict, allowed_args: tuple) -> dict:
+        """Filters input parameters to only include allowed non-None arguments.
+
+        Only allowed args are kept, and None-valued defaults are dropped, because
+        some OpenAI-like APIs (e.g., OpenRouter.ai) reject None parameters.
+        Works for both chat completion and regular completion API calls.
+
+        Args:
+            input_params: Dictionary of input parameters
+            allowed_args: Tuple of allowed argument names
+
+        Returns:
+            Filtered dictionary containing only allowed non-None arguments
+        """
+        return {arg: input_params[arg] for arg in allowed_args if arg in input_params and input_params[arg] is not None}
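For reference, here is a small self-contained sketch of the behaviour `_filter_api_params` implements. The dataclass is a stand-in for `ChatCompletionRequest` (its fields are illustrative), and the model name is only an example:

```python
from dataclasses import dataclass
from typing import Optional

from comps.cores.proto.api_protocol import ALLOWED_CHATCOMPLETION_ARGS

@dataclass
class FakeRequest:  # stand-in with a few illustrative fields
    messages: list
    temperature: Optional[float] = None
    max_tokens: Optional[int] = None
    stream: bool = False

req = FakeRequest(messages=[{"role": "user", "content": "hi"}], max_tokens=64)
params = {**vars(req), "model": "Qwen/Qwen2.5-0.5B-Instruct"}

# Same comprehension as _filter_api_params: keep whitelisted, non-None entries only.
filtered = {arg: params[arg] for arg in ALLOWED_CHATCOMPLETION_ARGS if arg in params and params[arg] is not None}

assert "temperature" not in filtered  # None default is dropped
assert filtered["max_tokens"] == 64   # explicitly set value is kept
assert filtered["stream"] is False    # False is not None, so it is still forwarded
```

Dropping the `None` entries matters because some OpenAI-compatible backends (OpenRouter.ai is the example cited in the docstring) reject requests that spell out null parameters explicitly.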
diff --git a/tests/llms/test_llms_textgen_endpoint_openai.sh b/tests/llms/test_llms_textgen_endpoint_openai.sh
new file mode 100644
index 0000000000..8416211574
--- /dev/null
+++ b/tests/llms/test_llms_textgen_endpoint_openai.sh
@@ -0,0 +1,176 @@
+#!/bin/bash
+# Copyright (C) 2025 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# tests/llms/test_llms_textgen_endpoint_openai.sh
+# SPDX-License-Identifier: Apache-2.0
+
+# This script tests the textgen connection to an OpenAI-style endpoint, using a local vLLM OpenAI-compatible server.
+# The vLLM server is started in a docker container, and the textgen service is started in another container.
+# The textgen service is configured to connect to the vLLM server using a test key.
+# The test sends a request to the textgen service and validates the response.
+
+set -e # Exit on error
+
+# --- Check for jq ---
+if ! command -v jq &> /dev/null
+then
+    echo "jq could not be found. Please install it (e.g., apt install jq)."
+    exit 1
+fi
+
+# --- Setup ---
+export WORKPATH=$(dirname "$PWD")
+export host_ip=$(hostname -I | awk '{print $1}')
+export http_proxy=""
+export LOG_PATH="$WORKPATH/tests"
+export VLLM_MODEL="Qwen/Qwen2.5-0.5B-Instruct"
+export LLM_ENDPOINT_PORT="8000"
+export OPENAI_API_KEY=testkey
+
+
+function build_vllm_image() {
+    # This image is used to test textgen-service-endpoint-openai
+    rm -rf $WORKPATH/vllm # Remove existing vllm directory if it exists
+    cd $WORKPATH
+
+    # Pull the last tagged version of vLLM.
+    git clone https://github.com/vllm-project/vllm.git && cd vllm
+    VLLM_VER="$(git describe --tags "$(git rev-list --tags --max-count=1)" )"
+    echo "Checked out vLLM tag ${VLLM_VER}"
+    git checkout ${VLLM_VER} &> /dev/null
+
+    docker build --no-cache -f docker/Dockerfile.cpu -t opea/vllm-cpu:test .
+    cd $WORKPATH
+}
+
+function start_vllm() {
+    export HF_TOKEN=${HF_TOKEN} # Remember to set HF_TOKEN before invoking this test!
+    export VLLM_API_KEY=${OPENAI_API_KEY} # This is the vLLM environment variable used to set API keys.
+    export host_ip=$(hostname -I | awk '{print $1}')
+
+    # Block size must be 16 for the CPU backend unless Intel Extension for PyTorch (IPEX) is installed
+    BLOCK_SIZE=16 # If IPEX is installed, 128 can be used.
+
+    # Environment variables for vLLM CPU configuration:
+    # - VLLM_USE_CPU=1: Explicitly force CPU backend usage
+    # - VLLM_CPU_OMP_THREADS_BIND=all: Configure OpenMP to use all available CPU threads
+    # - VLLM_CPU_KVCACHE_SPACE=4: Allocate 4GB of CPU memory for KV cache
+    # - VLLM_MLA_DISABLE=1: Disable MLA (Multi-head Latent Attention) optimizations, which aren't supported on CPU
+    docker run --rm -d \
+        -p ${LLM_ENDPOINT_PORT}:8000 \
+        -e VLLM_API_KEY=${OPENAI_API_KEY} \
+        -e VLLM_USE_CPU=1 \
+        -e VLLM_CPU_OMP_THREADS_BIND=all \
+        -e VLLM_CPU_KVCACHE_SPACE=4 \
+        -e VLLM_MLA_DISABLE=1 \
+        --name vllm-server \
+        --network bridge \
+        opea/vllm-cpu:test \
+        --model ${VLLM_MODEL} \
+        --port 8000 \
+        --host 0.0.0.0 \
+        --block-size ${BLOCK_SIZE}
+
+    echo "Waiting for vLLM server to initialize..."
+    # Wait for server to be ready by checking logs
+    while ! docker logs vllm-server 2>&1 | grep -q "Application startup complete"; do
+        echo "  Still waiting for vLLM server..."
+        sleep 10
+    done
+    echo "vLLM server is ready!"
+
+    # Verify server is responding
+    curl -v http://localhost:${LLM_ENDPOINT_PORT}/health 2>&1 || echo "Warning: vLLM health check failed"
+}
+
+function start_textgen() {
+    # Test whether textgen can connect to a vLLM endpoint with a test key.
+    export OPENAI_API_KEY=testkey
+    # Use the host IP so the textgen container can reach the vLLM server running on the host
+    export LLM_ENDPOINT="http://${host_ip}:8000"
+    export LLM_MODEL_ID=$VLLM_MODEL # Must match vLLM
+    export service_name="textgen-service-endpoint-openai"
+    export LOGFLAG=True
+
+    echo "Starting textgen service connecting to vllm at: ${LLM_ENDPOINT}"
+
+    # textgen-service-endpoint-openai extends the textgen service. This test uses the image: opea/llm-textgen:latest
+    cd $WORKPATH/comps/llms/deployment/docker_compose
+    docker compose -f compose_text-generation.yaml up ${service_name} -d
+
+    echo "Waiting for textgen-service-endpoint-openai to initialize..."
+    while ! docker logs textgen-service-endpoint-openai 2>&1 | grep -q "Application startup complete"; do
+        echo "  Still waiting for textgen-service-endpoint-openai server..."
+        sleep 5
+    done
+    echo "textgen-service-endpoint-openai is ready!"
+
+    curl http://localhost:9000/health 2>&1 || echo "Warning: textgen health check failed"
+}
+
+function validate_chat_completions() {
+    echo "Validating chat completions endpoint"
+    local response=$(curl -s -X POST http://${host_ip}:9000/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer testkey" \
+        -d '{
+        "model": "'${VLLM_MODEL}'",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello, world!"
+            }
+        ]
+    }')
+
+    echo "Raw chat response: $response"
+
+    local status_code=$(curl -s -o /dev/null -w "%{http_code}" -X POST http://${host_ip}:9000/v1/chat/completions \
+        -H "Content-Type: application/json" \
+        -H "Authorization: Bearer testkey" \
+        -d '{
+        "model": "'${VLLM_MODEL}'",
+        "messages": [
+            {
+                "role": "user",
+                "content": "Hello, world!"
+            }
+        ]
+    }')
+
+    if [[ "$status_code" -ne 200 ]]; then
+        echo "Error: Chat completions HTTP status code is not 200. Received: $status_code"
+        docker logs textgen-service-endpoint-openai || true
+        docker logs vllm-server || true
+        exit 1
+    fi
+
+    local generated_text=$(echo "$response" | jq -r '.choices[0].message.content')
+
+    if [[ -z "$generated_text" ]]; then
+        echo "Error: No generated text found in chat response."
+        docker logs textgen-service-endpoint-openai || true
+        docker logs vllm-server || true
+        exit 1
+    fi
+
+    echo "Chat completions test passed. Generated text: $generated_text"
+}
+
+
+function stop_containers() {
+    docker compose -f compose_text-generation.yaml down
+    docker stop vllm-server || true # the --rm flag will ensure it is removed
+}
+
+
+# Assumes containers from other test runs are already cleared.
+build_vllm_image
+start_vllm
+start_textgen
+validate_chat_completions
+stop_containers
+docker system prune -a -f
+
+echo "All tests completed."
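One path the script above does not exercise is streaming (`"stream": true`), which the service serves as SSE via `stream_generator`. A rough sketch of such a check against the same wrapper and model used in this test (not part of the patch; chunk handling may need adjustment for your client version):

```python
# Rough streaming check against the textgen wrapper started by this script.
import os

from openai import OpenAI

client = OpenAI(base_url="http://localhost:9000/v1", api_key="testkey")
stream = client.chat.completions.create(
    model=os.environ.get("VLLM_MODEL", "Qwen/Qwen2.5-0.5B-Instruct"),
    messages=[{"role": "user", "content": "Hello, world!"}],
    stream=True,
)
# Collect the streamed deltas; chunks without choices (e.g., usage chunks) are skipped.
text = "".join(c.choices[0].delta.content or "" for c in stream if c.choices)
assert text, "expected non-empty streamed content"
print("Streaming chat completions test passed:", text[:80])
```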