diff --git a/ChatQnA/chatqna.py b/ChatQnA/chatqna.py
index d802bf3a51..6266dace8f 100644
--- a/ChatQnA/chatqna.py
+++ b/ChatQnA/chatqna.py
@@ -3,9 +3,15 @@
import argparse
import json
+import logging
import os
import re
+# Configure logging
+logger = logging.getLogger(__name__)
+log_level = logging.DEBUG if os.getenv("LOGFLAG", "").lower() == "true" else logging.INFO
+logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
+
from comps import MegaServiceEndpoint, MicroService, ServiceOrchestrator, ServiceRoleType, ServiceType
from comps.cores.mega.utils import handle_message
from comps.cores.proto.api_protocol import (
@@ -62,6 +68,10 @@ def generate_rag_prompt(question, documents):
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
+ logger.debug(
+ f"Aligning inputs for service: {self.services[cur_node].name}, type: {self.services[cur_node].service_type}"
+ )
+
if self.services[cur_node].service_type == ServiceType.EMBEDDING:
inputs["inputs"] = inputs["text"]
del inputs["text"]
@@ -83,6 +93,9 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
next_inputs["temperature"] = inputs["temperature"]
inputs = next_inputs
+
+ # Log the aligned inputs (be careful with sensitive data)
+ logger.debug(f"Aligned inputs for {self.services[cur_node].name}: {type(inputs)}")
return inputs
@@ -123,7 +136,9 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
elif input_variables == ["question"]:
prompt = prompt_template.format(question=data["initial_query"])
else:
- print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
+ logger.warning(
+ f"{prompt_template} not used, we only support 2 input variables ['question', 'context']"
+ )
prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs)
else:
prompt = ChatTemplate.generate_rag_prompt(data["initial_query"], docs)
@@ -152,7 +167,7 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
elif input_variables == ["question"]:
prompt = prompt_template.format(question=prompt)
else:
- print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
+ logger.warning(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
prompt = ChatTemplate.generate_rag_prompt(prompt, reranked_docs)
else:
prompt = ChatTemplate.generate_rag_prompt(prompt, reranked_docs)
@@ -171,29 +186,65 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
def align_generator(self, gen, **kwargs):
- # OpenAI response format
- # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
- for line in gen:
- line = line.decode("utf-8")
- start = line.find("{")
- end = line.rfind("}") + 1
+ """Aligns the generator output to match ChatQnA's format of sending bytes.
+
+ Handles different LLM output formats (TGI, OpenAI) and properly filters
+ empty or null content chunks to avoid UI display issues.
+ """
+ # OpenAI response format example:
+ # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct",
+ # "system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},
+ # "logprobs":null,"finish_reason":null}]}\n\n'
- json_str = line[start:end]
+ for line in gen:
try:
- # sometimes yield empty chunk, do a fallback here
+ line = line.decode("utf-8")
+ start = line.find("{")
+ end = line.rfind("}") + 1
+
+ # Skip lines with invalid JSON structure
+ if start == -1 or end <= start:
+ logger.debug("Skipping line with invalid JSON structure")
+ continue
+
+ json_str = line[start:end]
+
+ # Parse the JSON data
json_data = json.loads(json_str)
+
+ # Handle TGI format responses
if "ops" in json_data and "op" in json_data["ops"][0]:
if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
- else:
- pass
- elif (
- json_data["choices"][0]["finish_reason"] != "eos_token"
- and "content" in json_data["choices"][0]["delta"]
- ):
- yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
+ # Empty value chunks are silently skipped
+
+ # Handle OpenAI format responses
+ elif "choices" in json_data and len(json_data["choices"]) > 0:
+ # Only yield content if it exists and is not null
+ if (
+ "delta" in json_data["choices"][0]
+ and "content" in json_data["choices"][0]["delta"]
+ and json_data["choices"][0]["delta"]["content"] is not None
+ ):
+ content = json_data["choices"][0]["delta"]["content"]
+ yield f"data: {repr(content.encode('utf-8'))}\n\n"
+ # Null content chunks are silently skipped
+ elif (
+ "delta" in json_data["choices"][0]
+ and "content" in json_data["choices"][0]["delta"]
+ and json_data["choices"][0]["delta"]["content"] is None
+ ):
+ logger.debug("Skipping null content chunk")
+
+ except json.JSONDecodeError as e:
+ # Log the error with the problematic JSON string for better debugging
+ logger.error(f"JSON parsing error in align_generator: {e}\nProblematic JSON: {json_str[:200]}")
+ # Skip sending invalid JSON to avoid UI issues
+ continue
except Exception as e:
- yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+ logger.error(f"Unexpected error in align_generator: {e}, line snippet: {line[:100]}...")
+ # Skip sending to avoid UI issues
+ continue
yield "data: [DONE]\n\n"
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/README_endpoint_openai.md b/ChatQnA/docker_compose/intel/cpu/xeon/README_endpoint_openai.md
new file mode 100644
index 0000000000..c7e6b3bc31
--- /dev/null
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/README_endpoint_openai.md
@@ -0,0 +1,453 @@
+# Build Mega Service of ChatQnA on Xeon with an LLM Endpoint
+
+This document outlines the single node deployment process for a ChatQnA application utilizing the [GenAIComps](https://github.com/opea-project/GenAIComps.git) microservices on Intel Xeon server. The steps include pulling Docker images, container deployment via Docker Compose, and service execution to integrate microservices such as `embedding`, `retriever`, `rerank` and `llm`.
+
+## Table of contents
+
+1. [ChatQnA Quick Start Deployment](#chatqna-quick-start-deployment)
+2. [ChatQnA Docker Compose Files](#chatqna-docker-compose-files)
+3. [ChatQnA with Conversational UI](#chatqna-with-conversational-ui-optional)
+
+## ChatQnA Quick Start Deployment
+
+This section describes how to quickly deploy and test the ChatQnA service manually on an Intel® Xeon® processor. The basic steps are:
+
+1. [Access the Code](#access-the-code)
+2. [Generate a HuggingFace Access Token](#generate-a-huggingface-access-token)
+3. [Configure the Deployment Environment](#configure-the-deployment-environment)
+4. [Deploy the Services Using Docker Compose](#deploy-the-services-using-docker-compose)
+5. [Check the Deployment Status](#check-the-deployment-status)
+6. [Test the Pipeline](#test-the-pipeline)
+7. [Cleanup the Deployment](#cleanup-the-deployment)
+
+### Access the Code
+
+Clone the GenAIExamples repository and access the ChatQnA Intel® Xeon® platform Docker Compose files and supporting scripts:
+
+```bash
+git clone https://github.com/opea-project/GenAIComps
+cd GenAIComps
+
+# Build the opea/llm-textgen image.
+
+docker build \
+ --no-cache \
+ --build-arg https_proxy=$https_proxy \
+ --build-arg http_proxy=$http_proxy \
+ -t opea/llm-textgen:latest \
+ -f comps/llms/src/text-generation/Dockerfile .
+
+
+cd ../
+git clone https://github.com/opea-project/GenAIExamples.git
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon/
+```
+
+### Generate a HuggingFace Access Token
+
+Some HuggingFace resources, such as certain models, are only accessible if the developer has an access token. If you do not already have a HuggingFace access token, create an account at [HuggingFace](https://huggingface.co/) and then generate a [user access token](https://huggingface.co/docs/transformers.js/en/guides/private#step-1-generating-a-user-access-token).
+
+### Endpoint Access
+
+An OpenAI-compatible endpoint (for example, OpenRouter.ai) is required for the LLM service. Obtain a valid API key for the endpoint before continuing.
+
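+Before deploying, it can help to confirm the endpoint and API key work. Below is a minimal Python sketch (assuming the `requests` library; any HTTP client works) that sends one chat completion request; the endpoint and model values are placeholders taken from the environment:
+
+```python
+import os
+
+import requests
+
+# Assumed environment variables; set these to match your endpoint.
+endpoint = os.environ.get("LLM_ENDPOINT", "https://openrouter.ai/api")
+api_key = os.environ["OPENAI_API_KEY"]
+model = os.environ.get("LLM_MODEL_ID", "google/gemma-3-1b-it:free")
+
+resp = requests.post(
+    f"{endpoint}/v1/chat/completions",
+    headers={"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"},
+    json={"model": model, "messages": [{"role": "user", "content": "Say hello."}]},
+    timeout=60,
+)
+resp.raise_for_status()
+print(resp.json()["choices"][0]["message"]["content"])
+```
+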
+### Configure the Deployment Environment
+
+To set up environment variables for deploying ChatQnA services, set the parameters specific to the deployment environment and source the _set_env.sh_ script in this directory:
+
+```bash
+cd GenAIExamples/ChatQnA/docker_compose/intel/cpu/xeon
+source set_env.sh # source environment variables then override below.
+
+export host_ip="External_Public_IP" # e.g. export host_ip=$(hostname -I | awk '{print $1}')
+export HF_TOKEN="Your_Huggingface_API_Token"
+export OPENAI_API_KEY="key for openAI-like endpoint"
+
+export LLM_MODEL_ID="" # e.g. "google/gemma-3-1b-it:free"
+export LLM_ENDPOINT="" # e.g. "https://openrouter.ai/api" (please make sure to omit /v1 suffix)
+export no_proxy="" # Set if any no_proxy variables are needed. See set_env.sh
+```
+
+Consult the section on [ChatQnA Service configuration](#chatqna-configuration) for information on how service specific configuration parameters affect deployments.
+
+### Deploy the Services Using Docker Compose
+
+To deploy the ChatQnA services, execute the `docker compose up` command with the appropriate arguments. For the endpoint-based deployment, execute the command below, which uses the `compose_endpoint_openai.yaml` file.
+
+```bash
+NGINX_PORT=8080 docker compose -f compose_endpoint_openai.yaml up -d
+```
+
+Setting `NGINX_PORT=8080` exposes the chat console at `localhost:8080`, since port 80 may already be in use by another web server on the host.
+
+To enable OpenTelemetry tracing, merge the `compose.telemetry.yaml` file with the compose file used for deployment.
+CPU example with the OpenTelemetry feature enabled:
+
+> NOTE: To get the supported Grafana dashboards, run `download_opea_dashboard.sh` as shown in the commands below.
+
+```bash
+./grafana/dashboards/download_opea_dashboard.sh
+docker compose -f compose_endpoint_openai.yaml -f compose.telemetry.yaml up -d
+```
+
+**Note**: Developers should build the Docker images from source when:
+
+- Developing off the git main branch (the container's ports in the repo may differ from those of the published Docker image).
+- Unable to download the Docker image.
+- A specific version of the Docker image is required.
+
+Please refer to the table below to build different microservices from source:
+
+| Microservice | Deployment Guide |
+| ------------ | --------------------------------------------------------------------------------------------- |
+| Dataprep | https://github.com/opea-project/GenAIComps/tree/main/comps/dataprep |
+| Embedding | https://github.com/opea-project/GenAIComps/tree/main/comps/embeddings |
+| Retriever | https://github.com/opea-project/GenAIComps/tree/main/comps/retrievers |
+| Reranker | https://github.com/opea-project/GenAIComps/tree/main/comps/rerankings |
+| LLM | https://github.com/opea-project/GenAIComps/tree/main/comps/llms |
+| Megaservice | [Megaservice build guide](../../../../README_miscellaneous.md#build-megaservice-docker-image) |
+| UI | [Basic UI build guide](../../../../README_miscellaneous.md#build-ui-docker-image) |
+
+### Check the Deployment Status
+
+After running docker compose, check if all the containers launched via docker compose have started:
+
+```
+docker ps -a
+```
+
+For the endpoint-based deployment, the following 9 containers should be running:
+
+```bash
+CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
+04f0e3607457 opea/nginx:${RELEASE_VERSION} "/docker-entrypoint.…" 17 minutes ago Up 16 minutes 0.0.0.0:8080->80/tcp, [::]:8080->80/tcp chatqna-xeon-nginx-server
+6d7fe1bfd0a5 opea/chatqna-ui:${RELEASE_VERSION} "docker-entrypoint.s…" 17 minutes ago Up 16 minutes 0.0.0.0:5173->5173/tcp, :::5173->5173/tcp chatqna-xeon-ui-server
+71d01fe8bc94 opea/chatqna:${RELEASE_VERSION} "python chatqna.py" 17 minutes ago Up 16 minutes 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp chatqna-xeon-backend-server
+ea12fab1c70e opea/retriever:${RELEASE_VERSION} "python opea_retriev…" 17 minutes ago Up 17 minutes 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-redis-server
+253622403ed6 opea/dataprep:${RELEASE_VERSION} "sh -c 'python $( [ …" 17 minutes ago Up 17 minutes (healthy) 0.0.0.0:6007->5000/tcp, [::]:6007->5000/tcp dataprep-redis-server
+a552cf4f0dd0 redis/redis-stack:7.2.0-v9 "/entrypoint.sh" 17 minutes ago Up 17 minutes (healthy) 0.0.0.0:6379->6379/tcp, :::6379->6379/tcp, 0.0.0.0:8001->8001/tcp, :::8001->8001/tcp redis-vector-db
+6795a52137f7 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 17 minutes ago Up 17 minutes 0.0.0.0:6006->80/tcp, [::]:6006->80/tcp tei-embedding-server
+3e55313e714b opea/llm-textgen:${RELEASE_VERSION} "bash entrypoint.sh" 17 minutes ago Up 17 minutes 0.0.0.0:9000->9000/tcp, :::9000->9000/tcp textgen-service-endpoint-openai
+10318f82c943 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 17 minutes ago Up 17 minutes 0.0.0.0:8808->80/tcp, [::]:8808->80/tcp tei-reranking-server
+```
+
+If any issues are encountered during deployment, refer to the [troubleshooting](../../../../README_miscellaneous.md#troubleshooting) section.
+
+### Test the Pipeline
+
+Once the ChatQnA services are running, test the pipeline using the following command. This will send a sample query to the ChatQnA service and return a response.
+
+```bash
+curl http://${host_ip}:8888/v1/chatqna \
+ -H "Content-Type: application/json" \
+ -d '{
+ "messages": "What is the revenue of Nike in 2023?"
+ }'
+```
+
+**Note** : Access the ChatQnA UI by web browser through this URL: `http://${host_ip}:8080`. Please confirm the `8080` port is opened in the firewall. To validate each microservice used in the pipeline refer to the [Validate microservices](#validate-microservices) section.
+
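+The megaservice streams its answer back as server-sent events (the `data: ...` lines produced by `align_generator` in `chatqna.py`). As an illustration, the short Python sketch below (assuming the `requests` library) prints the raw event stream; the host and port are assumed to match the default deployment:
+
+```python
+import os
+
+import requests
+
+host_ip = os.environ.get("host_ip", "localhost")  # assumed to match your deployment
+
+with requests.post(
+    f"http://{host_ip}:8888/v1/chatqna",
+    json={"messages": "What is the revenue of Nike in 2023?"},
+    stream=True,
+    timeout=300,
+) as resp:
+    resp.raise_for_status()
+    for line in resp.iter_lines():
+        if line:
+            # Each event looks like: data: b'...token...'; the stream ends with data: [DONE]
+            print(line.decode("utf-8"))
+```
+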
+### Cleanup the Deployment
+
+To stop the containers associated with the deployment, execute the following command:
+
+```
+docker compose -f compose_endpoint_openai.yaml down
+```
+
+## ChatQnA Docker Compose Files
+
+In the context of deploying a ChatQnA pipeline on an Intel® Xeon® platform, we can pick and choose different vector databases, large language model serving frameworks, and remove pieces of the pipeline such as the reranker. The table below outlines the various configurations that are available as part of the application. These configurations can be used as templates and can be extended to different components available in [GenAIComps](https://github.com/opea-project/GenAIComps.git).
+
+| File | Description |
+| -------------------------------------------------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| [compose.yaml](./compose.yaml) | Default compose file using vllm as serving framework and redis as vector database |
+| [compose_endpoint_openai.yaml](./compose_endpoint_openai.yaml) | Uses OpenAI-compatible endpoint (remote or local) as LLM serving framework with redis as vector database. |
+| [compose_milvus.yaml](./compose_milvus.yaml) | Uses Milvus as the vector database. All other configurations remain the same as the default |
+| [compose_pinecone.yaml](./compose_pinecone.yaml) | Uses Pinecone as the vector database. All other configurations remain the same as the default. For more details, refer to [README_pinecone.md](./README_pinecone.md). |
+| [compose_qdrant.yaml](./compose_qdrant.yaml) | Uses Qdrant as the vector database. All other configurations remain the same as the default. For more details, refer to [README_qdrant.md](./README_qdrant.md). |
+| [compose_tgi.yaml](./compose_tgi.yaml) | Uses TGI as the LLM serving framework. All other configurations remain the same as the default |
+| [compose_without_rerank.yaml](./compose_without_rerank.yaml) | Default configuration without the reranker |
+| [compose_faqgen.yaml](./compose_faqgen.yaml) | Enables FAQ generation using vLLM as the LLM serving framework. For more details, refer to [README_faqgen.md](./README_faqgen.md). |
+| [compose_faqgen_tgi.yaml](./compose_faqgen_tgi.yaml) | Enables FAQ generation using TGI as the LLM serving framework. For more details, refer to [README_faqgen.md](./README_faqgen.md). |
+| [compose.telemetry.yaml](./compose.telemetry.yaml) | Helper file for telemetry features for vllm. Can be used along with any compose files that serves vllm |
+| [compose_tgi.telemetry.yaml](./compose_tgi.telemetry.yaml) | Helper file for telemetry features for tgi. Can be used along with any compose files that serves tgi |
+| [compose_mariadb.yaml](./compose_mariadb.yaml) | Uses MariaDB Server as the vector database. All other configurations remain the same as the default |
+
+## ChatQnA with Conversational UI (Optional)
+
+To access the Conversational UI (React-based) frontend, modify the UI service in the compose file used for deployment. Replace the `chatqna-xeon-ui-server` service with the `chatqna-xeon-conversation-ui-server` service as per the config below:
+
+```yaml
+chatqna-xeon-conversation-ui-server:
+ image: opea/chatqna-conversation-ui:latest
+ container_name: chatqna-xeon-conversation-ui-server
+ environment:
+ - APP_BACKEND_SERVICE_ENDPOINT=${BACKEND_SERVICE_ENDPOINT}
+ - APP_DATA_PREP_SERVICE_URL=${DATAPREP_SERVICE_ENDPOINT}
+ ports:
+ - "5174:80"
+ depends_on:
+    - chatqna-xeon-backend-server
+ ipc: host
+ restart: always
+```
+
+Once the services are up, open the following URL in the browser: http://{host_ip}:5174. By default, the UI runs on port 80 internally. To use a different host port for the frontend, modify the port mapping in the compose file as shown below:
+
+```yaml
+  chatqna-xeon-conversation-ui-server:
+ image: opea/chatqna-conversation-ui:latest
+ ...
+ ports:
+ - "80:80"
+```
+
+Here is an example of running ChatQnA (default UI):
+
+
+
+Here is an example of running ChatQnA with Conversational UI (React):
+
+
+
+### Validate Microservices
+
+Note: when verifying the microservices via curl or API calls from a remote client, make sure the **ports** of the microservices are open in the firewall of the cloud node.
+Follow the instructions below to validate each microservice.
+For details on how to verify the correctness of the response, refer to [how-to-validate_service](../../hpu/gaudi/how_to_validate_service.md).
+
+1. **TEI Embedding Service**
+ Send a test request to the TEI Embedding Service to ensure it is running correctly:
+
+ ```bash
+ curl http://${host_ip}:6006/embed \
+ -X POST \
+ -d '{"inputs":"What is Deep Learning?"}' \
+ -H 'Content-Type: application/json'
+ ```
+
+ If you receive a connection error, ensure that the service is running and the port 6006 is open in the firewall.
+
+2. **Retriever Microservice**
+
+   To consume the retriever microservice, you need to generate a mock embedding vector with a Python script. The length of the embedding vector
+   is determined by the embedding model.
+   Here we use the model `EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"`, whose vector size is 768.
+
+ Check the vector dimension of your embedding model, set `your_embedding` dimension equal to it.
+
+ ```bash
+ export your_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+ curl http://${host_ip}:7000/v1/retrieval \
+ -X POST \
+ -d "{\"text\":\"test\",\"embedding\":${your_embedding}}" \
+ -H 'Content-Type: application/json'
+ ```
+
+ If the response indicates an invalid embedding vector, verify that the vector size matches the model's expected dimension.
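+
+   To discover the dimension empirically, the short Python sketch below (assuming the `requests` library) embeds a test string via the TEI service and reports the vector length:
+
+   ```python
+   import os
+
+   import requests
+
+   host_ip = os.environ.get("host_ip", "localhost")  # assumed to match your deployment
+
+   resp = requests.post(
+       f"http://{host_ip}:6006/embed",
+       json={"inputs": "What is Deep Learning?"},
+       timeout=60,
+   )
+   resp.raise_for_status()
+   embedding = resp.json()[0]  # TEI returns a list of embedding vectors
+   print(f"Embedding dimension: {len(embedding)}")  # 768 for BAAI/bge-base-en-v1.5
+   ```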
+
+3. **TEI Reranking Service**
+
+ To test the TEI Reranking Service, use the following `curl` command:
+
+ > Skip for ChatQnA without Rerank pipeline
+
+ ```bash
+ curl http://${host_ip}:8808/rerank \
+ -X POST \
+ -d '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' \
+ -H 'Content-Type: application/json'
+ ```
+
+4. **LLM Backend Service**
+
+   On the first startup, this service may take some time to initialize. After it finishes, the service is ready.
+
+ Try the command below to check whether the LLM serving is ready.
+
+ ```bash
+ docker logs textgen-service-endpoint-openai 2>&1 | grep complete
+ # If the service is ready, you will get the response like below.
+ INFO: Application startup complete.
+ ```
+
+ Then try the `cURL` command below to validate services.
+
+You may also test the underlying LLM endpoint directly. For example, with OpenRouter.ai:
+
+```bash
+curl https://openrouter.ai/api/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
+ -d '{
+    "model": "'"${LLM_MODEL_ID}"'",
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is the meaning of life?"
+ }
+ ]
+}'
+```
+
+To test the OPEA service that is based on the above:
+
+```bash
+ curl http://${host_ip}:9000/v1/chat/completions \
+ -X POST \
+    -d '{"model": "'"${LLM_MODEL_ID}"'", "messages": [{"role": "user", "content": "What is Deep Learning?"}], "max_tokens":17}' \
+ -H 'Content-Type: application/json'
+```
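+
+For scripted checks, a minimal Python sketch (assuming the `requests` library and the default 9000 port mapping) mirrors the request above:
+
+```python
+import os
+
+import requests
+
+host_ip = os.environ.get("host_ip", "localhost")  # assumed to match your deployment
+model = os.environ.get("LLM_MODEL_ID", "")        # the model served by your endpoint
+
+resp = requests.post(
+    f"http://{host_ip}:9000/v1/chat/completions",
+    json={
+        "model": model,
+        "messages": [{"role": "user", "content": "What is Deep Learning?"}],
+        "max_tokens": 17,
+    },
+    timeout=120,
+)
+print(resp.status_code)
+print(resp.text)  # raw body; the service may stream chunks or return JSON depending on configuration
+```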
+
+5. **MegaService**
+
+ Use the following `curl` command to test the MegaService:
+
+ ```bash
+ curl http://${host_ip}:8888/v1/chatqna -H "Content-Type: application/json" -d '{
+ "messages": "What is the revenue of Nike in 2023?"
+ }'
+ ```
+
+6. **Nginx Service**
+
+ Use the following curl command to test the Nginx Service:
+
+ ```bash
+ curl http://${host_ip}:${NGINX_PORT}/v1/chatqna \
+ -H "Content-Type: application/json" \
+ -d '{"messages": "What is the revenue of Nike in 2023?"}'
+ ```
+
+7. **Dataprep Microservice (Optional)**
+
+ If you want to update the default knowledge base, you can use the following commands:
+
+   Update the knowledge base via a local file such as [nke-10k-2023.pdf](https://github.com/opea-project/GenAIComps/blob/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf). Either
+   click [here](https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf) to download the file via a web browser,
+   or run the following command in a terminal to fetch it:
+
+ ```bash
+ wget https://raw.githubusercontent.com/opea-project/GenAIComps/v1.1/comps/retrievers/redis/data/nke-10k-2023.pdf
+ ```
+
+ Upload:
+
+ ```bash
+ curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
+ -H "Content-Type: multipart/form-data" \
+ -F "files=@./nke-10k-2023.pdf"
+ ```
+
+ This command updates a knowledge base by uploading a local file for processing. Update the file path according to your environment.
+
+ Add Knowledge Base via HTTP Links:
+
+ ```bash
+ curl -X POST "http://${host_ip}:6007/v1/dataprep/ingest" \
+ -H "Content-Type: multipart/form-data" \
+ -F 'link_list=["https://opea.dev"]'
+ ```
+
+ This command updates a knowledge base by submitting a list of HTTP links for processing.
+
+ Also, you are able to get the file list that you uploaded:
+
+ ```bash
+ curl -X POST "http://${host_ip}:6007/v1/dataprep/get" \
+ -H "Content-Type: application/json"
+ ```
+
+ Then you will get the response JSON like this. Notice that the returned `name`/`id` of the uploaded link is `https://xxx.txt`.
+
+ ```json
+ [
+ {
+ "name": "nke-10k-2023.pdf",
+ "id": "nke-10k-2023.pdf",
+ "type": "File",
+ "parent": ""
+ },
+ {
+ "name": "https://opea.dev.txt",
+ "id": "https://opea.dev.txt",
+ "type": "File",
+ "parent": ""
+ }
+ ]
+ ```
+
+ To delete the file/link you uploaded:
+
+   The `file_path` here should be the `id` obtained from the `/v1/dataprep/get` API.
+
+ ```bash
+ # delete link
+ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+ -d '{"file_path": "https://opea.dev.txt"}' \
+ -H "Content-Type: application/json"
+
+ # delete file
+ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+ -d '{"file_path": "nke-10k-2023.pdf"}' \
+ -H "Content-Type: application/json"
+
+ # delete all uploaded files and links
+ curl -X POST "http://${host_ip}:6007/v1/dataprep/delete" \
+ -d '{"file_path": "all"}' \
+ -H "Content-Type: application/json"
+ ```
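+
+   For scripted ingestion, a minimal Python sketch (assuming the `requests` library) mirrors the file-upload `curl` command above:
+
+   ```python
+   import os
+
+   import requests
+
+   host_ip = os.environ.get("host_ip", "localhost")  # assumed to match your deployment
+
+   with open("nke-10k-2023.pdf", "rb") as f:
+       resp = requests.post(
+           f"http://{host_ip}:6007/v1/dataprep/ingest",
+           files={"files": ("nke-10k-2023.pdf", f, "application/pdf")},
+           timeout=600,
+       )
+   resp.raise_for_status()
+   print(resp.json())
+   ```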
+
+### Profile Microservices
+
+To further analyze microservice performance, users can follow the instructions below to profile the microservices.
+
+#### 1. LLM Endpoint Service
+
+Users can profile the performance of the endpoint service using standard HTTP/network profiling tools such as:
+
+- cURL timing statistics
+- Browser developer tools
+- Network monitoring tools
+
+Example using cURL with timing data:
+
+```bash
+curl -w "\nTime Statistics:\n-----------------\n\
+DNS Lookup: %{time_namelookup}s\n\
+TCP Connect: %{time_connect}s\n\
+TLS Handshake: %{time_appconnect}s\n\
+First Byte: %{time_starttransfer}s\n\
+Total Time: %{time_total}s\n" \
+-H "Content-Type: application/json" \
+-H "Authorization: Bearer $OPENAI_API_KEY" \
+-d '{
+  "model": "'"${LLM_MODEL_ID}"'",
+ "messages": [
+ {
+ "role": "user",
+ "content": "What is machine learning?"
+ }
+ ]
+}' \
+${LLM_ENDPOINT}/v1/chat/completions
+```
+
+You can also use tools like `ab` (Apache Benchmark) for load testing:
+
+```bash
+ab -n 100 -c 10 -p payload.json -T 'application/json' \
+ -H "Authorization: Bearer $OPENAI_API_KEY" \
+ ${LLM_ENDPOINT}/v1/chat/completions
+```
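+
+For a quick end-to-end latency check from Python, the sketch below (assuming the `requests` library) times a single chat completion against the endpoint:
+
+```python
+import os
+import time
+
+import requests
+
+endpoint = os.environ.get("LLM_ENDPOINT", "https://openrouter.ai/api")  # assumed default
+api_key = os.environ["OPENAI_API_KEY"]
+model = os.environ.get("LLM_MODEL_ID", "")
+
+start = time.perf_counter()
+resp = requests.post(
+    f"{endpoint}/v1/chat/completions",
+    headers={"Authorization": f"Bearer {api_key}"},
+    json={"model": model, "messages": [{"role": "user", "content": "What is machine learning?"}]},
+    timeout=120,
+)
+elapsed = time.perf_counter() - start
+print(f"HTTP {resp.status_code}, total latency: {elapsed:.2f}s")
+```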
+
+For detailed API latency monitoring, consider using:
+
+- Grafana for visualization
+- Prometheus for metrics collection
+- OpenTelemetry for distributed tracing
+
+## Conclusion
+
+This guide should enable developers to deploy the default configuration or any of the other compose YAML files for different configurations. It also highlights the configurable parameters that can be set before deployment.
diff --git a/ChatQnA/docker_compose/intel/cpu/xeon/compose_remote.yaml b/ChatQnA/docker_compose/intel/cpu/xeon/compose_endpoint_openai.yaml
similarity index 75%
rename from ChatQnA/docker_compose/intel/cpu/xeon/compose_remote.yaml
rename to ChatQnA/docker_compose/intel/cpu/xeon/compose_endpoint_openai.yaml
index a69a420aaa..802d2020ec 100644
--- a/ChatQnA/docker_compose/intel/cpu/xeon/compose_remote.yaml
+++ b/ChatQnA/docker_compose/intel/cpu/xeon/compose_endpoint_openai.yaml
@@ -32,6 +32,12 @@ services:
INDEX_NAME: ${INDEX_NAME}
TEI_ENDPOINT: http://tei-embedding-service:80
HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN}
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"]
+ interval: 10s
+ timeout: 5s
+ retries: 50
+ restart: unless-stopped
tei-embedding-service:
image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
container_name: tei-embedding-server
@@ -81,14 +87,37 @@ services:
HF_HUB_DISABLE_PROGRESS_BARS: 1
HF_HUB_ENABLE_HF_TRANSFER: 0
command: --model-id ${RERANK_MODEL_ID} --auto-truncate
+ # Substitute vllm with OpeaTextGenService
+  textgen-service-endpoint-openai: # Used instead of vllm
+ image: opea/llm-textgen:${TAG:-latest} # Changed image
+ container_name: textgen-service-endpoint-openai # Updated container name
+ ipc: host
+ ports:
+ - "9000:9000" # Changed port mapping
+ environment:
+ LLM_COMPONENT_NAME: OpeaTextGenService
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ LLM_MODEL_ID: ${LLM_MODEL_ID} # Set to model ID
+ LLM_ENDPOINT: ${LLM_ENDPOINT} # An openai compatible endpoint, e.g. Hugging Face, OpenRouter, OpenAI
+      OPENAI_API_KEY: ${OPENAI_API_KEY} # API key for the OpenAI-compatible endpoint
chatqna-xeon-backend-server:
image: ${REGISTRY:-opea}/chatqna:${TAG:-latest}
container_name: chatqna-xeon-backend-server
depends_on:
- - redis-vector-db
- - tei-embedding-service
- - retriever
- - tei-reranking-service
+ redis-vector-db:
+ condition: service_started
+ dataprep-redis-service:
+ condition: service_healthy
+ tei-embedding-service:
+ condition: service_started
+ retriever:
+ condition: service_started
+ tei-reranking-service:
+ condition: service_started
+ textgen-service-endpoint-openai:
+ condition: service_started
ports:
- "8888:8888"
environment:
@@ -101,9 +130,8 @@ services:
- RETRIEVER_SERVICE_HOST_IP=retriever
- RERANK_SERVER_HOST_IP=tei-reranking-service
- RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80}
- - LLM_SERVER_HOST_IP=${REMOTE_ENDPOINT}
- - OPENAI_API_KEY= ${OPENAI_API_KEY}
- - LLM_SERVER_PORT=80
+ - LLM_SERVER_HOST_IP=textgen-service-endpoint-openai # Updated host IP
+ - LLM_SERVER_PORT=${LLM_SERVER_PORT:-9000}
- LLM_MODEL=${LLM_MODEL_ID}
- LOGFLAG=${LOGFLAG}
ipc: host
diff --git a/GraphRAG/README.md b/GraphRAG/README.md
index 0870b3d829..0cdc3b5905 100644
--- a/GraphRAG/README.md
+++ b/GraphRAG/README.md
@@ -33,7 +33,7 @@ To set up environment variables for deploying GraphRAG services, follow these st
export NEO4J_PASSWORD=${your_neo4j_password}
export PYTHONPATH=${path_to_comps}
export OPENAI_KEY=${your_openai_api_key} #optional, when not provided will use smaller models TGI/TEI
- export HF_TOKEN=${your_hf_token} #needed for TGI/TEI models
+ export HUGGINGFACEHUB_API_TOKEN=${your_hf_token} #needed for TGI/TEI models
```
2. If you are in a proxy environment, also set the proxy-related environment variables:
diff --git a/GraphRAG/docker_compose/intel/cpu/xeon/GraphRAG_LLM_notes.md b/GraphRAG/docker_compose/intel/cpu/xeon/GraphRAG_LLM_notes.md
new file mode 100644
index 0000000000..f18566fda1
--- /dev/null
+++ b/GraphRAG/docker_compose/intel/cpu/xeon/GraphRAG_LLM_notes.md
@@ -0,0 +1,78 @@
+# About GraphRAG LLMs
+
+## Overview
+
+This GraphRAG app uses three distinct LLMs, each optimized for different tasks in the pipeline:
+
+1. Dataprep LLM (endpoint)
+2. Retriever LLM (endpoint)
+3. Final LLM (CPU)
+
+It also uses an embedding service that runs on CPU.
+
+## 1. Dataprep LLM
+
+Used during data ingestion phase to:
+
+- Process and understand document structure
+- Extract entities and relationships between entities
+- Generate and store community summaries in Neo4j
+
+[Dataprep code that builds communities and summarizes them](https://github.com/opea-project/GenAIComps/blob/main/comps/dataprep/src/integrations/neo4j_llamaindex.py#L94):
+
+**Key Requirements:**
+
+- High-quality model for accurate relationship understanding.
+- Larger context window for document processing
+- Can be slower since it's one-time processing
+
+## 2. Retriever LLM
+
+Used during retrieval to:
+
+- Evaluate relevance of the query to pre-computed community summaries (query-focused summarization)
+- Generate specific answers from relevant communities
+- Process multiple communities in parallel
+
+[retriever code](https://github.com/opea-project/GenAIComps/blob/main/comps/retrievers/src/integrations/neo4j.py):
+
+**Key Requirements:**
+
+- Fast inference for real-time processing
+- Efficient batch processing capabilities
+- Balance between quality and speed
+
+## 3. Final LLM
+
+Used as the last step to:
+
+- Process all retriever-generated answers
+- Synthesize information from multiple communities
+- Generate a coherent final response
+
+**Key Requirements:**
+
+- Good at synthesizing multiple sources
+- Strong natural language generation
+- Maintains context across multiple inputs
+
+## Data Flow
+
+1. **Ingestion Phase**
+
+ - Documents → Dataprep LLM → Community Summaries
+ - Summaries stored in Neo4j
+
+2. **Query Phase**
+ - Query → Retriever LLM → Individual Community Answers
+ - Answers → Final LLM → Coherent Response
+
+## Configuration
+
+Each LLM can be configured independently through environment variables:
+
+- `DATAPREP_LLM_ENDPOINT` and `DATAPREP_LLM_MODEL_ID`
+- `RETRIEVER_LLM_ENDPOINT` and `RETRIEVER_LLM_MODEL_ID`
+- `FINAL_LLM_ENDPOINT` and `FINAL_LLM_MODEL_ID`
+
+This allows for optimization of each LLM for its specific task in the pipeline.
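+
+As a small illustration (not part of the deployment itself), the per-stage settings can be collected in Python; the variable names match the list above, and unset values fall back to empty strings:
+
+```python
+import os
+
+# Gather the per-stage LLM configuration from the environment.
+STAGES = ("DATAPREP", "RETRIEVER", "FINAL")
+llm_config = {
+    stage: {
+        "endpoint": os.environ.get(f"{stage}_LLM_ENDPOINT", ""),
+        "model_id": os.environ.get(f"{stage}_LLM_MODEL_ID", ""),
+    }
+    for stage in STAGES
+}
+
+for stage, cfg in llm_config.items():
+    print(f"{stage}: model={cfg['model_id'] or '<unset>'} endpoint={cfg['endpoint'] or '<local>'}")
+```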
diff --git a/GraphRAG/docker_compose/intel/cpu/xeon/README.md b/GraphRAG/docker_compose/intel/cpu/xeon/README.md
new file mode 100644
index 0000000000..0bdd439be4
--- /dev/null
+++ b/GraphRAG/docker_compose/intel/cpu/xeon/README.md
@@ -0,0 +1,298 @@
+# GraphRAG Application
+
+While naive RAG works well for fetching precise information, it fails on global questions directed at an entire text corpus, such as "What are the main themes in the dataset?".
+GraphRAG was introduced in the Microsoft paper "From Local to Global: A Graph RAG Approach to Query-Focused Summarization". The key elements are:
+
+- Uses LLM to derive an entity knowledge graph from the source documents
+- Uses the hierarchical Leiden algorithm to identify communities of closely related entities; a summary is extracted for each community
+- For an input query, the relevant communities are identified and partial answers are generated from each community summary with a retriever LLM (query-focused summarization, QFS)
+- There is a final generation stage (last LLM) that responds to the query based on the intermediate community answers. See [GraphRAG Model Notes](GraphRAG_LLM_notes.md)
+- This app uses three LLMs: dataprep (knowledge graph construction), retriever (query-focused summaries), and final generation. The final generation LLM and the embedding service run on CPU (Xeon), while the dataprep and retriever LLMs are served by remote endpoints.
+
+## Deploy GraphRAG Service
+
+Quick Start Deployment Steps:
+
+1. Set up the environment variables.
+2. Run Docker Compose.
+3. Consume the GraphRAG Service.
+
+Note: If you do not have Docker installed, you can run this script to install it: `bash docker_compose/install_docker.sh`
+
+## Pre-requisites
+
+Build images:
+
+```bash
+cd ~/
+git clone https://github.com/opea-project/GenAIExamples.git
+git clone https://github.com/vllm-project/vllm.git
+git clone https://github.com/opea-project/GenAIComps.git
+
+# vllm-service
+cd vllm/
+VLLM_VER="v0.8.3"
+git checkout ${VLLM_VER}
+docker build --no-cache -f docker/Dockerfile.cpu -t opea/vllm-cpu:${TAG:-latest} --shm-size=128g .
+
+# opea/dataprep
+cd ~/GenAIComps
+docker build -t opea/dataprep:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
+
+# opea/retrievers
+cd ~/GenAIComps
+docker build -t opea/retriever:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
+
+# opea/graphrag-ui
+cd ~/GenAIExamples/GraphRAG/ui
+docker build -t opea/graphrag-ui:latest --build-arg no_proxy=$no_proxy --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f docker/Dockerfile .
+
+# opea/graphrag
+cd ~/GenAIExamples/GraphRAG
+docker build -t opea/graphrag:latest .
+```
+
+Note: it is important to be in the correct path before each build so that Docker has the correct context to COPY the relevant code into the containers.
+
+### Quick Start: 1. Set Up Environment Variables
+
+To set up environment variables for deploying GraphRAG services, follow these steps:
+
+1. Set the required private environment variables:
+
+ ```bash
+ # For simplicity Openrouter.ai is used as an endpoint for both dataprep and retriever components.
+ # These endpoints could be configured to any openAI-like endpoint.
+ export OPENROUTER_KEY="mykey"
+ export HUGGINGFACEHUB_API_TOKEN="mytoken"
+
+ source set_env.sh
+
+ # Below will override some of these defaults in set_env.sh
+
+ export host_ip=$(hostname -I | awk '{print $1}')
+
+ export NEO4J_PORT1=11631
+ export NEO4J_PORT2=11632
+ export NEO4J_URI="bolt://${host_ip}:${NEO4J_PORT2}"
+ export NEO4J_URL="bolt://${host_ip}:${NEO4J_PORT2}"
+
+ export NEO4J_USERNAME="neo4j"
+ export NEO4J_PASSWORD="neo4jtest"
+
+ export DATAPREP_SERVICE_ENDPOINT="http://${host_ip}:5000/v1/dataprep/ingest"
+
+ # Must explicitly override default to not use OpenAI.
+ export OPENAI_LLM_MODEL=""
+ export OPENAI_EMBEDDING_MODEL=""
+
+ # Embedder endpoint
+ export TEI_EMBEDDER_PORT=6006
+ export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
+
+ # LLM for dataprep is used to extract knowledge graph
+ export DATAPREP_LLM_ENDPOINT="https://openrouter.ai/api"
+ export DATAPREP_LLM_MODEL_ID="anthropic/claude-3-haiku"
+ export DATAPREP_LLM_ENDPOINT_KEY=${OPENROUTER_KEY}
+
+ # LLM for retriever performs community summaries at retrieval time
+ export RETRIEVER_LLM_ENDPOINT="https://openrouter.ai/api"
+ export RETRIEVER_LLM_MODEL_ID="anthropic/claude-3-haiku"
+ export RETRIEVER_LLM_ENDPOINT_KEY=${OPENROUTER_KEY}
+
+   # Final LLM formulates the response based on relevant community summaries.
+ export FINAL_LLM_MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct"
+
+ export LOGFLAG=True
+ export MAX_INPUT_TOKENS=4096
+ export MAX_TOTAL_TOKENS=8192
+ export DATAPREP_PORT=11103
+ export RETRIEVER_PORT=11635
+ export MEGA_SERVICE_PORT=8888
+
+ ```
+
+2. If you are in a proxy environment, also set the proxy-related environment variables:
+
+ ```bash
+ export http_proxy="Your_HTTP_Proxy"
+ export https_proxy="Your_HTTPs_Proxy"
+ export no_proxy=$no_proxy,${host_ip} #important to add {host_ip} for containers communication
+ ```
+
+### Quick Start: 2. Run Docker Compose
+
+If the microservice images are available in Docker Hub they will be pulled; otherwise, build the container images manually. Refer to 'Build Docker Images' in the [Guide](../../../../../ChatQnA/docker_compose/intel/cpu/xeon/README.md). [test_compose_on_xeon.sh](../../../../../ChatQnA/tests/test_compose_on_xeon.sh) is a good resource, as it shows how to build the images, start the services, and validate each microservice and the megaservice. This is what is used in CI/CD.
+
+```bash
+cd GraphRAG/docker_compose/intel/cpu/xeon
+NGINX_PORT=8080 docker compose -f compose.yaml up -d
+```
+
+NGINX_PORT=8080 is used here because port 80 is typically already in use on the host.
+
+#### Check the Deployment Status
+
+After running docker compose, check if all the containers launched via docker compose have started:
+
+```bash
+docker ps -a
+```
+
+The following containers should have started:
+
+```bash
+CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
+740d0061fce2 opea/nginx:latest "/docker-entrypoint.…" 3 hours ago Up 3 hours 0.0.0.0:8080->80/tcp, [::]:8080->80/tcp graphrag-xeon-nginx-server
+3010243786cd opea/graphrag-ui:latest "docker-entrypoint.s…" 3 hours ago Up 3 hours 0.0.0.0:5173->5173/tcp, :::5173->5173/tcp graphrag-ui-server
+f63d10453e22 opea/graphrag:latest "python graphrag.py" 3 hours ago Up 3 hours 0.0.0.0:8888->8888/tcp, :::8888->8888/tcp graphrag-xeon-backend-server
+a48d0fba13e6 opea/dataprep:latest "sh -c 'python $( [ …" 3 hours ago Up 3 hours 0.0.0.0:6004->5000/tcp, [::]:6004->5000/tcp dataprep-neo4j-server
+9301a833f220 opea/retriever:latest "python opea_retriev…" 3 hours ago Up 3 hours 0.0.0.0:7000->7000/tcp, :::7000->7000/tcp retriever-neo4j-server
+eda369268406 ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 "text-embeddings-rou…" 3 hours ago Up 3 hours 0.0.0.0:6006->80/tcp, [::]:6006->80/tcp tei-embedding-server
+f21e82efa1fa opea/vllm-cpu:latest "python3 -m vllm.ent…" 3 hours ago Up 3 hours (healthy) 0.0.0.0:9009->80/tcp, [::]:9009->80/tcp vllm-service
+3b541ceeaf9f neo4j:latest "tini -g -- /startup…" 3 hours ago Up 3 hours 7473/tcp, 0.0.0.0:11631->7474/tcp, [::]:11631->7474/tcp, 0.0.0.0:11632->7687/tcp, [::]:11632->7687/tcp neo4j-apoc
+```
+
+##### Test Final vLLM
+
+```bash
+curl http://localhost:9009/v1/chat/completions \
+ -H "Content-Type: application/json" \
+ -d '{"model":"'${FINAL_LLM_MODEL_ID}'","messages":[{"role":"user","content":"Tell me a joke?"}]}'
+```
+
+### Quick Start: 3. Upload RAG Files and Consume the GraphRAG Service
+
+To chat with retrieved information, you need to upload a file using `Dataprep` service.
+
+Here is an example of uploading sample graph data (which can also be uploaded via the UI):
+
+```bash
+cd ~/GenAIExamples/GraphRAG/example_data
+
+# First file
+curl -X POST "http://${host_ip}:6004/v1/dataprep/ingest" -H "Content-Type: multipart/form-data" -F "files=@./programming_languages.txt"
+
+# Second file
+curl -X POST "http://${host_ip}:6004/v1/dataprep/ingest" -H "Content-Type: multipart/form-data" -F "files=@./programming_languages2.txt"
+```
+
+To log in to the Neo4j browser UI, open http://localhost:${NEO4J_PORT1}/browser and log in with the NEO4J_USERNAME and NEO4J_PASSWORD defined in the environment variables section.
+
+The backend graphrag service can be queried via curl:
+
+```bash
+curl http://${host_ip}:8888/v1/graphrag \
+ -H "Content-Type: application/json" \
+  -d '{"messages": [{"role": "user", "content": "What are the main themes of the programming dataset?"}]}'
+```
+
+## Architecture and Deploy details
+
+The GraphRAG example is implemented using the component-level microservices defined in [GenAIComps](https://github.com/opea-project/GenAIComps). The flow chart below shows the information flow between different microservices for this example.
+
+```mermaid
+---
+config:
+ flowchart:
+ nodeSpacing: 400
+ rankSpacing: 100
+ curve: linear
+ themeVariables:
+ fontSize: 50px
+---
+flowchart LR
+ %% Colors %%
+ classDef blue fill:#ADD8E6,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.5
+ classDef orange fill:#FBAA60,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.5
+ classDef orchid fill:#C26DBC,stroke:#ADD8E6,stroke-width:2px,fill-opacity:0.5
+ classDef invisible fill:transparent,stroke:transparent;
+ style GraphRAG-MegaService stroke:#000000
+
+ %% Subgraphs %%
+ subgraph GraphRAG-MegaService["GraphRAG MegaService "]
+ direction LR
+ RET([Retrieval MicroService]):::blue
+ LLM([LLM MicroService]):::blue
+ EM([Embedding MicroService]):::blue
+ end
+ subgraph UserInterface[" User Interface "]
+ direction LR
+ a([User Input Query]):::orchid
+ Ingest([Ingest data]):::orchid
+        UI([UI server]):::orchid
+ end
+
+
+    GDB{{Graph DB}}
+ DP([Data Preparation MicroService]):::blue
+    GW([GraphRAG GateWay]):::orange
+
+
+ %% Data Preparation flow
+ %% Ingest data flow
+ direction LR
+ Ingest[Ingest data] --> UI
+ UI --> DP
+
+ %% interactions buried inside the DP and RET microservice implementations
+ DP <-.-> EM
+ DP <-.-> LLM
+ RET <-.-> EM
+ RET <-.-> LLM
+
+
+ %% Questions interaction
+ direction LR
+ a[User Input Query] --> UI
+ UI --> GW
+ GW <==> GraphRAG-MegaService
+ RET ==> LLM
+
+
+ direction TB
+ %% Graph DB interaction
+ RET <-.-> |d|GDB
+ DP <-.-> |d|GDB
+
+ linkStyle 2 stroke:#000000,stroke-width:2px;
+ linkStyle 3 stroke:#000000,stroke-width:2px;
+ linkStyle 4 stroke:#000000,stroke-width:2px;
+ linkStyle 5 stroke:#000000,stroke-width:2px;
+
+
+```
+
+Xeon default configuration:
+| MicroService | Open Source Project | HW | Default Port | Endpoint |
+| ------------ | ------------------- | --- | ------------ | -------- |
+| Dataprep | Neo4j, LlamaIndex | OpenAI-like Endpoint | 6004 | /v1/dataprep/ingest |
+| Embedding | Llama-index, TEI | Xeon or CPU | 6006 | /v1/embeddings |
+| Retriever | Llama-index, Neo4j | OpenAI-like Endpoint | 7000 | /v1/retrieval |
+| Final LLM | vLLM | Xeon or CPU | 9009 | /v1/chat/completions |
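+
+As a quick sanity check that the ports in the table above are reachable from the host, the short Python sketch below (ports are assumptions matching the default compose file) tests each TCP port:
+
+```python
+import os
+import socket
+
+host_ip = os.environ.get("host_ip", "localhost")  # assumed to match your deployment
+
+# Default host ports from the table above.
+services = {"dataprep": 6004, "embedding": 6006, "retriever": 7000, "final-llm": 9009}
+
+for name, port in services.items():
+    try:
+        with socket.create_connection((host_ip, port), timeout=3):
+            print(f"{name:10s} port {port}: reachable")
+    except OSError as err:
+        print(f"{name:10s} port {port}: NOT reachable ({err})")
+```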
+
+### Models Selection
+
+[GraphRAG Model Notes](GraphRAG_LLM_notes.md)
+
+## Consume GraphRAG Service with RAG
+
+### 1. Check Service Status
+
+Before consuming the GraphRAG service, make sure each microservice is ready by checking its Docker logs.
+
+```bash
+docker logs container_name
+```
+
+### 2. Access via frontend
+
+To access the frontend, open the following URL in your browser: `http://{host_ip}:NGINX_PORT`
+
+In the example above, NGINX_PORT=8080, so the UI is available at `http://{host_ip}:8080`.
+
+## Monitoring OPEA Service with Prometheus and Grafana dashboard
+
+OPEA microservice deployment can easily be monitored through Grafana dashboards in conjunction with Prometheus data collection. Follow the [README](https://github.com/opea-project/GenAIEval/blob/main/evals/benchmark/grafana/README.md) to setup Prometheus and Grafana servers and import dashboards to monitor the OPEA service.
diff --git a/GraphRAG/docker_compose/intel/cpu/xeon/compose.yaml b/GraphRAG/docker_compose/intel/cpu/xeon/compose.yaml
new file mode 100644
index 0000000000..0a8e84fa8c
--- /dev/null
+++ b/GraphRAG/docker_compose/intel/cpu/xeon/compose.yaml
@@ -0,0 +1,206 @@
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+services:
+ neo4j-apoc:
+ image: neo4j:latest
+ container_name: neo4j-apoc
+ volumes:
+ - /$HOME/neo4j/logs:/logs
+ - /$HOME/neo4j/config:/config
+ - /$HOME/neo4j/data:/data
+ - /$HOME/neo4j/plugins:/plugins
+ ipc: host
+ environment:
+ - NEO4J_AUTH=${NEO4J_USERNAME}/${NEO4J_PASSWORD}
+ - NEO4J_PLUGINS=["apoc"]
+ - NEO4J_apoc_export_file_enabled=true
+ - NEO4J_apoc_import_file_enabled=true
+ - NEO4J_apoc_import_file_use__neo4j__config=true
+ - NEO4J_dbms_security_procedures_unrestricted=apoc.\*
+ ports:
+ - "${NEO4J_PORT1:-7474}:7474"
+ - "${NEO4J_PORT2:-7687}:7687"
+ restart: always
+
+ tei-embedding-service:
+ image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5
+ container_name: tei-embedding-server
+ ports:
+ - "6006:80"
+ volumes:
+ - "./data:/data"
+ shm_size: 1g
+ environment:
+ no_proxy: ${no_proxy}
+ NO_PROXY: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ ipc: host
+ command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate
+
+ dataprep-neo4j-llamaindex:
+ image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
+ container_name: dataprep-neo4j-server
+ depends_on:
+ - neo4j-apoc
+ - vllm-service
+ - tei-embedding-service
+ ports:
+ - "6004:5000"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ host_ip: ${host_ip}
+ DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_NEO4J_LLAMAINDEX"
+ HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+      # HF_TOKEN: ${HF_TOKEN} # not needed for dataprep-neo4j-llamaindex
+ NEO4J_URL: ${NEO4J_URL}
+ NEO4J_USERNAME: ${NEO4J_USERNAME}
+ NEO4J_PASSWORD: ${NEO4J_PASSWORD}
+ TGI_LLM_ENDPOINT: ${DATAPREP_LLM_ENDPOINT}
+ TGI_LLM_ENDPOINT_KEY: ${DATAPREP_LLM_ENDPOINT_KEY}
+ LLM_MODEL_ID: ${DATAPREP_LLM_MODEL_ID}
+ TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
+ OPENAI_API_KEY: ${OPENAI_API_KEY}
+ OPENAI_EMBEDDING_MODEL: ${OPENAI_EMBEDDING_MODEL}
+ OPENAI_LLM_MODEL: ${OPENAI_LLM_MODEL}
+ EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
+ LOGFLAG: ${LOGFLAG}
+ restart: unless-stopped
+
+ retriever-neo4j-llamaindex:
+ image: ${REGISTRY:-opea}/retriever:${TAG:-latest}
+ container_name: retriever-neo4j-server
+ ports:
+ - "7000:7000"
+ ipc: host
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ host_ip: ${host_ip}
+ HUGGING_FACE_HUB_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LOGFLAG: ${LOGFLAG}
+ RETRIEVER_COMPONENT_NAME: "OPEA_RETRIEVER_NEO4J"
+ # Embedding endpoint
+ TEI_EMBEDDING_ENDPOINT: ${TEI_EMBEDDING_ENDPOINT}
+ EMBEDDING_MODEL_ID: ${EMBEDDING_MODEL_ID}
+ # Retriever LLM
+ TGI_LLM_ENDPOINT: ${RETRIEVER_LLM_ENDPOINT}
+ TGI_LLM_ENDPOINT_KEY: ${RETRIEVER_LLM_ENDPOINT_KEY}
+ LLM_MODEL_ID: ${RETRIEVER_LLM_MODEL_ID} # This is used for graph indexing and different from vLLM model ID.
+
+ # Only used if using OpenAI models
+ OPENAI_API_KEY: ${OPENAI_API_KEY}
+ OPENAI_LLM_MODEL: ${OPENAI_LLM_MODEL}
+ OPENAI_EMBEDDING_MODEL: ${OPENAI_EMBEDDING_MODEL}
+ VDMS_USE_CLIP: 0
+ NEO4J_URL: ${NEO4J_URL}
+ NEO4J_URI: ${NEO4J_URI}
+ NEO4J_USERNAME: ${NEO4J_USERNAME}
+ NEO4J_PASSWORD: ${NEO4J_PASSWORD}
+ depends_on:
+ - neo4j-apoc
+ - vllm-service
+ - tei-embedding-service
+ restart: unless-stopped
+
+ # vllm-service is the LLM server that will be used by the retriever and backend.
+ vllm-service:
+    image: ${REGISTRY:-opea}/vllm-cpu:${TAG:-latest} # must be built from source; see the README
+ container_name: vllm-service
+ ports:
+ - "9009:80" # this can be accessed by graphrag-xeon-backend-server on internal port 80.
+ ipc: host
+ volumes:
+ - "${MODEL_CACHE:-./data}:/root/.cache/huggingface/hub"
+ shm_size: 128g
+ environment:
+ no_proxy: ${no_proxy}
+ http_proxy: ${http_proxy}
+ https_proxy: ${https_proxy}
+ HF_TOKEN: ${HUGGINGFACEHUB_API_TOKEN}
+ LLM_MODEL_ID: ${FINAL_LLM_MODEL_ID}
+ VLLM_TORCH_PROFILER_DIR: "/mnt"
+ # Laptop environment variables
+ VLLM_USE_CPU: 1
+ VLLM_CPU_OMP_THREADS_BIND: all
+ VLLM_CPU_KVCACHE_SPACE: 4
+ VLLM_MLA_DISABLE: 1
+ healthcheck:
+ test: ["CMD-SHELL", "curl -f http://$host_ip:9009/health || exit 1"]
+ interval: 10s
+ timeout: 10s
+ retries: 100
+ command: --model $LLM_MODEL_ID --host 0.0.0.0 --port 80 --block-size ${BLOCK_SIZE:-16} # for Qwen on laptop set to 16
+
+ graphrag-xeon-backend-server:
+ image: ${REGISTRY:-opea}/graphrag:${TAG:-latest}
+ container_name: graphrag-xeon-backend-server
+ depends_on:
+ - neo4j-apoc
+ - tei-embedding-service
+ - retriever-neo4j-llamaindex
+ - vllm-service
+ ports:
+ - "8888:8888"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - MEGA_SERVICE_HOST_IP=graphrag-xeon-backend-server
+ - RETRIEVER_SERVICE_HOST_IP=retriever-neo4j-llamaindex
+ - RETRIEVER_SERVICE_PORT=7000
+ - LLM_SERVER_HOST_IP=vllm-service # this is the final LLM server that will be used by the backend.
+ - LLM_SERVER_PORT=80
+ - LLM_MODEL_ID=${FINAL_LLM_MODEL_ID} # backend will format the input and provide model to vLLM
+ - LOGFLAG=${LOGFLAG}
+ ipc: host
+ restart: always
+
+ graphrag-ui-server:
+ image: ${REGISTRY:-opea}/graphrag-ui:${TAG:-latest}
+ container_name: graphrag-ui-server
+ depends_on:
+ - graphrag-xeon-backend-server
+ ports:
+ - "5173:5173"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - LLM_MODEL_ID=${FINAL_LLM_MODEL_ID} # this is the LLM model ID for payload request
+ - UPLOAD_FILE_BASE_URL=http://${host_ip}:6004/v1/dataprep/ingest
+ - GET_FILE=http://${host_ip}:6004/v1/dataprep/get
+ - DELETE_FILE=http://${host_ip}:6004/v1/dataprep/delete
+ ipc: host
+ restart: always
+
+ graphrag-xeon-nginx-server:
+ image: ${REGISTRY:-opea}/nginx:${TAG:-latest}
+ container_name: graphrag-xeon-nginx-server
+ depends_on:
+ - graphrag-xeon-backend-server
+ - graphrag-ui-server
+ ports:
+ - "${NGINX_PORT:-80}:80"
+ environment:
+ - no_proxy=${no_proxy}
+ - https_proxy=${https_proxy}
+ - http_proxy=${http_proxy}
+ - FRONTEND_SERVICE_IP=graphrag-ui-server
+ - FRONTEND_SERVICE_PORT=5173
+ - BACKEND_SERVICE_NAME=graphrag
+ - BACKEND_SERVICE_IP=graphrag-xeon-backend-server
+ - BACKEND_SERVICE_PORT=8888
+ - DATAPREP_SERVICE_IP=dataprep-neo4j-llamaindex
+ - DATAPREP_SERVICE_PORT=6004
+ ipc: host
+ restart: always
+networks:
+ default:
+ driver: bridge
diff --git a/GraphRAG/docker_compose/intel/cpu/xeon/set_env.sh b/GraphRAG/docker_compose/intel/cpu/xeon/set_env.sh
new file mode 100644
index 0000000000..60c9b6ce27
--- /dev/null
+++ b/GraphRAG/docker_compose/intel/cpu/xeon/set_env.sh
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+# Remember to set your private variables mentioned in README
+
+# host_ip, OPENAI_API_KEY, HUGGINGFACEHUB_API_TOKEN, proxies...
+pushd "../../../../../" > /dev/null
+source .set_env.sh
+popd > /dev/null
+
+export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
+export OPENAI_EMBEDDING_MODEL="text-embedding-3-small"
+export LLM_MODEL_ID="Qwen/Qwen2.5-0.5B-Instruct" # Will use smaller model for Xeon.
+export OPENAI_LLM_MODEL="gpt-4o"
+export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:6006"
+export TGI_LLM_ENDPOINT="http://${host_ip}:6005"
+export NEO4J_URL="bolt://${host_ip}:7687"
+export NEO4J_USERNAME=neo4j
+export LOGFLAG=True
+export RETRIEVER_SERVICE_PORT=80
diff --git a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
index 7107a560fe..50ba3bd981 100644
--- a/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
+++ b/GraphRAG/docker_compose/intel/hpu/gaudi/compose.yaml
@@ -199,6 +199,7 @@ services:
- no_proxy=${no_proxy}
- https_proxy=${https_proxy}
- http_proxy=${http_proxy}
+ - LLM_MODEL_ID=${LLM_MODEL_ID} # must be set for the UI to make request.
ipc: host
restart: always
chatqna-gaudi-nginx-server:
diff --git a/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh b/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh
index d5b7e64b5b..441ea183be 100644
--- a/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh
+++ b/GraphRAG/docker_compose/intel/hpu/gaudi/set_env.sh
@@ -5,13 +5,14 @@
# Remember to set your private variables mentioned in README
-# host_ip, OPENAI_API_KEY, HF_TOKEN, proxies...
+# host_ip, OPENAI_API_KEY, HUGGINGFACEHUB_API_TOKEN, proxies...
pushd "../../../../../" > /dev/null
source .set_env.sh
popd > /dev/null
host_ip=$(hostname -I | awk '{print $1}')
-export HF_TOKEN=${HF_TOKEN}
+export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
+export HF_TOKEN=${HUGGINGFACEHUB_API_TOKEN}
export TEI_EMBEDDER_PORT=11633
export LLM_ENDPOINT_PORT=11634
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
diff --git a/GraphRAG/example_data/README.md b/GraphRAG/example_data/README.md
new file mode 100644
index 0000000000..87dacf06b6
--- /dev/null
+++ b/GraphRAG/example_data/README.md
@@ -0,0 +1,87 @@
+# GraphRAG Example Datasets
+
+This directory contains example datasets carefully crafted to demonstrate GraphRAG's capabilities for knowledge graph construction and querying.
+
+## Programming Languages Dataset (`programming_languages.txt`)
+
+A concise dataset that showcases GraphRAG's ability to:
+
+1. **Entity Extraction**
+
+ - People (Guido van Rossum, James Gosling)
+ - Organizations (CWI, Sun Microsystems, Google)
+ - Programming Languages (Python, Java, ABC, Go)
+ - Technologies (REPL, var keyword)
+
+2. **Relationship Types**
+
+ - Creation relationships ("created by")
+ - Influence relationships ("influenced by")
+ - Employment relationships ("worked at")
+ - Usage relationships ("used by")
+ - Feature relationships ("borrowed ideas from")
+
+3. **Temporal Information**
+
+ - Creation dates (1991, 1995, 2009)
+ - Sequential influences (ABC → Python → Java)
+
+4. **Complex Reasoning Capabilities**
+ - Bidirectional influences (Java ↔ Python)
+ - Multi-hop relationships (ABC → Python → Java's features)
+ - Organizational relationships (Google's use of multiple languages)
+
+### Example Queries
+
+This dataset is ideal for testing queries like the following (a scripted example is shown after the list):
+
+1. "What are the main themes of the programming dataset?"
+2. "What's the relationship between Google and these programming languages?"
+3. "How did early teaching languages influence modern programming languages?"
+4. "Trace the evolution of programming language features through these languages."
+5. "What role did corporate entities play in language development?"
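+
+Once the GraphRAG stack from the Xeon compose file is running, any of these queries can be sent to the backend. A minimal Python sketch (assuming the `requests` library and the default 8888 port) is:
+
+```python
+import os
+
+import requests
+
+host_ip = os.environ.get("host_ip", "localhost")  # assumed to match your deployment
+query = "What are the main themes of the programming dataset?"
+
+resp = requests.post(
+    f"http://{host_ip}:8888/v1/graphrag",
+    json={"messages": [{"role": "user", "content": query}]},
+    timeout=600,
+)
+resp.raise_for_status()
+print(resp.text)
+```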
+
+### Community Detection
+
+The dataset is structured to form natural communities around:
+
+- Language Development (Python, ABC, Guido)
+- Corporate Influence (Google, Java, Go)
+- Language Features (OOP, REPL, var keyword)
+
+This makes it perfect for testing GraphRAG's community detection and summarization capabilities.
+
+### Why Traditional RAG Falls Short
+
+For the example queries above, traditional RAG approaches would struggle in several ways:
+
+1. **Multi-hop Relationships**
+
+ - Traditional RAG: Can only find direct relationships within single documents
+ - Example: For "How did ABC influence Java's features?", traditional RAG might miss the connection because it can't trace ABC → Python → Java
+ - GraphRAG: Can traverse multiple relationship hops to uncover indirect influences
+
+2. **Community Analysis**
+
+ - Traditional RAG: Limited to keyword matching and proximity-based relationships
+ - Example: "What programming language communities formed around Google?" requires understanding organizational and temporal relationships
+ - GraphRAG: Can detect and analyze communities through relationship patterns and clustering
+
+3. **Bidirectional Relationships**
+
+ - Traditional RAG: Typically treats relationships as unidirectional text mentions
+ - Example: Understanding how Java and Python mutually influenced each other requires tracking bidirectional relationships
+ - GraphRAG: Explicitly models bidirectional relationships and their evolution over time
+
+4. **Complex Entity Relationships**
+
+ - Traditional RAG: Struggles to maintain consistency across multiple entity mentions
+ - Example: "Trace the evolution of REPL features" requires understanding how the feature moved across languages
+ - GraphRAG: Maintains consistent entity relationships across the entire knowledge graph
+
+5. **Temporal Evolution**
+ - Traditional RAG: Limited ability to track changes over time
+ - Example: Understanding how language features evolved requires tracking temporal relationships
+ - GraphRAG: Can model and query temporal relationships between entities
+
+These limitations make traditional RAG less effective for complex queries that require understanding relationships, community structures, and temporal evolution. GraphRAG's knowledge graph approach provides a more complete and accurate representation of these complex relationships.
diff --git a/GraphRAG/example_data/programming_languages.txt b/GraphRAG/example_data/programming_languages.txt
new file mode 100644
index 0000000000..12a3ec4693
--- /dev/null
+++ b/GraphRAG/example_data/programming_languages.txt
@@ -0,0 +1,5 @@
+Python was created by Guido van Rossum in 1991. He was influenced by ABC, a teaching language he helped develop at CWI. Python is known for its readability and is widely used in AI development.
+
+James Gosling designed Java at Sun Microsystems in 1995. Java influenced Python's approach to object-oriented programming, while Python's success later influenced Java's addition of the var keyword and REPL features.
+
+Both Python and Java are used heavily at Google, where they maintain internal versions of these languages. Google also created Go in 2009, which borrowed ideas from both languages while focusing on simplicity and performance.
diff --git a/GraphRAG/example_data/programming_languages2.txt b/GraphRAG/example_data/programming_languages2.txt
new file mode 100644
index 0000000000..bf634a04dd
--- /dev/null
+++ b/GraphRAG/example_data/programming_languages2.txt
@@ -0,0 +1,19 @@
+Python has become the dominant language in AI research and development. Libraries like TensorFlow, developed by Google Brain in 2015, and PyTorch, released by Facebook's AI Research lab in 2016, have made Python the go-to language for implementing machine learning models. Python's syntax and extensive ecosystem make it particularly suited for rapid prototyping in AI applications.
+
+R was created by Ross Ihaka and Robert Gentleman at the University of Auckland in 1993. It specializes in statistical computing and data visualization, making it popular for data analysis in AI projects. While Python has gained more traction in deep learning, R remains crucial in biostatistics and academic research where statistical rigor is prioritized.
+
+Julia was designed at MIT by Jeff Bezanson, Stefan Karpinski, Viral Shah, and Alan Edelman in 2012. It aims to address the "two-language problem" in scientific computing by offering Python's ease of use with C's performance. Julia's just-in-time compilation makes it increasingly popular for computational aspects of AI, particularly in optimization algorithms and differential equation solvers used in scientific machine learning.
+
+JavaScript, initially created by Brendan Eich at Netscape in 1995, has evolved to support AI in web applications through libraries like TensorFlow.js, which was introduced by Google in 2018. This allows machine learning models to run directly in browsers, enabling privacy-preserving AI applications where data never leaves the client device.
+
+Rust, developed by Mozilla, focuses on performance and safety. In 2021, the Rust Foundation was established to steward the language. Though not traditionally associated with AI, Rust is gaining popularity for building high-performance AI infrastructure and is being used by companies like Hugging Face to optimize transformer model inference.
+
+Lisp, created by John McCarthy in 1958, was intimately connected with early AI research. McCarthy founded the Stanford AI Lab in 1963, and Lisp became the preferred language for AI throughout the 1970s and 1980s. Despite being less common in modern AI systems, Lisp's influence on functional programming concepts can be seen in features of Python and Julia used in contemporary machine learning code.
+
+C++ continues to be vital for deploying AI models in production environments where performance is critical. Libraries like ONNX Runtime, developed collaboratively by Microsoft, Facebook, and Amazon since 2017, use C++ to provide cross-platform, high-performance inference engines for deep learning models originally trained in Python frameworks.
+
+MATLAB, developed by MathWorks beginning in the 1980s, has been widely used in academic AI research for matrix operations and algorithm development. Its Neural Network Toolbox, introduced in 1992 and later renamed to Deep Learning Toolbox in 2018, predates many modern deep learning frameworks and influenced their design principles.
+
+Scala, created by Martin Odersky in 2004, combines object-oriented and functional programming paradigms. It gained prominence in AI through Apache Spark, developed at UC Berkeley's AMPLab in 2009, which has become essential for large-scale machine learning on distributed datasets. Spark's MLlib library provides scalable implementations of common machine learning algorithms.
+
+Go, designed by Robert Griesemer, Rob Pike, and Ken Thompson at Google in 2009, is increasingly used for AI microservices and deployment infrastructure. Its simplicity, performance, and excellent support for concurrency make it valuable for building robust AI systems, particularly API layers that serve machine learning models at scale.
diff --git a/GraphRAG/graphrag.py b/GraphRAG/graphrag.py
index 77a912418c..c3833d97e4 100644
--- a/GraphRAG/graphrag.py
+++ b/GraphRAG/graphrag.py
@@ -3,8 +3,17 @@
import argparse
import json
+import logging
import os
import re
+import time
+import uuid
+from typing import Dict, List, Union
+
+# Configure logging
+logger = logging.getLogger(__name__)
+log_level = logging.DEBUG if os.getenv("LOGFLAG", "").lower() == "true" else logging.INFO
+logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
from comps import MegaServiceEndpoint, MicroService, ServiceOrchestrator, ServiceRoleType, ServiceType
from comps.cores.mega.utils import handle_message
@@ -57,7 +66,7 @@ def generate_rag_prompt(question, documents):
def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **kwargs):
if self.services[cur_node].service_type == ServiceType.RETRIEVER:
- print("make no changes for retriever inputs. AlreadyCheckCompletionRequest")
+ logger.debug("No changes needed for retriever inputs - already a CompletionRequest")
elif self.services[cur_node].service_type == ServiceType.LLM:
# convert TGI/vLLM to unified OpenAI /v1/chat/completions format
next_inputs = {}
@@ -71,7 +80,15 @@ def align_inputs(self, inputs, cur_node, runtime_graph, llm_parameters_dict, **k
# next_inputs["repetition_penalty"] = inputs["repetition_penalty"]
next_inputs["temperature"] = inputs["temperature"]
inputs = next_inputs
- print("inputs after align:\n", inputs)
+
+ # Convert Pydantic models to dict before logging
+ log_inputs = inputs
+ if hasattr(inputs, "model_dump"): # Pydantic v2
+ log_inputs = inputs.model_dump()
+ elif hasattr(inputs, "dict"): # Pydantic v1
+ log_inputs = inputs.dict()
+
+ logger.debug(f"Inputs after alignment:\n{json.dumps(log_inputs, indent=2)}")
return inputs
@@ -96,10 +113,12 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
elif input_variables == ["question"]:
prompt = prompt_template.format(question=prompt)
else:
- print(f"{prompt_template} not used, we only support 2 input variables ['question', 'context']")
+ logger.warning(
+ f"Template {prompt_template} not used - only supporting input variables ['question', 'context']"
+ )
prompt = ChatTemplate.generate_rag_prompt(prompt, docs)
else:
- print("no rerank no chat template")
+ logger.debug("Using default chat template (no rerank or custom template provided)")
prompt = ChatTemplate.generate_rag_prompt(prompt, docs)
next_data["inputs"] = prompt
@@ -110,22 +129,36 @@ def align_outputs(self, data, cur_node, inputs, runtime_graph, llm_parameters_di
def align_generator(self, gen, **kwargs):
- # OpenAI response format
- # b'data:{"id":"","object":"text_completion","created":1725530204,"model":"meta-llama/Meta-Llama-3-8B-Instruct","system_fingerprint":"2.0.1-native","choices":[{"index":0,"delta":{"role":"assistant","content":"?"},"logprobs":null,"finish_reason":null}]}\n\n'
- print("generator in align generator:\n", gen)
+ """Aligns the generator output to match ChatQnA's format of sending bytes.
+
+        The UI expects each streamed chunk as b'content', which it then decodes.
+ """
for line in gen:
line = line.decode("utf-8")
start = line.find("{")
end = line.rfind("}") + 1
+ if start == -1 or end <= start:
+                # Skip lines that do not contain a complete JSON object
+ continue
+
json_str = line[start:end]
try:
- # sometimes yield empty chunk, do a fallback here
json_data = json.loads(json_str)
- if json_data["choices"][0]["finish_reason"] != "eos_token":
- yield f"data: {repr(json_data['choices'][0]['delta']['content'].encode('utf-8'))}\n\n"
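+                # Some services stream chunks as a list of "ops" patches; emit the string "value" when present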
+ if "ops" in json_data and "op" in json_data["ops"][0]:
+ if "value" in json_data["ops"][0] and isinstance(json_data["ops"][0]["value"], str):
+ yield f"data: {repr(json_data['ops'][0]['value'].encode('utf-8'))}\n\n"
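+                # OpenAI-style chat completion chunks carry text under choices[0]["delta"]["content"]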
+ elif (
+ "choices" in json_data
+ and "delta" in json_data["choices"][0]
+ and "content" in json_data["choices"][0]["delta"]
+ ):
+ content = json_data["choices"][0]["delta"]["content"]
+ yield f"data: {repr(content.encode('utf-8'))}\n\n"
except Exception as e:
+ # If JSON parsing fails, send the raw string as bytes
yield f"data: {repr(json_str.encode('utf-8'))}\n\n"
+
yield "data: [DONE]\n\n"
@@ -163,12 +196,12 @@ def add_remote_service(self):
async def handle_request(self, request: Request):
data = await request.json()
stream_opt = data.get("stream", True)
- chat_request = ChatCompletionRequest.parse_obj(data)
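+        # model_validate is the Pydantic v2 replacement for the deprecated v1 parse_obj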
+ chat_request = ChatCompletionRequest.model_validate(data)
def parser_input(data, TypeClass, key):
chat_request = None
try:
- chat_request = TypeClass.parse_obj(data)
+ chat_request = TypeClass.model_validate(data)
query = getattr(chat_request, key)
except:
query = None
diff --git a/GraphRAG/tests/README.md b/GraphRAG/tests/README.md
index 3f41f1851c..daf4788df2 100644
--- a/GraphRAG/tests/README.md
+++ b/GraphRAG/tests/README.md
@@ -3,7 +3,7 @@
## Set the required environment variable
```bash
-export HF_TOKEN="Your_Huggingface_API_Token"
+export HUGGINGFACEHUB_API_TOKEN="Your_Huggingface_API_Token"
```
## Run test
diff --git a/GraphRAG/ui/svelte/src/lib/network/chat/Network.ts b/GraphRAG/ui/svelte/src/lib/network/chat/Network.ts
index d0ae7b701d..e78c97c344 100644
--- a/GraphRAG/ui/svelte/src/lib/network/chat/Network.ts
+++ b/GraphRAG/ui/svelte/src/lib/network/chat/Network.ts
@@ -5,13 +5,17 @@ import { env } from "$env/dynamic/public";
import { SSE } from "sse.js";
const CHAT_BASE_URL = env.CHAT_BASE_URL;
+if (!env.LLM_MODEL_ID) {
+ throw new Error("LLM_MODEL_ID environment variable must be set");
+}
+const LLM_MODEL_ID = env.LLM_MODEL_ID;
export async function fetchTextStream(query: string) {
let payload = {};
let url = "";
payload = {
- model: "Intel/neural-chat-7b-v3-3",
+ model: LLM_MODEL_ID,
messages: query,
};
url = `${CHAT_BASE_URL}`;