From e3dc87e71bf350a0ecf1c7845668eb26790d71cf Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Tue, 29 Apr 2025 16:51:43 +0300 Subject: [PATCH 1/5] Add MariaDB Vector third-party service MariaDB Vector was introduced since MariaDB Server 11.7 Signed-off-by: Razvan-Liviu Varzaru --- .../deployment/docker_compose/compose.yaml | 21 +++++++++++++++++++ comps/third_parties/mariadb/src/README.md | 20 ++++++++++++++++++ comps/third_parties/mariadb/src/__init__.py | 2 ++ 3 files changed, 43 insertions(+) create mode 100644 comps/third_parties/mariadb/deployment/docker_compose/compose.yaml create mode 100644 comps/third_parties/mariadb/src/README.md create mode 100644 comps/third_parties/mariadb/src/__init__.py diff --git a/comps/third_parties/mariadb/deployment/docker_compose/compose.yaml b/comps/third_parties/mariadb/deployment/docker_compose/compose.yaml new file mode 100644 index 0000000000..b519c981bc --- /dev/null +++ b/comps/third_parties/mariadb/deployment/docker_compose/compose.yaml @@ -0,0 +1,21 @@ +# Copyright (C) 2025 MariaDB Foundation +# SPDX-License-Identifier: Apache-2.0 + +services: + mariadb-server: + container_name: mariadb-server + image: mariadb:latest + ports: + - "${MARIADB_PORT:-3306}:3306" + restart: always + environment: + - MARIADB_DATABASE=${MARIADB_DATABASE:-vectordb} + - MARIADB_USER=${MARIADB_USER:-dbuser} + - MARIADB_PASSWORD=${MARIADB_PASSWORD:-password} + - MARIADB_RANDOM_ROOT_PASSWORD=1 + healthcheck: + test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"] + start_period: 10s + interval: 10s + timeout: 5s + retries: 3 diff --git a/comps/third_parties/mariadb/src/README.md b/comps/third_parties/mariadb/src/README.md new file mode 100644 index 0000000000..289d60e8e9 --- /dev/null +++ b/comps/third_parties/mariadb/src/README.md @@ -0,0 +1,20 @@ +# Start MariaDB Server + +**MariaDB Vector** was introduced starting with server version 11.7 +For more details please see the [official 
documentation](https://mariadb.com/kb/en/vectors/). + + +## 1. Configure the server + +```bash +export MARIADB_CONTAINER_IMAGE="mariadb:latest" +export MARIADB_USER=dbuser +export MARIADB_PASSWORD=password +export MARIADB_DATABASE=vectordb +``` + +## 2. Run MariaDB Server + +```bash +docker run --name mariadb-server -e MARIADB_USER=${MARIADB_USER} -e MARIADB_RANDOM_ROOT_PASSWORD=1 -e MARIADB_DATABASE=${MARIADB_DATABASE} -e MARIADB_PASSWORD=${MARIADB_PASSWORD} -d -p 3306:3306 ${MARIADB_CONTAINER_IMAGE} +``` diff --git a/comps/third_parties/mariadb/src/__init__.py b/comps/third_parties/mariadb/src/__init__.py new file mode 100644 index 0000000000..49c2a10929 --- /dev/null +++ b/comps/third_parties/mariadb/src/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2025 MariaDB Foundation +# SPDX-License-Identifier: Apache-2.0 From 05d41a73eb625389bb45bc3439d7e88e6e93ba83 Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Tue, 29 Apr 2025 16:53:59 +0300 Subject: [PATCH 2/5] Add retriever MariaDB Vector integration Signed-off-by: Razvan-Liviu Varzaru --- comps/retrievers/README.md | 4 + .../deployment/docker_compose/compose.yaml | 12 ++ comps/retrievers/src/Dockerfile | 4 +- comps/retrievers/src/README_mariadb.md | 58 +++++++ comps/retrievers/src/integrations/config.py | 6 + comps/retrievers/src/integrations/mariadb.py | 153 ++++++++++++++++++ .../src/opea_retrievers_microservice.py | 1 + comps/retrievers/src/requirements.txt | 2 + tests/retrievers/test_retrievers_mariadb.sh | 84 ++++++++++ 9 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 comps/retrievers/src/README_mariadb.md create mode 100644 comps/retrievers/src/integrations/mariadb.py create mode 100644 tests/retrievers/test_retrievers_mariadb.sh diff --git a/comps/retrievers/README.md b/comps/retrievers/README.md index 9cec099365..d68a450252 100644 --- a/comps/retrievers/README.md +++ b/comps/retrievers/README.md @@ -41,3 +41,7 @@ For details, please refer to this [readme](src/README_neo4j.md) ## 
Retriever Microservice with Pathway For details, please refer to this [readme](src/README_pathway.md) + +## Retriever Microservice with MariaDB Vector + +For details, please refer to this [readme](src/README_mariadb.md) diff --git a/comps/retrievers/deployment/docker_compose/compose.yaml b/comps/retrievers/deployment/docker_compose/compose.yaml index a1cbd5e7bd..85261e8047 100644 --- a/comps/retrievers/deployment/docker_compose/compose.yaml +++ b/comps/retrievers/deployment/docker_compose/compose.yaml @@ -14,6 +14,7 @@ include: - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml - ../../../third_parties/vdms/deployment/docker_compose/compose.yaml - ../../../third_parties/arangodb/deployment/docker_compose/compose.yaml + - ../../../third_parties/mariadb/deployment/docker_compose/compose.yaml services: retriever: @@ -225,6 +226,17 @@ services: arango-vector-db: condition: service_healthy + retriever-mariadb-vector: + extends: retriever + container_name: retriever-mariadb-vector + environment: + RETRIEVER_COMPONENT_NAME: ${RETRIEVER_COMPONENT_NAME:-OPEA_RETRIEVER_MARIADBVECTOR} + MARIADB_CONNECTION_URL: ${MARIADB_CONNECTION_URL:-mariadb+mariadbconnector://dbuser:password@mariadb-server:3306/vectordb} + LOGFLAG: ${LOGFLAG} + depends_on: + mariadb-server: + condition: service_healthy + networks: default: driver: bridge diff --git a/comps/retrievers/src/Dockerfile b/comps/retrievers/src/Dockerfile index 7f4580d361..a5ae329e66 100644 --- a/comps/retrievers/src/Dockerfile +++ b/comps/retrievers/src/Dockerfile @@ -9,7 +9,9 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin libcairo2 \ libgl1-mesa-glx \ libglib2.0-0 \ - libjemalloc-dev + libjemalloc-dev \ + libmariadb-dev \ + build-essential RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/retrievers/src/README_mariadb.md b/comps/retrievers/src/README_mariadb.md new file mode 100644 index 0000000000..37561f7481 --- /dev/null +++ 
b/comps/retrievers/src/README_mariadb.md @@ -0,0 +1,58 @@ +# Retriever Microservice + +This retriever microservice is a highly efficient search service designed for handling and retrieving embedding vectors. It operates by receiving an embedding vector as input and conducting a similarity search against vectors stored in a VectorDB database. Users must specify the VectorDB's URL and the index name, and the service searches within that index to find documents with the highest similarity to the input vector. + +The service primarily utilizes similarity measures in vector space to rapidly retrieve contentually similar documents. The vector-based retrieval approach is particularly suited for handling large datasets, offering fast and accurate search results that significantly enhance the efficiency and quality of information retrieval. + +Overall, this microservice provides robust backend support for applications requiring efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. + +### 1.1 Build Docker Image + +```bash +cd GenAIComps +docker build -t opea/retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile . +``` + +### 1.2 Run Docker with CLI (Option A) + +#### 1.2.1 Start MariaDB Server +Please refer to this [readme](../../third_parties/mariadb/src/README.md). +You need to ingest your knowledge documents into the vector database. 
+
+#### 1.2.2 Start the retriever service
+```bash
+export HOST_IP=$(hostname -I | awk '{print $1}')
+# If you've configured the server with the default env values then:
+export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://dbuser:password@${HOST_IP}:3306/vectordb
+
+docker run -d --rm --name="retriever-mariadb-vector" -p 7000:7000 --ipc=host -e MARIADB_CONNECTION_URL=$MARIADB_CONNECTION_URL -e RETRIEVER_COMPONENT_NAME="OPEA_RETRIEVER_MARIADBVECTOR" opea/retriever:latest
+```
+
+### 1.3 Run with Docker Compose (Option B)
+
+```bash
+cd comps/retrievers/deployment/docker_compose
+docker compose -f compose.yaml up retriever-mariadb-vector -d
+```
+
+## 🚀2. Consume Retriever Service
+
+### 2.1 Check Service Status
+
+```bash
+curl http://${HOST_IP}:7000/v1/health_check \
+  -X GET \
+  -H 'Content-Type: application/json'
+```
+
+### 2.2 Consume Embedding Service
+
+To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://${HOST_IP}:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
+  -H 'Content-Type: application/json'
+```
diff --git a/comps/retrievers/src/integrations/config.py b/comps/retrievers/src/integrations/config.py
index 95eb7c16ce..89ed92bbcf 100644
--- a/comps/retrievers/src/integrations/config.py
+++ b/comps/retrievers/src/integrations/config.py
@@ -237,3 +237,9 @@ def format_opensearch_conn_from_env():
 OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
 OPENAI_CHAT_ENABLED = os.getenv("OPENAI_CHAT_ENABLED", "true").lower() == "true"
 OPENAI_EMBED_ENABLED = os.getenv("OPENAI_EMBED_ENABLED", "true").lower() == "true"
+
+#######################################################
+# MariaDB Vector                                      #
+#######################################################
+MARIADB_CONNECTION_URL = os.getenv("MARIADB_CONNECTION_URL", "localhost") +MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") \ No newline at end of file diff --git a/comps/retrievers/src/integrations/mariadb.py b/comps/retrievers/src/integrations/mariadb.py new file mode 100644 index 0000000000..df997e7c47 --- /dev/null +++ b/comps/retrievers/src/integrations/mariadb.py @@ -0,0 +1,153 @@ +# Copyright (C) 2025 MariaDB Foundation +# SPDX-License-Identifier: Apache-2.0 + + +import os +from urllib.parse import urlparse + +import mariadb +from fastapi import HTTPException +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_mariadb.vectorstores import MariaDBStore, MariaDBStoreSettings + +from comps import CustomLogger, EmbedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +from .config import ( + EMBED_MODEL, + HUGGINGFACEHUB_API_TOKEN, + MARIADB_COLLECTION_NAME, + MARIADB_CONNECTION_URL, + TEI_EMBEDDING_ENDPOINT, +) + + +class NullLogger: + def info(self, *args, **kwargs): + pass + + def debug(self, *args, **kwargs): + pass + + def warning(self, *args, **kwargs): + pass + + def error(self, *args, **kwargs): + pass + + def critical(self, *args, **kwargs): + pass + + def exception(self, *args, **kwargs): + pass + + +logger = CustomLogger("mariadbvector_retrievers") +logflag = os.getenv("LOGFLAG", False) +if not logflag: + logger = NullLogger() + + +@OpeaComponentRegistry.register("OPEA_RETRIEVER_MARIADBVECTOR") +class OpeaMARIADBVectorRetriever(OpeaComponent): + """A specialized retriever component derived from OpeaComponent for mariadb vector retriever services. + + Attributes: + client (MariaDBStore): An instance of the MariaDBStore client for vector database operations. 
+ """ + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.RETRIEVER.name.lower(), description, config) + + args = urlparse(MARIADB_CONNECTION_URL) + + self.conn_args = { + "host": args.hostname, + "port": args.port, + "user": args.username, + "password": args.password, + "database": args.path[1:], + } + + self.embedder = self._initialize_embedder() + + health_status = self.check_health() + if not health_status: + logger.error("OpeaMARIADBVectorRetriever health check failed.") + + self.store = self._initialize_client() + + def _initialize_embedder(self): + if TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + logger.info(f"[ init embedder ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + if not HUGGINGFACEHUB_API_TOKEN: + raise HTTPException( + status_code=400, + detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.", + ) + import requests + + response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info") + if response.status_code != 200: + raise HTTPException( + status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available." 
+ ) + model_id = response.json()["model_id"] + embeddings = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT + ) + else: + # create embeddings using local embedding model + logger.info(f"[ init embedder ] LOCAL_EMBEDDING_MODEL:{EMBED_MODEL}") + embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL) + return embeddings + + def _initialize_client(self) -> MariaDBStore: + store = MariaDBStore( + embeddings=self.embedder, + collection_name=MARIADB_COLLECTION_NAME, + datasource=MARIADB_CONNECTION_URL, + config=MariaDBStoreSettings(lazy_init=True), + ) + return store + + def check_health(self) -> bool: + """Checks mariadb server health.""" + try: + connection = mariadb.connect(**self.conn_args) + return True + except mariadb.Error as e: + logger.error(f"Error connect to MariaDB Server: {e}") + return False + + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + return False + finally: + try: + connection.close() + except Exception as e: + logger.error(f"Error closing connection: {e}") + + async def invoke(self, input: EmbedDoc) -> list: + """Search the MariaDB Vector index for the most similar documents to the input query. + + Args: + input (EmbedDoc): The input query to search for. + Output: + list: The retrieved documents. 
+ """ + logger.info(f"[ similarity search ] input: {input}") + + result = [] + try: + result = await self.store.asimilarity_search_by_vector(embedding=input.embedding) + logger.info(f"[ similarity search ] search result: {result}") + return result + except mariadb.Error as e: + logger.error(f"A database error occurred during similarity search: {e}") + raise HTTPException(status_code=500, detail="A database error occurred during similarity search") + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + raise HTTPException(status_code=500, detail="An unexpected error occurred") diff --git a/comps/retrievers/src/opea_retrievers_microservice.py b/comps/retrievers/src/opea_retrievers_microservice.py index 54015d3a20..2d3bbf7873 100644 --- a/comps/retrievers/src/opea_retrievers_microservice.py +++ b/comps/retrievers/src/opea_retrievers_microservice.py @@ -10,6 +10,7 @@ # import for retrievers component registration from integrations.elasticsearch import OpeaElasticsearchRetriever +from integrations.mariadb import OpeaMARIADBVectorRetriever from integrations.milvus import OpeaMilvusRetriever from integrations.neo4j import OpeaNeo4jRetriever from integrations.opensearch import OpeaOpensearchRetriever diff --git a/comps/retrievers/src/requirements.txt b/comps/retrievers/src/requirements.txt index 9b27448dd9..f7d79ef704 100644 --- a/comps/retrievers/src/requirements.txt +++ b/comps/retrievers/src/requirements.txt @@ -13,6 +13,7 @@ haystack-ai==2.3.1 langchain-arangodb langchain-elasticsearch langchain-openai +langchain-mariadb langchain-pinecone langchain-vdms>=0.1.4 langchain_community @@ -25,6 +26,7 @@ llama-index-llms-openai llama-index-llms-openai-like llama-index-llms-text-generation-inference llama_index_graph_stores_neo4j +mariadb neo4j numpy opensearch-py diff --git a/tests/retrievers/test_retrievers_mariadb.sh b/tests/retrievers/test_retrievers_mariadb.sh new file mode 100644 index 0000000000..84785f4ac6 --- /dev/null +++ 
b/tests/retrievers/test_retrievers_mariadb.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -x
+
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+export REGISTRY=${IMAGE_REPO}
+export TAG="comps"
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=${TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+export host_ip=$(hostname -I | awk '{print $1}')
+service_name="retriever-mariadb-vector"
+
+function build_docker_images() {
+    cd $WORKPATH
+    docker build --no-cache -t ${REGISTRY:-opea}/retriever:${TAG:-latest} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
+    if [ $? -ne 0 ]; then
+        echo "opea/retriever built fail"
+        exit 1
+    else
+        echo "opea/retriever built successful"
+    fi
+}
+
+function start_service() {
+    export MARIADB_PORT=11617
+    export RETRIEVER_PORT=11618
+    export MARIADB_USER=testuser
+    export MARIADB_PASSWORD=testpwd
+    export MARIADB_DATABASE=vectordb
+    export HF_TOKEN=${HF_TOKEN}
+    export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://${MARIADB_USER}:${MARIADB_PASSWORD}@$host_ip:$MARIADB_PORT/${MARIADB_DATABASE}
+    export LOGFLAG=True
+
+    cd $WORKPATH/comps/retrievers/deployment/docker_compose
+    docker compose -f compose.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log
+
+    sleep 1m
+}
+
+function validate_microservice() {
+    test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+
+    result=$(http_proxy='' \
+    curl http://${host_ip}:$RETRIEVER_PORT/v1/retrieval \
+        -X POST \
+        -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \
+        -H 'Content-Type: application/json')
+    if [[ $result == *"retrieved_docs"* ]]; then
+        echo "Result correct."
+    else
+        echo "Result wrong. 
Received was $result" + docker logs mariadb-server >> ${LOG_PATH}/vectorstore.log + docker logs ${service_name} >> ${LOG_PATH}/retriever-mariadb-vector.log + exit 1 + fi +} + +function stop_docker() { + cd $WORKPATH/comps/retrievers/deployment/docker_compose + docker compose -f compose.yaml down --remove-orphans + cid=$(docker ps -aq --filter "name=mariadb-server") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From a0c00fd4ced3c72d5ef4088b419bb8ddd5ff4e9f Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Tue, 29 Apr 2025 16:55:42 +0300 Subject: [PATCH 3/5] Add dataprep MariaDB Vector integration Signed-off-by: Razvan-Liviu Varzaru --- comps/dataprep/README.md | 4 + .../deployment/docker_compose/compose.yaml | 24 + comps/dataprep/src/Dockerfile | 1 + comps/dataprep/src/README_mariadb.md | 98 +++++ comps/dataprep/src/integrations/mariadb.py | 415 ++++++++++++++++++ .../src/opea_dataprep_microservice.py | 1 + comps/dataprep/src/requirements.txt | 2 + tests/dataprep/test_dataprep_mariadb.sh | 106 +++++ 8 files changed, 651 insertions(+) create mode 100644 comps/dataprep/src/README_mariadb.md create mode 100644 comps/dataprep/src/integrations/mariadb.py create mode 100644 tests/dataprep/test_dataprep_mariadb.sh diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index 7cba93f0a0..cdb226ccc9 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -60,3 +60,7 @@ For details, please refer to this [readme](src/README_neo4j_llamaindex.md) ## Dataprep Microservice for financial domain data For details, please refer to this [readme](src/README_finance.md) + +## Dataprep Microservice with MariaDB Vector + +For details, please refer to this [readme](src/README_mariadb.md) \ No newline at end of file diff --git 
a/comps/dataprep/deployment/docker_compose/compose.yaml b/comps/dataprep/deployment/docker_compose/compose.yaml index c44fdb818f..481f63778c 100644 --- a/comps/dataprep/deployment/docker_compose/compose.yaml +++ b/comps/dataprep/deployment/docker_compose/compose.yaml @@ -15,6 +15,7 @@ include: - ../../../third_parties/tei/deployment/docker_compose/compose.yaml - ../../../third_parties/vllm/deployment/docker_compose/compose.yaml - ../../../third_parties/arangodb/deployment/docker_compose/compose.yaml + - ../../../third_parties/mariadb/deployment/docker_compose/compose.yaml services: @@ -414,6 +415,29 @@ services: retries: 10 restart: unless-stopped + dataprep-mariadb-vector: + image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} + container_name: dataprep-mariadb-vector + ports: + - "${DATAPREP_PORT:-5000}:5000" + depends_on: + mariadb-server: + condition: service_healthy + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MARIADBVECTOR" + MARIADB_CONNECTION_URL: ${MARIADB_CONNECTION_URL:-mariadb+mariadbconnector://dbuser:password@mariadb-server:3306/vectordb} + LOGFLAG: ${LOGFLAG} + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + restart: unless-stopped + networks: default: driver: bridge diff --git a/comps/dataprep/src/Dockerfile b/comps/dataprep/src/Dockerfile index a344066ba0..eba0288012 100644 --- a/comps/dataprep/src/Dockerfile +++ b/comps/dataprep/src/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin libcairo2 \ libgl1-mesa-glx \ libjemalloc-dev \ + libmariadb-dev \ libpq-dev \ libreoffice \ poppler-utils \ diff --git a/comps/dataprep/src/README_mariadb.md b/comps/dataprep/src/README_mariadb.md new file mode 100644 index 0000000000..6cb2262b4e --- /dev/null +++ b/comps/dataprep/src/README_mariadb.md @@ -0,0 +1,98 @@ 
+# Dataprep Microservice with MariaDB Vector
+
+## 🚀1. Start Microservice with Docker
+
+### 1.1 Build Docker Image
+
+```bash
+cd GenAIComps
+docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
+```
+
+### 1.2 Run Docker with CLI (Option A)
+
+#### 1.2.1 Start MariaDB Server
+Please refer to this [readme](../../third_parties/mariadb/src/README.md).
+
+#### 1.2.2 Start the data preparation service
+```bash
+
+export HOST_IP=$(hostname -I | awk '{print $1}')
+# If you've configured the server with the default env values then:
+export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://dbuser:password@${HOST_IP}:3306/vectordb
+
+docker run -d --rm --name="dataprep-mariadb-vector" -p 5000:5000 --ipc=host -e MARIADB_CONNECTION_URL=$MARIADB_CONNECTION_URL -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_MARIADBVECTOR" opea/dataprep:latest
+```
+
+### 1.3 Run with Docker Compose (Option B)
+
+```bash
+cd comps/dataprep/deployment/docker_compose
+docker compose -f compose.yaml up dataprep-mariadb-vector -d
+```
+
+## 🚀2. Consume Microservice
+
+### 2.1 Consume Upload API
+
+Once the data preparation microservice for MariaDB Vector is started, one can use the below command to invoke the microservice to convert documents/links to embeddings and save them to the vector store.
+
+```bash
+export document="/path/to/document"
+curl -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"path\":\"${document}\"}" \
+    http://localhost:5000/v1/dataprep/ingest
+```
+
+### 2.2 Consume get API
+
+To get the structure of the uploaded files, use the `get` API endpoint:
+
+```bash
+curl -X POST \
+    -H "Content-Type: application/json" \
+    http://localhost:5000/v1/dataprep/get
+```
+
+A JSON formatted response similar to the one below will follow:
+
+```json
+[
+  {
+    "name": "uploaded_file_1.txt",
+    "id": "uploaded_file_1.txt",
+    "type": "File",
+    "parent": ""
+  },
+  {
+    "name": "uploaded_file_2.txt",
+    "id": "uploaded_file_2.txt",
+    "type": "File",
+    "parent": ""
+  }
+]
+```
+
+### 2.3 Consume delete API
+
+To delete uploaded files/links, use the `delete` API endpoint.
+
+The `file_path` is the `id` returned by the `/v1/dataprep/get` API.
+
+```bash
+# delete link
+curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" \
+    -H "Content-Type: application/json" \
+    -d '{"file_path": "https://www.ces.tech/.txt"}'
+
+# delete file
+curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" \
+    -H "Content-Type: application/json" \
+    -d '{"file_path": "uploaded_file_1.txt"}'
+
+# delete all files and links
+curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" \
+    -H "Content-Type: application/json" \
+    -d '{"file_path": "all"}'
+```
diff --git a/comps/dataprep/src/integrations/mariadb.py b/comps/dataprep/src/integrations/mariadb.py
new file mode 100644
index 0000000000..3f2f7b2105
--- /dev/null
+++ b/comps/dataprep/src/integrations/mariadb.py
@@ -0,0 +1,415 @@
+# Copyright (C) 2025 MariaDB Foundation
+# SPDX-License-Identifier: Apache-2.0
+
+import hashlib
+import json
+import os
+from pathlib import Path
+from typing import (
+    List,
+    Optional,
+    Union,
+)
+from urllib.parse import urlparse
+
+import mariadb
+from fastapi import Body, File, Form, HTTPException, UploadFile
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from 
langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_mariadb.vectorstores import MariaDBStore, MariaDBStoreSettings + +from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest +from comps.dataprep.src.utils import ( + create_upload_folder, + document_loader, + encode_filename, + get_file_structure, + get_separators, + parse_html_new, + remove_folder_with_ignore, + save_content_to_local_disk, +) + + +# A no-op logger that does nothing +class NullLogger: + def info(self, *args, **kwargs): + pass + + def debug(self, *args, **kwargs): + pass + + def warning(self, *args, **kwargs): + pass + + def error(self, *args, **kwargs): + pass + + def critical(self, *args, **kwargs): + pass + + def exception(self, *args, **kwargs): + pass + + +logger = CustomLogger("opea_dataprep_mariadbvector") +logflag = os.getenv("LOGFLAG", False) +if not logflag: + logger = NullLogger() + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") +# TEI Embedding endpoints +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +# Huggingface API token for TEI embedding endpoint +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") + +MARIADB_CONNECTION_URL = os.getenv("MARIADB_CONNECTION_URL", "localhost") + +# Vector Index Configuration +MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") + +# chunk parameters +CHUNK_SIZE = os.getenv("CHUNK_SIZE", 1500) +CHUNK_OVERLAP = os.getenv("CHUNK_OVERLAP", 100) + + +class DocumentsTable: + """Table for storing documents.""" + + def __init__(self, conn_args): + self._table_name = "langchain_documents" + self.conn_args = conn_args + self.__post__init__() + + def __post__init__(self): + self.create_table_if_not_exists() + + def create_table_if_not_exists(self): + """Create the documents 
table if it does not exist.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute( + f""" + CREATE TABLE IF NOT EXISTS {self._table_name} ( + id VARCHAR(32) PRIMARY KEY, + name TEXT, + embedding_ids JSON + ) + """ + ) + connection.commit() + cursor.close() + connection.close() + + def insert_document_ids(self, id: str, name: str, embedding_ids: list): + """Insert a document into the documents table.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute( + f"INSERT INTO {self._table_name} (id, name, embedding_ids) VALUES (?, ?, ?)", + (id, name, json.dumps(embedding_ids)), + ) + connection.commit() + cursor.close() + connection.close() + + def delete_document(self, id: str): + """Delete a document from the documents table.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute(f"DELETE FROM {self._table_name} WHERE id = ?", (id,)) + connection.commit() + cursor.close() + connection.close() + + def delete_all_documents(self): + """Delete all documents from the documents table.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute(f"DELETE FROM {self._table_name}") + connection.commit() + cursor.close() + connection.close() + + def get_document_emb_ids(self, id: str): + """Get the embedding ids for a document.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute(f"SELECT embedding_ids FROM {self._table_name} WHERE id = ?", (id,)) + result = cursor.fetchone() + cursor.close() + connection.close() + if result: + return json.loads(result[0]) + return None + + +@OpeaComponentRegistry.register("OPEA_DATAPREP_MARIADBVECTOR") +class OpeaMariaDBDataprep(OpeaComponent): + """Dataprep component for MariaDBStore ingestion and search services.""" + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, 
ServiceType.DATAPREP.name.lower(), description, config) + args = urlparse(MARIADB_CONNECTION_URL) + + self.conn_args = { + "host": args.hostname, + "port": args.port, + "user": args.username, + "password": args.password, + "database": args.path[1:], + } + + self.upload_folder = Path("./uploaded_files/") + self.embedder = self._initialize_embedder() + + # Perform health check + health_status = self.check_health() + if not health_status: + logger.error("OpeaMariaDBDataprep health check failed.") + + self.store = self._initialize_client() + self.documents = DocumentsTable(self.conn_args) + + def _initialize_embedder(self): + if TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + logger.info(f"[ init embedder ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + if not HUGGINGFACEHUB_API_TOKEN: + raise HTTPException( + status_code=400, + detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.", + ) + import requests + + response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info") + if response.status_code != 200: + raise HTTPException( + status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available." 
+ ) + model_id = response.json()["model_id"] + embeddings = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT + ) + else: + # create embeddings using local embedding model + logger.info(f"[ init embedder ] LOCAL_EMBEDDING_MODEL:{EMBED_MODEL}") + embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL) + return embeddings + + def _initialize_client(self) -> MariaDBStore: + store = MariaDBStore( + embeddings=self.embedder, + collection_name=MARIADB_COLLECTION_NAME, + datasource=MARIADB_CONNECTION_URL, + config=MariaDBStoreSettings(lazy_init=True), + ) + return store + + def check_health(self) -> bool: + """Checks mariadb server health.""" + try: + connection = mariadb.connect(**self.conn_args) + return True + except mariadb.Error as e: + logger.error(f"Error connect to MariaDB Server: {e}") + return False + + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + return False + finally: + try: + connection.close() + except Exception as e: + logger.error(f"Error closing connection: {e}") + + def invoke(self, *args, **kwargs): + pass + + async def _save_file_to_local_disk(self, save_path: Path, file): + with save_path.open("wb") as fout: + try: + content = await file.read() + fout.write(content) + except Exception as e: + logger.error(f"Write file failed. Exception: {e}") + raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. 
Exception: {e}") + + def _store_texts(self, doc_path: str, chunks: list[str], batch_size: int = 32): + num_chunks = len(chunks) + metadata = [{"doc_name": doc_path}] + doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_emb_ids = [] + for i in range(0, num_chunks, batch_size): + batch_texts = chunks[i : i + batch_size] + batch_ids = self.store.add_texts( + texts=batch_texts, + metadatas=metadata * len(batch_texts), + ) + doc_emb_ids.extend(batch_ids) + self.documents.insert_document_ids(id=doc_id, name=doc_path, embedding_ids=doc_emb_ids) + if logflag: + logger.info(f"Processed batch {i // batch_size + 1} / {(num_chunks - 1) // batch_size + 1}") + + async def _ingest_doc_to_mariadb(self, path: str): + """Ingest document to mariadb.""" + doc_path = DocPath(path=path).path + logger.info(f"Parsing document {doc_path}.") + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators() + ) + + content = await document_loader(doc_path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(doc_path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + logger.info(f"Done preprocessing. 
Created {len(chunks)} chunks of the original file.") + + self._store_texts(doc_path, chunks) + return True + + async def _ingest_link_to_mariadb(self, link_list: List[str]): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators() + ) + + for link in link_list: + content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP) + logger.info(f"[ ingest link ] link: {link} content: {content}") + encoded_link = encode_filename(link) + save_path = self.upload_folder / (encoded_link + ".txt") + doc_path = self.upload_folder / (link + ".txt") + logger.info(f"[ ingest link ] save_path: {save_path}") + await save_content_to_local_disk(str(save_path), content) + + chunks = text_splitter.split_text(content) + self._store_texts(str(doc_path), chunks) + return True + + async def ingest_files( + self, + input: DataprepRequest, + ): + """Ingest files/links content into database. + + Save in the format of vector[768]. + Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. + Args: + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
+ """ + files = input.files + link_list = input.link_list + + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + if files and link_list: + raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") + + if not files and not link_list: + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + if files: + if not isinstance(files, list): + files = [files] + + self.upload_folder.mkdir(parents=True, exist_ok=True) + for file in files: + save_path = self.upload_folder / file.filename + await self._save_file_to_local_disk(save_path, file) + await self._ingest_doc_to_mariadb(str(save_path)) + logger.info(f"Successfully saved file {save_path}") + + if link_list: + try: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + await self._ingest_link_to_mariadb(link_list) + logger.info(f"Successfully saved link list {link_list}") + except json.JSONDecodeError: + raise HTTPException(status_code=400, detail="Invalid JSON format for link_list.") + + result = {"status": 200, "message": "Data preparation succeeded"} + logger.info(result) + return result + + async def get_files(self): + """Get file structure from database in the format of + { + "name": "File Name", + "id": "File Name", + "type": "File", + "parent": "", + }""" + logger.info("[ dataprep - get file ] start to get file structure") + + if not self.upload_folder.exists(): + logger.info("No file uploaded, return empty list.") + return [] + + file_content = get_file_structure(str(self.upload_folder)) + logger.info(file_content) + return file_content + + def _delete_embedding(self, doc_path: Path): + doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_emb_ids = self.documents.get_document_emb_ids(doc_id) + self.store.delete(ids=doc_emb_ids) + self.documents.delete_document(doc_id) + + def 
_delete_all_embeddings(self): + self.store.delete_collection() + self.documents.delete_all_documents() + + def _delete_all_files(self): + """Delete all files in the upload folder.""" + logger.info("[dataprep - del] delete all files") + remove_folder_with_ignore(str(self.upload_folder)) + self._delete_all_embeddings() + logger.info("[dataprep - del] successfully delete all files.") + create_upload_folder(str(self.upload_folder)) + + async def delete_files(self, file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - specific file path (e.g. /path/to/file.txt) + - "all": delete all files uploaded + """ + if file_path == "all": + self._delete_all_files() + logger.info({"status": True}) + return {"status": True} + + # Case when file_path != all + delete_path = self.upload_folder / encode_filename(file_path) + logger.info(f"[dataprep - del] delete_path: {delete_path}") + + if not delete_path.exists(): + raise HTTPException(status_code=404, detail="File/folder not found. 
Please check del_path.") + + if not delete_path.is_file(): + logger.info("[dataprep - del] delete folder is not supported for now.") + logger.info({"status": False}) + return {"status": False} + self._delete_embedding(delete_path) + delete_path.unlink() + logger.info({"status": True}) + return {"status": True} diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index caedafb4ab..4b8e5847bf 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -9,6 +9,7 @@ from fastapi import Body, Depends, File, Form, HTTPException, Request, UploadFile from integrations.arangodb import OpeaArangoDataprep from integrations.elasticsearch import OpeaElasticSearchDataprep +from integrations.mariadb import OpeaMariaDBDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep from integrations.opensearch import OpeaOpenSearchDataprep diff --git a/comps/dataprep/src/requirements.txt b/comps/dataprep/src/requirements.txt index 69d82b2129..3e0bc14091 100644 --- a/comps/dataprep/src/requirements.txt +++ b/comps/dataprep/src/requirements.txt @@ -20,6 +20,7 @@ langchain-arangodb langchain-community langchain-elasticsearch langchain-experimental +langchain-mariadb langchain-openai langchain-pinecone langchain-redis @@ -34,6 +35,7 @@ llama-index-graph-stores-neo4j llama-index-llms-openai llama-index-llms-openai-like markdown +mariadb moviepy neo4j numpy diff --git a/tests/dataprep/test_dataprep_mariadb.sh b/tests/dataprep/test_dataprep_mariadb.sh new file mode 100644 index 0000000000..7765efb417 --- /dev/null +++ b/tests/dataprep/test_dataprep_mariadb.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export 
DATAPREP_PORT="11105" +export TAG="comps" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +source ${SCRIPT_DIR}/dataprep_utils.sh + +function build_docker_images() { + cd $WORKPATH + + # build dataprep image for mariadb + docker build --no-cache -t opea/dataprep:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/src/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/dataprep built fail" + exit 1 + else + echo "opea/dataprep built successful" + fi +} + +function start_service() { + export host_ip=${ip_address} + export EMBEDDING_LENGTH=768 + export MARIADB_PORT=11617 + export DATAPREP_PORT=11618 + export MARIADB_USER=testuser + export MARIADB_PASSWORD=testpwd + export MARIADB_DATABASE=vectordb + export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://${MARIADB_USER}:${MARIADB_PASSWORD}@$host_ip:$MARIADB_PORT/${MARIADB_DATABASE} + export LOGFLAG=True + + service_name="dataprep-mariadb-vector" + + cd $WORKPATH/comps/dataprep/deployment/docker_compose/ + docker compose up ${service_name} -d + + check_healthy "dataprep-mariadb-vector" || exit 1 +} + +function validate_microservice() { + # test /v1/dataprep/ingest upload file + ingest_doc ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_docx ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - docx" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_pdf ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - pdf" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_ppt ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - ppt" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_pptx ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - 
upload - pptx" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_txt ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - txt" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_xlsx ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + # test /v1/dataprep/ingest upload link + ingest_external_link ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + # test /v1/dataprep/get + get_all ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - get" '{"name":' dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + # test /v1/dataprep/delete + delete_all ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - del" '{"status":true}' dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=dataprep-mariadb-vector") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + + cid=$(docker ps -aq --filter "name=mariadb-server") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From b070128a294b677c3d0065daea9c6ffa7158781b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 May 2025 08:00:44 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/README.md | 2 +- comps/dataprep/src/README_mariadb.md | 4 +++- comps/dataprep/src/requirements.txt | 2 +- comps/retrievers/src/README_mariadb.md | 2 ++ comps/retrievers/src/integrations/config.py | 2 +- comps/retrievers/src/requirements.txt | 2 +- comps/third_parties/mariadb/src/README.md | 1 - 7 files changed, 9 insertions(+), 6 deletions(-) diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index cdb226ccc9..b7b6979406 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -63,4 +63,4 @@ For details, please refer to this [readme](src/README_finance.md) ## Dataprep Microservice with MariaDB Vector -For details, please refer to this [readme](src/README_mariadb.md) \ No newline at end of file +For details, please refer to this [readme](src/README_mariadb.md) diff --git a/comps/dataprep/src/README_mariadb.md b/comps/dataprep/src/README_mariadb.md index 6cb2262b4e..0931e78edb 100644 --- a/comps/dataprep/src/README_mariadb.md +++ b/comps/dataprep/src/README_mariadb.md @@ -12,9 +12,11 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil ### 1.2 Run Docker with CLI (Option A) #### 1.2.1 Start MariaDB Server + Please refer to this [readme](../../third_parties/mariadb/src/README.md). 
#### 1.2.2 Start the data preparation service + ```bash export HOST_IP=$(hostname -I | awk '{print $1}') @@ -85,7 +87,7 @@ The `file_path` is the `id` returned by the `/v1/dataprep/get` API. curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" -H "Content-Type: application/json" \ -d '{"file_path": "https://www.ces.tech/.txt"}' - + # delete file curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" -H "Content-Type: application/json" \ diff --git a/comps/dataprep/src/requirements.txt b/comps/dataprep/src/requirements.txt index 3e0bc14091..2c8109d55b 100644 --- a/comps/dataprep/src/requirements.txt +++ b/comps/dataprep/src/requirements.txt @@ -34,8 +34,8 @@ llama-index-embeddings-text-embeddings-inference llama-index-graph-stores-neo4j llama-index-llms-openai llama-index-llms-openai-like -markdown mariadb +markdown moviepy neo4j numpy diff --git a/comps/retrievers/src/README_mariadb.md b/comps/retrievers/src/README_mariadb.md index 37561f7481..bbcbf9981e 100644 --- a/comps/retrievers/src/README_mariadb.md +++ b/comps/retrievers/src/README_mariadb.md @@ -16,10 +16,12 @@ docker build -t opea/retriever:latest --build-arg https_proxy=$https_proxy --bui ### 1.2 Run Docker with CLI (Option A) #### 1.2.1 Start MariaDB Server + Please refer to this [readme](../../third_parties/mariadb/src/README.md). You need to ingest your knowledge documents into the vector database. 
#### 1.2.2 Start the retriever service + ```bash export HOST_IP=$(hostname -I | awk '{print $1}') # If you've configured the server with the default env values then: diff --git a/comps/retrievers/src/integrations/config.py b/comps/retrievers/src/integrations/config.py index 89ed92bbcf..8514192611 100644 --- a/comps/retrievers/src/integrations/config.py +++ b/comps/retrievers/src/integrations/config.py @@ -242,4 +242,4 @@ def format_opensearch_conn_from_env(): # MariaDB Vector # ####################################################### MARIADB_CONNECTION_URL = os.getenv("MARIADB_CONNECTION_URL", "localhost") -MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") \ No newline at end of file +MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") diff --git a/comps/retrievers/src/requirements.txt b/comps/retrievers/src/requirements.txt index f7d79ef704..8e360866c2 100644 --- a/comps/retrievers/src/requirements.txt +++ b/comps/retrievers/src/requirements.txt @@ -12,8 +12,8 @@ graspologic haystack-ai==2.3.1 langchain-arangodb langchain-elasticsearch -langchain-openai langchain-mariadb +langchain-openai langchain-pinecone langchain-vdms>=0.1.4 langchain_community diff --git a/comps/third_parties/mariadb/src/README.md b/comps/third_parties/mariadb/src/README.md index 289d60e8e9..aa6ea5d90f 100644 --- a/comps/third_parties/mariadb/src/README.md +++ b/comps/third_parties/mariadb/src/README.md @@ -3,7 +3,6 @@ **MariaDB Vector** was introduced starting with server version 11.7 For more details please see the [official documentation](https://mariadb.com/kb/en/vectors/). - ## 1. 
Configure the server ```bash From 535f4cd65552211010b9b07fd54773f24a55f93b Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Mon, 5 May 2025 15:02:17 +0300 Subject: [PATCH 5/5] Fix CI failures - md5 is used for the primary key not as a security hash - fixed mariadb readme headers Signed-off-by: Razvan-Liviu Varzaru --- comps/dataprep/src/integrations/mariadb.py | 4 ++-- comps/retrievers/src/README_mariadb.md | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/comps/dataprep/src/integrations/mariadb.py b/comps/dataprep/src/integrations/mariadb.py index 3f2f7b2105..34b0561159 100644 --- a/comps/dataprep/src/integrations/mariadb.py +++ b/comps/dataprep/src/integrations/mariadb.py @@ -242,7 +242,7 @@ async def _save_file_to_local_disk(self, save_path: Path, file): def _store_texts(self, doc_path: str, chunks: list[str], batch_size: int = 32): num_chunks = len(chunks) metadata = [{"doc_name": doc_path}] - doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_id = hashlib.md5(str(doc_path).encode("utf-8"), usedforsecurity=False).hexdigest() doc_emb_ids = [] for i in range(0, num_chunks, batch_size): batch_texts = chunks[i : i + batch_size] @@ -369,7 +369,7 @@ async def get_files(self): return file_content def _delete_embedding(self, doc_path: Path): - doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_id = hashlib.md5(str(doc_path).encode("utf-8"), usedforsecurity=False).hexdigest() doc_emb_ids = self.documents.get_document_emb_ids(doc_id) self.store.delete(ids=doc_emb_ids) self.documents.delete_document(doc_id) diff --git a/comps/retrievers/src/README_mariadb.md b/comps/retrievers/src/README_mariadb.md index bbcbf9981e..03ffdc2872 100644 --- a/comps/retrievers/src/README_mariadb.md +++ b/comps/retrievers/src/README_mariadb.md @@ -6,6 +6,8 @@ The service primarily utilizes similarity measures in vector space to rapidly re Overall, this microservice provides robust backend support for applications requiring 
efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. +## 🚀1. Start Microservice with Docker + ### 1.1 Build Docker Image ```bash