From e3dc87e71bf350a0ecf1c7845668eb26790d71cf Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Tue, 29 Apr 2025 16:51:43 +0300 Subject: [PATCH 1/5] Add MariaDB Vector third-party service MariaDB Vector was introduced since MariaDB Server 11.7 Signed-off-by: Razvan-Liviu Varzaru --- .../deployment/docker_compose/compose.yaml | 21 +++++++++++++++++++ comps/third_parties/mariadb/src/README.md | 20 ++++++++++++++++++ comps/third_parties/mariadb/src/__init__.py | 2 ++ 3 files changed, 43 insertions(+) create mode 100644 comps/third_parties/mariadb/deployment/docker_compose/compose.yaml create mode 100644 comps/third_parties/mariadb/src/README.md create mode 100644 comps/third_parties/mariadb/src/__init__.py diff --git a/comps/third_parties/mariadb/deployment/docker_compose/compose.yaml b/comps/third_parties/mariadb/deployment/docker_compose/compose.yaml new file mode 100644 index 0000000000..b519c981bc --- /dev/null +++ b/comps/third_parties/mariadb/deployment/docker_compose/compose.yaml @@ -0,0 +1,21 @@ +# Copyright (C) 2025 MariaDB Foundation +# SPDX-License-Identifier: Apache-2.0 + +services: + mariadb-server: + container_name: mariadb-server + image: mariadb:latest + ports: + - "${MARIADB_PORT:-3306}:3306" + restart: always + environment: + - MARIADB_DATABASE=${MARIADB_DATABASE:-vectordb} + - MARIADB_USER=${MARIADB_USER:-dbuser} + - MARIADB_PASSWORD=${MARIADB_PASSWORD:-password} + - MARIADB_RANDOM_ROOT_PASSWORD=1 + healthcheck: + test: ["CMD", "healthcheck.sh", "--connect", "--innodb_initialized"] + start_period: 10s + interval: 10s + timeout: 5s + retries: 3 diff --git a/comps/third_parties/mariadb/src/README.md b/comps/third_parties/mariadb/src/README.md new file mode 100644 index 0000000000..289d60e8e9 --- /dev/null +++ b/comps/third_parties/mariadb/src/README.md @@ -0,0 +1,20 @@ +# Start MariaDB Server + +**MariaDB Vector** was introduced starting with server version 11.7 +For more details please see the [official 
documentation](https://mariadb.com/kb/en/vectors/). + + +## 1. Configure the server + +```bash +export MARIADB_CONTAINER_IMAGE="mariadb:latest" +export MARIADB_USER=dbuser +export MARIADB_PASSWORD=password +export MARIADB_DATABASE=vectordb +``` + +## 2. Run MariaDB Server + +```bash +docker run --name mariadb-server -e MARIADB_USER=${MARIADB_USER} -e MARIADB_RANDOM_ROOT_PASSWORD=1 -e MARIADB_DATABASE=${MARIADB_DATABASE} -e MARIADB_PASSWORD=${MARIADB_PASSWORD} -d -p 3306:3306 ${MARIADB_CONTAINER_IMAGE} +``` diff --git a/comps/third_parties/mariadb/src/__init__.py b/comps/third_parties/mariadb/src/__init__.py new file mode 100644 index 0000000000..49c2a10929 --- /dev/null +++ b/comps/third_parties/mariadb/src/__init__.py @@ -0,0 +1,2 @@ +# Copyright (C) 2025 MariaDB Foundation +# SPDX-License-Identifier: Apache-2.0 From 05d41a73eb625389bb45bc3439d7e88e6e93ba83 Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Tue, 29 Apr 2025 16:53:59 +0300 Subject: [PATCH 2/5] Add retriever MariaDB Vector integration Signed-off-by: Razvan-Liviu Varzaru --- comps/retrievers/README.md | 4 + .../deployment/docker_compose/compose.yaml | 12 ++ comps/retrievers/src/Dockerfile | 4 +- comps/retrievers/src/README_mariadb.md | 58 +++++++ comps/retrievers/src/integrations/config.py | 6 + comps/retrievers/src/integrations/mariadb.py | 153 ++++++++++++++++++ .../src/opea_retrievers_microservice.py | 1 + comps/retrievers/src/requirements.txt | 2 + tests/retrievers/test_retrievers_mariadb.sh | 84 ++++++++++ 9 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 comps/retrievers/src/README_mariadb.md create mode 100644 comps/retrievers/src/integrations/mariadb.py create mode 100644 tests/retrievers/test_retrievers_mariadb.sh diff --git a/comps/retrievers/README.md b/comps/retrievers/README.md index 9cec099365..d68a450252 100644 --- a/comps/retrievers/README.md +++ b/comps/retrievers/README.md @@ -41,3 +41,7 @@ For details, please refer to this [readme](src/README_neo4j.md) ## 
Retriever Microservice with Pathway For details, please refer to this [readme](src/README_pathway.md) + +## Retriever Microservice with MariaDB Vector + +For details, please refer to this [readme](src/README_mariadb.md) diff --git a/comps/retrievers/deployment/docker_compose/compose.yaml b/comps/retrievers/deployment/docker_compose/compose.yaml index a1cbd5e7bd..85261e8047 100644 --- a/comps/retrievers/deployment/docker_compose/compose.yaml +++ b/comps/retrievers/deployment/docker_compose/compose.yaml @@ -14,6 +14,7 @@ include: - ../../../third_parties/tgi/deployment/docker_compose/compose.yaml - ../../../third_parties/vdms/deployment/docker_compose/compose.yaml - ../../../third_parties/arangodb/deployment/docker_compose/compose.yaml + - ../../../third_parties/mariadb/deployment/docker_compose/compose.yaml services: retriever: @@ -225,6 +226,17 @@ services: arango-vector-db: condition: service_healthy + retriever-mariadb-vector: + extends: retriever + container_name: retriever-mariadb-vector + environment: + RETRIEVER_COMPONENT_NAME: ${RETRIEVER_COMPONENT_NAME:-OPEA_RETRIEVER_MARIADBVECTOR} + MARIADB_CONNECTION_URL: ${MARIADB_CONNECTION_URL:-mariadb+mariadbconnector://dbuser:password@mariadb-server:3306/vectordb} + LOGFLAG: ${LOGFLAG} + depends_on: + mariadb-server: + condition: service_healthy + networks: default: driver: bridge diff --git a/comps/retrievers/src/Dockerfile b/comps/retrievers/src/Dockerfile index 7f4580d361..a5ae329e66 100644 --- a/comps/retrievers/src/Dockerfile +++ b/comps/retrievers/src/Dockerfile @@ -9,7 +9,9 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin libcairo2 \ libgl1-mesa-glx \ libglib2.0-0 \ - libjemalloc-dev + libjemalloc-dev \ + libmariadb-dev \ + build-essential RUN useradd -m -s /bin/bash user && \ mkdir -p /home/user && \ diff --git a/comps/retrievers/src/README_mariadb.md b/comps/retrievers/src/README_mariadb.md new file mode 100644 index 0000000000..37561f7481 --- /dev/null +++ 
b/comps/retrievers/src/README_mariadb.md @@ -0,0 +1,58 @@ +# Retriever Microservice + +This retriever microservice is a highly efficient search service designed for handling and retrieving embedding vectors. It operates by receiving an embedding vector as input and conducting a similarity search against vectors stored in a VectorDB database. Users must specify the VectorDB's URL and the index name, and the service searches within that index to find documents with the highest similarity to the input vector. + +The service primarily utilizes similarity measures in vector space to rapidly retrieve contentually similar documents. The vector-based retrieval approach is particularly suited for handling large datasets, offering fast and accurate search results that significantly enhance the efficiency and quality of information retrieval. + +Overall, this microservice provides robust backend support for applications requiring efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. + +### 1.1 Build Docker Image + +```bash +cd GenAIComps +docker build -t opea/retriever:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile . +``` + +### 1.2 Run Docker with CLI (Option A) + +#### 1.2.1 Start MariaDB Server +Please refer to this [readme](../../third_parties/mariadb/src/README.md). +You need to ingest your knowledge documents into the vector database. 
+
+#### 1.2.2 Start the retriever service
+```bash
+export HOST_IP=$(hostname -I | awk '{print $1}')
+# If you've configured the server with the default env values then:
+export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://dbuser:password@${HOST_IP}:3306/vectordb
+
+docker run -d --rm --name="retriever-mariadb-vector" -p 7000:7000 --ipc=host -e MARIADB_CONNECTION_URL=$MARIADB_CONNECTION_URL -e RETRIEVER_COMPONENT_NAME="OPEA_RETRIEVER_MARIADBVECTOR" opea/retriever:latest
+```
+
+### 1.3 Run with Docker Compose (Option B)
+
+```bash
+cd comps/retrievers/deployment/docker_compose
+docker compose -f compose.yaml up retriever-mariadb-vector -d
+```
+
+## 🚀2. Consume Retriever Service
+
+### 2.1 Check Service Status
+
+```bash
+curl http://${HOST_IP}:7000/v1/health_check \
+  -X GET \
+  -H 'Content-Type: application/json'
+```
+
+### 2.2 Consume Embedding Service
+
+To consume the Retriever Microservice, you can generate a mock embedding vector of length 768 with Python.
+
+```bash
+export your_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+curl http://${HOST_IP}:7000/v1/retrieval \
+  -X POST \
+  -d "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${your_embedding}}" \
+  -H 'Content-Type: application/json'
+```
diff --git a/comps/retrievers/src/integrations/config.py b/comps/retrievers/src/integrations/config.py
index 95eb7c16ce..89ed92bbcf 100644
--- a/comps/retrievers/src/integrations/config.py
+++ b/comps/retrievers/src/integrations/config.py
@@ -237,3 +237,9 @@ def format_opensearch_conn_from_env():
 OPENAI_EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
 OPENAI_CHAT_ENABLED = os.getenv("OPENAI_CHAT_ENABLED", "true").lower() == "true"
 OPENAI_EMBED_ENABLED = os.getenv("OPENAI_EMBED_ENABLED", "true").lower() == "true"
+
+#######################################################
+# MariaDB Vector                                      #
+#######################################################
+MARIADB_CONNECTION_URL = os.getenv("MARIADB_CONNECTION_URL", "localhost") +MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") \ No newline at end of file diff --git a/comps/retrievers/src/integrations/mariadb.py b/comps/retrievers/src/integrations/mariadb.py new file mode 100644 index 0000000000..df997e7c47 --- /dev/null +++ b/comps/retrievers/src/integrations/mariadb.py @@ -0,0 +1,153 @@ +# Copyright (C) 2025 MariaDB Foundation +# SPDX-License-Identifier: Apache-2.0 + + +import os +from urllib.parse import urlparse + +import mariadb +from fastapi import HTTPException +from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_mariadb.vectorstores import MariaDBStore, MariaDBStoreSettings + +from comps import CustomLogger, EmbedDoc, OpeaComponent, OpeaComponentRegistry, ServiceType + +from .config import ( + EMBED_MODEL, + HUGGINGFACEHUB_API_TOKEN, + MARIADB_COLLECTION_NAME, + MARIADB_CONNECTION_URL, + TEI_EMBEDDING_ENDPOINT, +) + + +class NullLogger: + def info(self, *args, **kwargs): + pass + + def debug(self, *args, **kwargs): + pass + + def warning(self, *args, **kwargs): + pass + + def error(self, *args, **kwargs): + pass + + def critical(self, *args, **kwargs): + pass + + def exception(self, *args, **kwargs): + pass + + +logger = CustomLogger("mariadbvector_retrievers") +logflag = os.getenv("LOGFLAG", False) +if not logflag: + logger = NullLogger() + + +@OpeaComponentRegistry.register("OPEA_RETRIEVER_MARIADBVECTOR") +class OpeaMARIADBVectorRetriever(OpeaComponent): + """A specialized retriever component derived from OpeaComponent for mariadb vector retriever services. + + Attributes: + client (MariaDBStore): An instance of the MariaDBStore client for vector database operations. 
+ """ + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, ServiceType.RETRIEVER.name.lower(), description, config) + + args = urlparse(MARIADB_CONNECTION_URL) + + self.conn_args = { + "host": args.hostname, + "port": args.port, + "user": args.username, + "password": args.password, + "database": args.path[1:], + } + + self.embedder = self._initialize_embedder() + + health_status = self.check_health() + if not health_status: + logger.error("OpeaMARIADBVectorRetriever health check failed.") + + self.store = self._initialize_client() + + def _initialize_embedder(self): + if TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + logger.info(f"[ init embedder ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + if not HUGGINGFACEHUB_API_TOKEN: + raise HTTPException( + status_code=400, + detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.", + ) + import requests + + response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info") + if response.status_code != 200: + raise HTTPException( + status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available." 
+ ) + model_id = response.json()["model_id"] + embeddings = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT + ) + else: + # create embeddings using local embedding model + logger.info(f"[ init embedder ] LOCAL_EMBEDDING_MODEL:{EMBED_MODEL}") + embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL) + return embeddings + + def _initialize_client(self) -> MariaDBStore: + store = MariaDBStore( + embeddings=self.embedder, + collection_name=MARIADB_COLLECTION_NAME, + datasource=MARIADB_CONNECTION_URL, + config=MariaDBStoreSettings(lazy_init=True), + ) + return store + + def check_health(self) -> bool: + """Checks mariadb server health.""" + try: + connection = mariadb.connect(**self.conn_args) + return True + except mariadb.Error as e: + logger.error(f"Error connect to MariaDB Server: {e}") + return False + + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + return False + finally: + try: + connection.close() + except Exception as e: + logger.error(f"Error closing connection: {e}") + + async def invoke(self, input: EmbedDoc) -> list: + """Search the MariaDB Vector index for the most similar documents to the input query. + + Args: + input (EmbedDoc): The input query to search for. + Output: + list: The retrieved documents. 
+ """ + logger.info(f"[ similarity search ] input: {input}") + + result = [] + try: + result = await self.store.asimilarity_search_by_vector(embedding=input.embedding) + logger.info(f"[ similarity search ] search result: {result}") + return result + except mariadb.Error as e: + logger.error(f"A database error occurred during similarity search: {e}") + raise HTTPException(status_code=500, detail="A database error occurred during similarity search") + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + raise HTTPException(status_code=500, detail="An unexpected error occurred") diff --git a/comps/retrievers/src/opea_retrievers_microservice.py b/comps/retrievers/src/opea_retrievers_microservice.py index 54015d3a20..2d3bbf7873 100644 --- a/comps/retrievers/src/opea_retrievers_microservice.py +++ b/comps/retrievers/src/opea_retrievers_microservice.py @@ -10,6 +10,7 @@ # import for retrievers component registration from integrations.elasticsearch import OpeaElasticsearchRetriever +from integrations.mariadb import OpeaMARIADBVectorRetriever from integrations.milvus import OpeaMilvusRetriever from integrations.neo4j import OpeaNeo4jRetriever from integrations.opensearch import OpeaOpensearchRetriever diff --git a/comps/retrievers/src/requirements.txt b/comps/retrievers/src/requirements.txt index 9b27448dd9..f7d79ef704 100644 --- a/comps/retrievers/src/requirements.txt +++ b/comps/retrievers/src/requirements.txt @@ -13,6 +13,7 @@ haystack-ai==2.3.1 langchain-arangodb langchain-elasticsearch langchain-openai +langchain-mariadb langchain-pinecone langchain-vdms>=0.1.4 langchain_community @@ -25,6 +26,7 @@ llama-index-llms-openai llama-index-llms-openai-like llama-index-llms-text-generation-inference llama_index_graph_stores_neo4j +mariadb neo4j numpy opensearch-py diff --git a/tests/retrievers/test_retrievers_mariadb.sh b/tests/retrievers/test_retrievers_mariadb.sh new file mode 100644 index 0000000000..84785f4ac6 --- /dev/null +++ 
b/tests/retrievers/test_retrievers_mariadb.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Copyright (C) 2024 Intel Corporation
+# SPDX-License-Identifier: Apache-2.0
+
+set -x
+
+IMAGE_REPO=${IMAGE_REPO:-"opea"}
+export REGISTRY=${IMAGE_REPO}
+export TAG="comps"
+echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}"
+echo "TAG=${TAG}"
+
+WORKPATH=$(dirname "$PWD")
+LOG_PATH="$WORKPATH/tests"
+export host_ip=$(hostname -I | awk '{print $1}')
+service_name="retriever-mariadb-vector"
+
+function build_docker_images() {
+    cd $WORKPATH
+    docker build --no-cache -t ${REGISTRY:-opea}/retriever:${TAG:-latest} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/retrievers/src/Dockerfile .
+    if [ $? -ne 0 ]; then
+        echo "opea/retriever built fail"
+        exit 1
+    else
+        echo "opea/retriever built successful"
+    fi
+}
+
+function start_service() {
+    export MARIADB_PORT=11617
+    export RETRIEVER_PORT=11618
+    export MARIADB_USER=testuser
+    export MARIADB_PASSWORD=testpwd
+    export MARIADB_DATABASE=vectordb
+    export HF_TOKEN=${HF_TOKEN}
+    export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://${MARIADB_USER}:${MARIADB_PASSWORD}@$host_ip:$MARIADB_PORT/${MARIADB_DATABASE}
+    export LOGFLAG=True
+
+    cd $WORKPATH/comps/retrievers/deployment/docker_compose
+    docker compose -f compose.yaml up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log
+
+    sleep 1m
+}
+
+function validate_microservice() {
+    test_embedding=$(python -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)")
+
+    result=$(http_proxy='' \
+    curl http://${host_ip}:$RETRIEVER_PORT/v1/retrieval \
+        -X POST \
+        -d "{\"text\":\"test\",\"embedding\":${test_embedding}}" \
+        -H 'Content-Type: application/json')
+    if [[ $result == *"retrieved_docs"* ]]; then
+        echo "Result correct."
+    else
+        echo "Result wrong. 
Received was $result" + docker logs mariadb-server >> ${LOG_PATH}/vectorstore.log + docker logs ${service_name} >> ${LOG_PATH}/retriever-mariadb-vector.log + exit 1 + fi +} + +function stop_docker() { + cd $WORKPATH/comps/retrievers/deployment/docker_compose + docker compose -f compose.yaml down --remove-orphans + cid=$(docker ps -aq --filter "name=mariadb-server") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From a0c00fd4ced3c72d5ef4088b419bb8ddd5ff4e9f Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Tue, 29 Apr 2025 16:55:42 +0300 Subject: [PATCH 3/5] Add dataprep MariaDB Vector integration Signed-off-by: Razvan-Liviu Varzaru --- comps/dataprep/README.md | 4 + .../deployment/docker_compose/compose.yaml | 24 + comps/dataprep/src/Dockerfile | 1 + comps/dataprep/src/README_mariadb.md | 98 +++++ comps/dataprep/src/integrations/mariadb.py | 415 ++++++++++++++++++ .../src/opea_dataprep_microservice.py | 1 + comps/dataprep/src/requirements.txt | 2 + tests/dataprep/test_dataprep_mariadb.sh | 106 +++++ 8 files changed, 651 insertions(+) create mode 100644 comps/dataprep/src/README_mariadb.md create mode 100644 comps/dataprep/src/integrations/mariadb.py create mode 100644 tests/dataprep/test_dataprep_mariadb.sh diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index 7cba93f0a0..cdb226ccc9 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -60,3 +60,7 @@ For details, please refer to this [readme](src/README_neo4j_llamaindex.md) ## Dataprep Microservice for financial domain data For details, please refer to this [readme](src/README_finance.md) + +## Dataprep Microservice with MariaDB Vector + +For details, please refer to this [readme](src/README_mariadb.md) \ No newline at end of file diff --git 
a/comps/dataprep/deployment/docker_compose/compose.yaml b/comps/dataprep/deployment/docker_compose/compose.yaml index c44fdb818f..481f63778c 100644 --- a/comps/dataprep/deployment/docker_compose/compose.yaml +++ b/comps/dataprep/deployment/docker_compose/compose.yaml @@ -15,6 +15,7 @@ include: - ../../../third_parties/tei/deployment/docker_compose/compose.yaml - ../../../third_parties/vllm/deployment/docker_compose/compose.yaml - ../../../third_parties/arangodb/deployment/docker_compose/compose.yaml + - ../../../third_parties/mariadb/deployment/docker_compose/compose.yaml services: @@ -414,6 +415,29 @@ services: retries: 10 restart: unless-stopped + dataprep-mariadb-vector: + image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} + container_name: dataprep-mariadb-vector + ports: + - "${DATAPREP_PORT:-5000}:5000" + depends_on: + mariadb-server: + condition: service_healthy + ipc: host + environment: + no_proxy: ${no_proxy} + http_proxy: ${http_proxy} + https_proxy: ${https_proxy} + DATAPREP_COMPONENT_NAME: "OPEA_DATAPREP_MARIADBVECTOR" + MARIADB_CONNECTION_URL: ${MARIADB_CONNECTION_URL:-mariadb+mariadbconnector://dbuser:password@mariadb-server:3306/vectordb} + LOGFLAG: ${LOGFLAG} + healthcheck: + test: ["CMD-SHELL", "curl -f http://localhost:5000/v1/health_check || exit 1"] + interval: 10s + timeout: 5s + retries: 10 + restart: unless-stopped + networks: default: driver: bridge diff --git a/comps/dataprep/src/Dockerfile b/comps/dataprep/src/Dockerfile index a344066ba0..eba0288012 100644 --- a/comps/dataprep/src/Dockerfile +++ b/comps/dataprep/src/Dockerfile @@ -13,6 +13,7 @@ RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missin libcairo2 \ libgl1-mesa-glx \ libjemalloc-dev \ + libmariadb-dev \ libpq-dev \ libreoffice \ poppler-utils \ diff --git a/comps/dataprep/src/README_mariadb.md b/comps/dataprep/src/README_mariadb.md new file mode 100644 index 0000000000..6cb2262b4e --- /dev/null +++ b/comps/dataprep/src/README_mariadb.md @@ -0,0 +1,98 @@ 
+# Dataprep Microservice with MariaDB Vector
+
+## 🚀1. Start Microservice with Docker
+
+### 1.1 Build Docker Image
+
+```bash
+cd GenAIComps
+docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f comps/dataprep/src/Dockerfile .
+```
+
+### 1.2 Run Docker with CLI (Option A)
+
+#### 1.2.1 Start MariaDB Server
+Please refer to this [readme](../../third_parties/mariadb/src/README.md).
+
+#### 1.2.2 Start the data preparation service
+```bash
+
+export HOST_IP=$(hostname -I | awk '{print $1}')
+# If you've configured the server with the default env values then:
+export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://dbuser:password@${HOST_IP}:3306/vectordb
+
+docker run -d --rm --name="dataprep-mariadb-vector" -p 5000:5000 --ipc=host -e MARIADB_CONNECTION_URL=$MARIADB_CONNECTION_URL -e DATAPREP_COMPONENT_NAME="OPEA_DATAPREP_MARIADBVECTOR" opea/dataprep:latest
+```
+
+### 1.3 Run with Docker Compose (Option B)
+
+```bash
+cd comps/dataprep/deployment/docker_compose
+docker compose -f compose.yaml up dataprep-mariadb-vector -d
+```
+
+## 🚀2. Consume Microservice
+
+### 2.1 Consume Upload API
+
+Once the data preparation microservice for MariaDB Vector is started, one can use the below command to invoke the microservice to convert documents/links to embeddings and save them to the vector store.
+
+```bash
+export document="/path/to/document"
+curl -X POST \
+    -H "Content-Type: application/json" \
+    -d "{\"path\":\"${document}\"}" \
+    http://localhost:5000/v1/dataprep/ingest
+```
+
+### 2.2 Consume get API
+
+To get the structure of the uploaded files, use the `get` API endpoint:
+
+```bash
+curl -X POST \
+    -H "Content-Type: application/json" \
+    http://localhost:5000/v1/dataprep/get
+```
+
+A JSON formatted response similar to the one below will follow:
+
+```json
+[
+  {
+    "name": "uploaded_file_1.txt",
+    "id": "uploaded_file_1.txt",
+    "type": "File",
+    "parent": ""
+  },
+  {
+    "name": "uploaded_file_2.txt",
+    "id": "uploaded_file_2.txt",
+    "type": "File",
+    "parent": ""
+  }
+]
+```
+
+### 2.3 Consume delete API
+
+To delete uploaded files/links, use the `delete` API endpoint.
+
+The `file_path` is the `id` returned by the `/v1/dataprep/get` API.
+
+```bash
+# delete link
+curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" \
+    -H "Content-Type: application/json" \
+    -d '{"file_path": "https://www.ces.tech/.txt"}'
+
+# delete file
+curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" \
+    -H "Content-Type: application/json" \
+    -d '{"file_path": "uploaded_file_1.txt"}'
+
+# delete all files and links
+curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" \
+    -H "Content-Type: application/json" \
+    -d '{"file_path": "all"}'
+```
diff --git a/comps/dataprep/src/integrations/mariadb.py b/comps/dataprep/src/integrations/mariadb.py
new file mode 100644
index 0000000000..3f2f7b2105
--- /dev/null
+++ b/comps/dataprep/src/integrations/mariadb.py
@@ -0,0 +1,415 @@
+# Copyright (C) 2025 MariaDB Foundation
+# SPDX-License-Identifier: Apache-2.0
+
+import hashlib
+import json
+import os
+from pathlib import Path
+from typing import (
+    List,
+    Optional,
+    Union,
+)
+from urllib.parse import urlparse
+
+import mariadb
+from fastapi import Body, File, Form, HTTPException, UploadFile
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from 
langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings +from langchain_huggingface import HuggingFaceEmbeddings +from langchain_mariadb.vectorstores import MariaDBStore, MariaDBStoreSettings + +from comps import CustomLogger, DocPath, OpeaComponent, OpeaComponentRegistry, ServiceType +from comps.cores.proto.api_protocol import DataprepRequest +from comps.dataprep.src.utils import ( + create_upload_folder, + document_loader, + encode_filename, + get_file_structure, + get_separators, + parse_html_new, + remove_folder_with_ignore, + save_content_to_local_disk, +) + + +# A no-op logger that does nothing +class NullLogger: + def info(self, *args, **kwargs): + pass + + def debug(self, *args, **kwargs): + pass + + def warning(self, *args, **kwargs): + pass + + def error(self, *args, **kwargs): + pass + + def critical(self, *args, **kwargs): + pass + + def exception(self, *args, **kwargs): + pass + + +logger = CustomLogger("opea_dataprep_mariadbvector") +logflag = os.getenv("LOGFLAG", False) +if not logflag: + logger = NullLogger() + +# Embedding model +EMBED_MODEL = os.getenv("EMBED_MODEL", "BAAI/bge-base-en-v1.5") +# TEI Embedding endpoints +TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "") +# Huggingface API token for TEI embedding endpoint +HUGGINGFACEHUB_API_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN", "") + +MARIADB_CONNECTION_URL = os.getenv("MARIADB_CONNECTION_URL", "localhost") + +# Vector Index Configuration +MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") + +# chunk parameters +CHUNK_SIZE = os.getenv("CHUNK_SIZE", 1500) +CHUNK_OVERLAP = os.getenv("CHUNK_OVERLAP", 100) + + +class DocumentsTable: + """Table for storing documents.""" + + def __init__(self, conn_args): + self._table_name = "langchain_documents" + self.conn_args = conn_args + self.__post__init__() + + def __post__init__(self): + self.create_table_if_not_exists() + + def create_table_if_not_exists(self): + """Create the documents 
table if it does not exist.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute( + f""" + CREATE TABLE IF NOT EXISTS {self._table_name} ( + id VARCHAR(32) PRIMARY KEY, + name TEXT, + embedding_ids JSON + ) + """ + ) + connection.commit() + cursor.close() + connection.close() + + def insert_document_ids(self, id: str, name: str, embedding_ids: list): + """Insert a document into the documents table.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute( + f"INSERT INTO {self._table_name} (id, name, embedding_ids) VALUES (?, ?, ?)", + (id, name, json.dumps(embedding_ids)), + ) + connection.commit() + cursor.close() + connection.close() + + def delete_document(self, id: str): + """Delete a document from the documents table.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute(f"DELETE FROM {self._table_name} WHERE id = ?", (id,)) + connection.commit() + cursor.close() + connection.close() + + def delete_all_documents(self): + """Delete all documents from the documents table.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute(f"DELETE FROM {self._table_name}") + connection.commit() + cursor.close() + connection.close() + + def get_document_emb_ids(self, id: str): + """Get the embedding ids for a document.""" + connection = mariadb.connect(**self.conn_args) + cursor = connection.cursor() + cursor.execute(f"SELECT embedding_ids FROM {self._table_name} WHERE id = ?", (id,)) + result = cursor.fetchone() + cursor.close() + connection.close() + if result: + return json.loads(result[0]) + return None + + +@OpeaComponentRegistry.register("OPEA_DATAPREP_MARIADBVECTOR") +class OpeaMariaDBDataprep(OpeaComponent): + """Dataprep component for MariaDBStore ingestion and search services.""" + + def __init__(self, name: str, description: str, config: dict = None): + super().__init__(name, 
ServiceType.DATAPREP.name.lower(), description, config) + args = urlparse(MARIADB_CONNECTION_URL) + + self.conn_args = { + "host": args.hostname, + "port": args.port, + "user": args.username, + "password": args.password, + "database": args.path[1:], + } + + self.upload_folder = Path("./uploaded_files/") + self.embedder = self._initialize_embedder() + + # Perform health check + health_status = self.check_health() + if not health_status: + logger.error("OpeaMariaDBDataprep health check failed.") + + self.store = self._initialize_client() + self.documents = DocumentsTable(self.conn_args) + + def _initialize_embedder(self): + if TEI_EMBEDDING_ENDPOINT: + # create embeddings using TEI endpoint service + logger.info(f"[ init embedder ] TEI_EMBEDDING_ENDPOINT:{TEI_EMBEDDING_ENDPOINT}") + if not HUGGINGFACEHUB_API_TOKEN: + raise HTTPException( + status_code=400, + detail="You MUST offer the `HUGGINGFACEHUB_API_TOKEN` when using `TEI_EMBEDDING_ENDPOINT`.", + ) + import requests + + response = requests.get(TEI_EMBEDDING_ENDPOINT + "/info") + if response.status_code != 200: + raise HTTPException( + status_code=400, detail=f"TEI embedding endpoint {TEI_EMBEDDING_ENDPOINT} is not available." 
+ ) + model_id = response.json()["model_id"] + embeddings = HuggingFaceInferenceAPIEmbeddings( + api_key=HUGGINGFACEHUB_API_TOKEN, model_name=model_id, api_url=TEI_EMBEDDING_ENDPOINT + ) + else: + # create embeddings using local embedding model + logger.info(f"[ init embedder ] LOCAL_EMBEDDING_MODEL:{EMBED_MODEL}") + embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL) + return embeddings + + def _initialize_client(self) -> MariaDBStore: + store = MariaDBStore( + embeddings=self.embedder, + collection_name=MARIADB_COLLECTION_NAME, + datasource=MARIADB_CONNECTION_URL, + config=MariaDBStoreSettings(lazy_init=True), + ) + return store + + def check_health(self) -> bool: + """Checks mariadb server health.""" + try: + connection = mariadb.connect(**self.conn_args) + return True + except mariadb.Error as e: + logger.error(f"Error connect to MariaDB Server: {e}") + return False + + except Exception as e: + logger.error(f"An unexpected error occurred: {e}") + return False + finally: + try: + connection.close() + except Exception as e: + logger.error(f"Error closing connection: {e}") + + def invoke(self, *args, **kwargs): + pass + + async def _save_file_to_local_disk(self, save_path: Path, file): + with save_path.open("wb") as fout: + try: + content = await file.read() + fout.write(content) + except Exception as e: + logger.error(f"Write file failed. Exception: {e}") + raise HTTPException(status_code=500, detail=f"Write file {save_path} failed. 
Exception: {e}") + + def _store_texts(self, doc_path: str, chunks: list[str], batch_size: int = 32): + num_chunks = len(chunks) + metadata = [{"doc_name": doc_path}] + doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_emb_ids = [] + for i in range(0, num_chunks, batch_size): + batch_texts = chunks[i : i + batch_size] + batch_ids = self.store.add_texts( + texts=batch_texts, + metadatas=metadata * len(batch_texts), + ) + doc_emb_ids.extend(batch_ids) + self.documents.insert_document_ids(id=doc_id, name=doc_path, embedding_ids=doc_emb_ids) + if logflag: + logger.info(f"Processed batch {i // batch_size + 1} / {(num_chunks - 1) // batch_size + 1}") + + async def _ingest_doc_to_mariadb(self, path: str): + """Ingest document to mariadb.""" + doc_path = DocPath(path=path).path + logger.info(f"Parsing document {doc_path}.") + + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators() + ) + + content = await document_loader(doc_path) + + structured_types = [".xlsx", ".csv", ".json", "jsonl"] + _, ext = os.path.splitext(doc_path) + + if ext in structured_types: + chunks = content + else: + chunks = text_splitter.split_text(content) + + logger.info(f"Done preprocessing. 
Created {len(chunks)} chunks of the original file.") + + self._store_texts(doc_path, chunks) + return True + + async def _ingest_link_to_mariadb(self, link_list: List[str]): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True, separators=get_separators() + ) + + for link in link_list: + content = parse_html_new([link], chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP) + logger.info(f"[ ingest link ] link: {link} content: {content}") + encoded_link = encode_filename(link) + save_path = self.upload_folder / (encoded_link + ".txt") + doc_path = self.upload_folder / (link + ".txt") + logger.info(f"[ ingest link ] save_path: {save_path}") + await save_content_to_local_disk(str(save_path), content) + + chunks = text_splitter.split_text(content) + self._store_texts(str(doc_path), chunks) + return True + + async def ingest_files( + self, + input: DataprepRequest, + ): + """Ingest files/links content into database. + + Save in the format of vector[768]. + Returns '{"status": 200, "message": "Data preparation succeeded"}' if successful. + Args: + input (DataprepRequest): Model containing the following parameters: + files (Union[UploadFile, List[UploadFile]], optional): A file or a list of files to be ingested. Defaults to File(None). + link_list (str, optional): A list of links to be ingested. Defaults to Form(None). + chunk_size (int, optional): The size of the chunks to be split. Defaults to Form(1500). + chunk_overlap (int, optional): The overlap between chunks. Defaults to Form(100). + process_table (bool, optional): Whether to process tables in PDFs. Defaults to Form(False). + table_strategy (str, optional): The strategy to process tables in PDFs. Defaults to Form("fast"). 
+ """ + files = input.files + link_list = input.link_list + + logger.info(f"files:{files}") + logger.info(f"link_list:{link_list}") + if files and link_list: + raise HTTPException(status_code=400, detail="Provide either a file or a string list, not both.") + + if not files and not link_list: + raise HTTPException(status_code=400, detail="Must provide either a file or a string list.") + + if files: + if not isinstance(files, list): + files = [files] + + self.upload_folder.mkdir(parents=True, exist_ok=True) + for file in files: + save_path = self.upload_folder / file.filename + await self._save_file_to_local_disk(save_path, file) + await self._ingest_doc_to_mariadb(str(save_path)) + logger.info(f"Successfully saved file {save_path}") + + if link_list: + try: + link_list = json.loads(link_list) # Parse JSON string to list + if not isinstance(link_list, list): + raise HTTPException(status_code=400, detail="link_list should be a list.") + await self._ingest_link_to_mariadb(link_list) + logger.info(f"Successfully saved link list {link_list}") + except json.JSONDecodeError: + raise HTTPException(status_code=400, detail="Invalid JSON format for link_list.") + + result = {"status": 200, "message": "Data preparation succeeded"} + logger.info(result) + return result + + async def get_files(self): + """Get file structure from database in the format of + { + "name": "File Name", + "id": "File Name", + "type": "File", + "parent": "", + }""" + logger.info("[ dataprep - get file ] start to get file structure") + + if not self.upload_folder.exists(): + logger.info("No file uploaded, return empty list.") + return [] + + file_content = get_file_structure(str(self.upload_folder)) + logger.info(file_content) + return file_content + + def _delete_embedding(self, doc_path: Path): + doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_emb_ids = self.documents.get_document_emb_ids(doc_id) + self.store.delete(ids=doc_emb_ids) + self.documents.delete_document(doc_id) + + def 
_delete_all_embeddings(self): + self.store.delete_collection() + self.documents.delete_all_documents() + + def _delete_all_files(self): + """Delete all files in the upload folder.""" + logger.info("[dataprep - del] delete all files") + remove_folder_with_ignore(str(self.upload_folder)) + self._delete_all_embeddings() + logger.info("[dataprep - del] successfully delete all files.") + create_upload_folder(str(self.upload_folder)) + + async def delete_files(self, file_path: str = Body(..., embed=True)): + """Delete file according to `file_path`. + + `file_path`: + - specific file path (e.g. /path/to/file.txt) + - "all": delete all files uploaded + """ + if file_path == "all": + self._delete_all_files() + logger.info({"status": True}) + return {"status": True} + + # Case when file_path != all + delete_path = self.upload_folder / encode_filename(file_path) + logger.info(f"[dataprep - del] delete_path: {delete_path}") + + if not delete_path.exists(): + raise HTTPException(status_code=404, detail="File/folder not found. 
Please check del_path.") + + if not delete_path.is_file(): + logger.info("[dataprep - del] delete folder is not supported for now.") + logger.info({"status": False}) + return {"status": False} + self._delete_embedding(delete_path) + delete_path.unlink() + logger.info({"status": True}) + return {"status": True} diff --git a/comps/dataprep/src/opea_dataprep_microservice.py b/comps/dataprep/src/opea_dataprep_microservice.py index caedafb4ab..4b8e5847bf 100644 --- a/comps/dataprep/src/opea_dataprep_microservice.py +++ b/comps/dataprep/src/opea_dataprep_microservice.py @@ -9,6 +9,7 @@ from fastapi import Body, Depends, File, Form, HTTPException, Request, UploadFile from integrations.arangodb import OpeaArangoDataprep from integrations.elasticsearch import OpeaElasticSearchDataprep +from integrations.mariadb import OpeaMariaDBDataprep from integrations.milvus import OpeaMilvusDataprep from integrations.neo4j_llamaindex import OpeaNeo4jLlamaIndexDataprep from integrations.opensearch import OpeaOpenSearchDataprep diff --git a/comps/dataprep/src/requirements.txt b/comps/dataprep/src/requirements.txt index 69d82b2129..3e0bc14091 100644 --- a/comps/dataprep/src/requirements.txt +++ b/comps/dataprep/src/requirements.txt @@ -20,6 +20,7 @@ langchain-arangodb langchain-community langchain-elasticsearch langchain-experimental +langchain-mariadb langchain-openai langchain-pinecone langchain-redis @@ -34,6 +35,7 @@ llama-index-graph-stores-neo4j llama-index-llms-openai llama-index-llms-openai-like markdown +mariadb moviepy neo4j numpy diff --git a/tests/dataprep/test_dataprep_mariadb.sh b/tests/dataprep/test_dataprep_mariadb.sh new file mode 100644 index 0000000000..7765efb417 --- /dev/null +++ b/tests/dataprep/test_dataprep_mariadb.sh @@ -0,0 +1,106 @@ +#!/bin/bash +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +set -x + +WORKPATH=$(dirname "$PWD") +LOG_PATH="$WORKPATH/tests" +ip_address=$(hostname -I | awk '{print $1}') +export 
DATAPREP_PORT="11105" +export TAG="comps" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" +source ${SCRIPT_DIR}/dataprep_utils.sh + +function build_docker_images() { + cd $WORKPATH + + # build dataprep image for mariadb + docker build --no-cache -t opea/dataprep:${TAG} --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy -f $WORKPATH/comps/dataprep/src/Dockerfile . + if [ $? -ne 0 ]; then + echo "opea/dataprep built fail" + exit 1 + else + echo "opea/dataprep built successful" + fi +} + +function start_service() { + export host_ip=${ip_address} + export EMBEDDING_LENGTH=768 + export MARIADB_PORT=11617 + export DATAPREP_PORT=11618 + export MARIADB_USER=testuser + export MARIADB_PASSWORD=testpwd + export MARIADB_DATABASE=vectordb + export MARIADB_CONNECTION_URL=mariadb+mariadbconnector://${MARIADB_USER}:${MARIADB_PASSWORD}@$host_ip:$MARIADB_PORT/${MARIADB_DATABASE} + export LOGFLAG=True + + service_name="dataprep-mariadb-vector" + + cd $WORKPATH/comps/dataprep/deployment/docker_compose/ + docker compose up ${service_name} -d + + check_healthy "dataprep-mariadb-vector" || exit 1 +} + +function validate_microservice() { + # test /v1/dataprep/ingest upload file + ingest_doc ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_docx ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - docx" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_pdf ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - pdf" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_ppt ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - ppt" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_pptx ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - 
upload - pptx" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_txt ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - txt" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + ingest_xlsx ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + # test /v1/dataprep/ingest upload link + ingest_external_link ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + # test /v1/dataprep/get + get_all ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - get" '{"name":' dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log + + # test /v1/dataprep/delete + delete_all ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - del" '{"status":true}' dataprep-mariadb-vector ${LOG_PATH}/dataprep_mariadb.log +} + +function stop_docker() { + cid=$(docker ps -aq --filter "name=dataprep-mariadb-vector") + if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi + + cid=$(docker ps -aq --filter "name=mariadb-server") + if [[ ! 
-z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi +} + +function main() { + + stop_docker + + build_docker_images + start_service + + validate_microservice + + stop_docker + echo y | docker system prune + +} + +main From b070128a294b677c3d0065daea9c6ffa7158781b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 5 May 2025 08:00:44 +0000 Subject: [PATCH 4/5] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- comps/dataprep/README.md | 2 +- comps/dataprep/src/README_mariadb.md | 4 +++- comps/dataprep/src/requirements.txt | 2 +- comps/retrievers/src/README_mariadb.md | 2 ++ comps/retrievers/src/integrations/config.py | 2 +- comps/retrievers/src/requirements.txt | 2 +- comps/third_parties/mariadb/src/README.md | 1 - 7 files changed, 9 insertions(+), 6 deletions(-) diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index cdb226ccc9..b7b6979406 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -63,4 +63,4 @@ For details, please refer to this [readme](src/README_finance.md) ## Dataprep Microservice with MariaDB Vector -For details, please refer to this [readme](src/README_mariadb.md) \ No newline at end of file +For details, please refer to this [readme](src/README_mariadb.md) diff --git a/comps/dataprep/src/README_mariadb.md b/comps/dataprep/src/README_mariadb.md index 6cb2262b4e..0931e78edb 100644 --- a/comps/dataprep/src/README_mariadb.md +++ b/comps/dataprep/src/README_mariadb.md @@ -12,9 +12,11 @@ docker build -t opea/dataprep:latest --build-arg https_proxy=$https_proxy --buil ### 1.2 Run Docker with CLI (Option A) #### 1.2.1 Start MariaDB Server + Please refer to this [readme](../../third_parties/mariadb/src/README.md). 
#### 1.2.2 Start the data preparation service + ```bash export HOST_IP=$(hostname -I | awk '{print $1}') @@ -85,7 +87,7 @@ The `file_path` is the `id` returned by the `/v1/dataprep/get` API. curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" -H "Content-Type: application/json" \ -d '{"file_path": "https://www.ces.tech/.txt"}' - + # delete file curl -X POST "http://${HOST_IP}:5000/v1/dataprep/delete" -H "Content-Type: application/json" \ diff --git a/comps/dataprep/src/requirements.txt b/comps/dataprep/src/requirements.txt index 3e0bc14091..2c8109d55b 100644 --- a/comps/dataprep/src/requirements.txt +++ b/comps/dataprep/src/requirements.txt @@ -34,8 +34,8 @@ llama-index-embeddings-text-embeddings-inference llama-index-graph-stores-neo4j llama-index-llms-openai llama-index-llms-openai-like -markdown mariadb +markdown moviepy neo4j numpy diff --git a/comps/retrievers/src/README_mariadb.md b/comps/retrievers/src/README_mariadb.md index 37561f7481..bbcbf9981e 100644 --- a/comps/retrievers/src/README_mariadb.md +++ b/comps/retrievers/src/README_mariadb.md @@ -16,10 +16,12 @@ docker build -t opea/retriever:latest --build-arg https_proxy=$https_proxy --bui ### 1.2 Run Docker with CLI (Option A) #### 1.2.1 Start MariaDB Server + Please refer to this [readme](../../third_parties/mariadb/src/README.md). You need to ingest your knowledge documents into the vector database. 
#### 1.2.2 Start the retriever service + ```bash export HOST_IP=$(hostname -I | awk '{print $1}') # If you've configured the server with the default env values then: diff --git a/comps/retrievers/src/integrations/config.py b/comps/retrievers/src/integrations/config.py index 89ed92bbcf..8514192611 100644 --- a/comps/retrievers/src/integrations/config.py +++ b/comps/retrievers/src/integrations/config.py @@ -242,4 +242,4 @@ def format_opensearch_conn_from_env(): # MariaDB Vector # ####################################################### MARIADB_CONNECTION_URL = os.getenv("MARIADB_CONNECTION_URL", "localhost") -MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") \ No newline at end of file +MARIADB_COLLECTION_NAME = os.getenv("MARIADB_COLLECTION_NAME", "rag_mariadbvector") diff --git a/comps/retrievers/src/requirements.txt b/comps/retrievers/src/requirements.txt index f7d79ef704..8e360866c2 100644 --- a/comps/retrievers/src/requirements.txt +++ b/comps/retrievers/src/requirements.txt @@ -12,8 +12,8 @@ graspologic haystack-ai==2.3.1 langchain-arangodb langchain-elasticsearch -langchain-openai langchain-mariadb +langchain-openai langchain-pinecone langchain-vdms>=0.1.4 langchain_community diff --git a/comps/third_parties/mariadb/src/README.md b/comps/third_parties/mariadb/src/README.md index 289d60e8e9..aa6ea5d90f 100644 --- a/comps/third_parties/mariadb/src/README.md +++ b/comps/third_parties/mariadb/src/README.md @@ -3,7 +3,6 @@ **MariaDB Vector** was introduced starting with server version 11.7 For more details please see the [official documentation](https://mariadb.com/kb/en/vectors/). - ## 1. 
Configure the server ```bash From 535f4cd65552211010b9b07fd54773f24a55f93b Mon Sep 17 00:00:00 2001 From: Razvan-Liviu Varzaru Date: Mon, 5 May 2025 15:02:17 +0300 Subject: [PATCH 5/5] Fix CI failures - md5 is used for the primary key not as a security hash - fixed mariadb readme headers Signed-off-by: Razvan-Liviu Varzaru --- comps/dataprep/src/integrations/mariadb.py | 4 ++-- comps/retrievers/src/README_mariadb.md | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/comps/dataprep/src/integrations/mariadb.py b/comps/dataprep/src/integrations/mariadb.py index 3f2f7b2105..34b0561159 100644 --- a/comps/dataprep/src/integrations/mariadb.py +++ b/comps/dataprep/src/integrations/mariadb.py @@ -242,7 +242,7 @@ async def _save_file_to_local_disk(self, save_path: Path, file): def _store_texts(self, doc_path: str, chunks: list[str], batch_size: int = 32): num_chunks = len(chunks) metadata = [{"doc_name": doc_path}] - doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_id = hashlib.md5(str(doc_path).encode("utf-8"), usedforsecurity=False).hexdigest() doc_emb_ids = [] for i in range(0, num_chunks, batch_size): batch_texts = chunks[i : i + batch_size] @@ -369,7 +369,7 @@ async def get_files(self): return file_content def _delete_embedding(self, doc_path: Path): - doc_id = hashlib.md5(str(doc_path).encode("utf-8")).hexdigest() + doc_id = hashlib.md5(str(doc_path).encode("utf-8"), usedforsecurity=False).hexdigest() doc_emb_ids = self.documents.get_document_emb_ids(doc_id) self.store.delete(ids=doc_emb_ids) self.documents.delete_document(doc_id) diff --git a/comps/retrievers/src/README_mariadb.md b/comps/retrievers/src/README_mariadb.md index bbcbf9981e..03ffdc2872 100644 --- a/comps/retrievers/src/README_mariadb.md +++ b/comps/retrievers/src/README_mariadb.md @@ -6,6 +6,8 @@ The service primarily utilizes similarity measures in vector space to rapidly re Overall, this microservice provides robust backend support for applications requiring 
efficient similarity searches, playing a vital role in scenarios such as recommendation systems, information retrieval, or any other context where precise measurement of document similarity is crucial. +## 🚀1. Start Microservice with Docker + ### 1.1 Build Docker Image ```bash