34 commits
9950f6f
Initial version of MinIO event based dataprep for Milvus
dilverse Oct 28, 2024
7d76da5
Merge branch 'opea-project:main' into main
dilverse Oct 28, 2024
a299782
Initial version of MinIO event based dataprep for Milvus
dilverse Nov 1, 2024
2d1a561
Update to latest MinIO Image
dilverse Nov 2, 2024
92e99c6
Add LanceDB and MinIO event based dataprep support
dilverse Nov 2, 2024
d6a5a7e
Add MinIO utils file that parses event notifications from MinIO
dilverse Nov 2, 2024
e497710
Merge branch 'opea-project:main' into main
dilverse Nov 2, 2024
2b13f5d
Update README.md files
dilverse Nov 3, 2024
78064a8
Add MinIO LanceDB retriever support
dilverse Nov 3, 2024
eb8897d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 3, 2024
616dc63
Fix README.md paths
dilverse Nov 3, 2024
c05bdcb
Fix README.md for Milvus to right path
dilverse Nov 3, 2024
a65d3dc
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 3, 2024
80fe8f6
Fix connection arguments for Milvus
dilverse Nov 3, 2024
33d38f4
Remove local file volumes from docker compose
dilverse Nov 3, 2024
e83f649
Merge branch 'main' into main
mkbhanda Nov 5, 2024
c4dfea3
Update comps/dataprep/minio/lancedb/langchain/README.md
dilverse Nov 6, 2024
263aac3
Update comps/dataprep/minio/lancedb/langchain/README.md
dilverse Nov 6, 2024
5238a85
Update comps/dataprep/minio/lancedb/langchain/README.md
dilverse Nov 6, 2024
d6142a8
Update comps/dataprep/minio/lancedb/langchain/README.md
dilverse Nov 6, 2024
b66221e
Update comps/dataprep/minio/milvus/langchain/README.md
dilverse Nov 6, 2024
491cdcd
Update comps/dataprep/minio/milvus/langchain/README.md
dilverse Nov 6, 2024
94581a7
Update comps/dataprep/minio/milvus/langchain/README.md
dilverse Nov 6, 2024
535d09a
Update comps/dataprep/minio/milvus/langchain/README.md
dilverse Nov 6, 2024
e726c57
Update comps/dataprep/minio/milvus/langchain/README.md
dilverse Nov 6, 2024
fefd9ec
Update comps/dataprep/minio/milvus/langchain/README.md
dilverse Nov 6, 2024
1efea4d
Remove unused code and add appropriate copyrights and minor lint fixes
dilverse Nov 6, 2024
2952611
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Nov 6, 2024
30976f1
Merge branch 'main' into main
dilverse Nov 6, 2024
7c41fc5
Merge branch 'main' into main
lvliang-intel Nov 12, 2024
ce2c768
Merge branch 'main' into main
dilverse Dec 27, 2024
ac5413f
Added new docker files to github workflow
dilverse Dec 27, 2024
fba854a
Add new lines to the workflows
dilverse Dec 27, 2024
d526d28
Remove trailing spaces
dilverse Dec 27, 2024
8 changes: 4 additions & 4 deletions comps/dataprep/milvus/langchain/prepare_doc_milvus.py

@@ -93,7 +93,7 @@ def ingest_chunks_to_milvus(file_name: str, chunks: List):
             batch_docs,
             embeddings,
             collection_name=COLLECTION_NAME,
-            connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+            connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"},
             partition_key_field=partition_field_name,
         )
     except Exception as e:
@@ -211,7 +211,7 @@ async def ingest_documents(
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"},
         index_params=index_params,
         auto_id=True,
     )
@@ -347,7 +347,7 @@ async def rag_get_file_structure():
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"},
         index_params=index_params,
         auto_id=True,
     )
@@ -405,7 +405,7 @@ async def delete_single_file(file_path: str = Body(..., embed=True)):
     my_milvus = Milvus(
         embedding_function=embeddings,
         collection_name=COLLECTION_NAME,
-        connection_args={"host": MILVUS_HOST, "port": MILVUS_PORT},
+        connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"},
         index_params=index_params,
         auto_id=True,
     )
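
For reference, the uri-style `connection_args` introduced above is what newer pymilvus clients expect in place of separate host and port keys. A minimal sketch of the pattern (assuming `langchain-community` and `pymilvus` are installed; the host, port, embedding model, and collection name below are illustrative placeholders):

```python
# Sketch of the uri-style Milvus connection this diff switches to.
# Assumes langchain-community and pymilvus are installed; all values are
# placeholders, and MILVUS_HOST is expected to carry the scheme (e.g. "http://...").
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

MILVUS_HOST = "http://localhost"
MILVUS_PORT = 19530

embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-base-en-v1.5")
store = Milvus(
    embedding_function=embeddings,
    collection_name="rag_milvus",
    connection_args={"uri": f"{MILVUS_HOST}:{MILVUS_PORT}"},  # single uri instead of host/port
    auto_id=True,
)
```
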
2 changes: 2 additions & 0 deletions comps/dataprep/minio/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
38 changes: 38 additions & 0 deletions comps/dataprep/minio/lancedb/langchain/Dockerfile
@@ -0,0 +1,38 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

FROM python:3.11-slim

ENV LANG=C.UTF-8

ARG ARCH="cpu"

RUN apt-get update -y && apt-get install -y --no-install-recommends --fix-missing \
    build-essential \
    default-jre \
    libgl1-mesa-glx \
    libjemalloc-dev \
    tesseract-ocr

RUN useradd -m -s /bin/bash user && \
    mkdir -p /home/user && \
    chown -R user /home/user/

USER user

COPY comps /home/user/comps

RUN pip install --no-cache-dir --upgrade pip setuptools && \
    if [ ${ARCH} = "cpu" ]; then pip install --no-cache-dir torch torchvision --index-url https://download.pytorch.org/whl/cpu; fi && \
    pip install --no-cache-dir -r /home/user/comps/dataprep/minio/lancedb/langchain/requirements.txt

ENV PYTHONPATH=$PYTHONPATH:/home/user

USER root

RUN mkdir -p /home/user/comps/dataprep/minio/lancedb/langchain/uploaded_files && chown -R user /home/user/comps/dataprep/minio/lancedb/langchain/uploaded_files

USER user
WORKDIR /home/user/comps/dataprep/minio/lancedb/langchain

ENTRYPOINT ["python", "prepare_doc_lancedb.py"]
244 changes: 244 additions & 0 deletions comps/dataprep/minio/lancedb/langchain/README.md
@@ -0,0 +1,244 @@
# Dataprep Microservice with MinIO and LanceDB

## 🚀1. Start Microservice with Python (Option 1)

### 1.1 Requirements

```bash
pip install -r requirements.txt
apt-get install tesseract-ocr -y
apt-get install libtesseract-dev -y
apt-get install poppler-utils -y
```

### 1.2 Setup Environment Variables

```bash
export no_proxy=${your_no_proxy}
export http_proxy=${your_http_proxy}
export https_proxy=${your_https_proxy}
export MINIO_ACCESS_KEY=${your_minio_access_key}
export MINIO_SECRET_KEY=${your_minio_secret_key}
export MINIO_ENDPOINT=${your_minio_endpoint}
export MINIO_SECURE=${your_minio_secure}
export COLLECTION_NAME=${your_collection_name}
export MOSEC_EMBEDDING_ENDPOINT=${your_embedding_endpoint}
```

### 1.3 Start Mosec Embedding Service

First, build the Mosec embedding serving Docker image.

```bash
cd ../../../..
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile .
```

Then start the mosec embedding server.

```bash
your_port=6010
docker run -d --name="embedding-mosec-endpoint" -p $your_port:8000 opea/embedding-mosec-endpoint:latest
```

Set up the environment variable:

```bash
export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port"
```

### 1.4 Start Document Preparation Microservice for LanceDB with Python Script

Start the document preparation microservice for LanceDB with the command below.

```bash
python prepare_doc_lancedb.py
```

## 🚀2. Start Microservice with Docker (Option 2)

### 2.1 Build Docker Image

```bash
cd ../../../..
# build mosec embedding docker image
docker build --build-arg http_proxy=$http_proxy --build-arg https_proxy=$https_proxy -t opea/embedding-langchain-mosec-endpoint:latest -f comps/embeddings/mosec/langchain/dependency/Dockerfile .
# build dataprep lancedb docker image
docker build -t opea/dataprep-minio-lancedb:latest --build-arg https_proxy=$https_proxy --build-arg http_proxy=$http_proxy --build-arg no_proxy=$no_proxy -f comps/dataprep/minio/lancedb/langchain/Dockerfile .
```

### 2.2 Setup Environment Variables

```bash
export MOSEC_EMBEDDING_ENDPOINT="http://localhost:$your_port"
export MINIO_ACCESS_KEY=${your_minio_access_key}
export MINIO_SECRET_KEY=${your_minio_secret_key}
export MINIO_ENDPOINT=${your_minio_endpoint}
export MINIO_SECURE=${your_minio_secure}
```

### 2.3 Run Docker with CLI

```bash
docker run -d --name="dataprep-lancedb-server" -p 6010:6010 --ipc=host \
-e http_proxy=$http_proxy -e https_proxy=$https_proxy \
-e no_proxy=$no_proxy \
-e MOSEC_EMBEDDING_ENDPOINT=${MOSEC_EMBEDDING_ENDPOINT} \
-e MINIO_ACCESS_KEY=${MINIO_ACCESS_KEY} \
-e MINIO_SECRET_KEY=${MINIO_SECRET_KEY} \
-e MINIO_ENDPOINT=${MINIO_ENDPOINT} \
-e MINIO_SECURE=${MINIO_SECURE} \
opea/dataprep-minio-lancedb:latest
```

## 🚀3. Consume Microservice

### 3.1 Consume Upload API

Once the document preparation microservice for LanceDB is started, you can use the commands below to invoke the microservice, which converts documents into embeddings and saves them to the database.

Make sure the file path after `files=@` is correct.

- Single file upload

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file.pdf" \
http://localhost:6010/v1/dataprep
```

You can specify `chunk_size` and `chunk_overlap` with the following command. To avoid large chunks, pass a small `chunk_size`, such as 500 as shown below (the default is 1500).

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file.pdf" \
-F "chunk_size=500" \
-F "chunk_overlap=100" \
http://localhost:6010/v1/dataprep
```

- Multiple file upload

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file1.pdf" \
-F "files=@./file2.pdf" \
-F "files=@./file3.pdf" \
http://localhost:6010/v1/dataprep
```
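
or, from Python, a minimal sketch of the same multipart upload (the file names and port are placeholders):

```python
import requests

# Sketch: upload one or more local files to the dataprep endpoint.
# File names and the port are placeholders; adjust them to your deployment.
url = "http://localhost:6010/v1/dataprep"
files = [
    ("files", open("./file1.pdf", "rb")),
    ("files", open("./file2.pdf", "rb")),
]
data = {"chunk_size": 500, "chunk_overlap": 100}  # optional overrides

resp = requests.post(url, files=files, data=data)
resp.raise_for_status()
print(resp.text)
```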

- Link upload (not currently supported for llama_index)

```bash
curl -X POST \
-F 'link_list=["https://www.ces.tech/"]' \
http://localhost:6010/v1/dataprep
```

or

```python
import requests
import json

proxies = {"http": ""}
url = "http://localhost:6010/v1/dataprep"
urls = [
"https://towardsdatascience.com/no-gpu-no-party-fine-tune-bert-for-sentiment-analysis-with-vertex-ai-custom-jobs-d8fc410e908b?source=rss----7f60cf5620c9---4"
]
payload = {"link_list": json.dumps(urls)}

try:
resp = requests.post(url=url, data=payload, proxies=proxies)
print(resp.text)
resp.raise_for_status() # Raise an exception for unsuccessful HTTP status codes
print("Request successful!")
except requests.exceptions.RequestException as e:
print("An error occurred:", e)
```

We support table extraction from PDF documents. You can specify `process_table` and `table_strategy` with the following command. `table_strategy` selects the strategy used to understand tables for table retrieval: as the setting progresses from `fast` to `hq` to `llm`, the focus shifts toward deeper table understanding at the expense of processing speed. The default strategy is `fast`.

Note: If you specify `table_strategy=llm`, you should first start the TGI service (see sections 1.2.1 and 1.3.1 in https://github.com/opea-project/GenAIComps/tree/main/comps/llms/README.md) and then `export TGI_LLM_ENDPOINT="http://${your_ip}:8008"`.

```bash
curl -X POST -H "Content-Type: application/json" -d '{"path":"/home/user/doc/your_document_name","process_table":true,"table_strategy":"hq"}' http://localhost:6010/v1/dataprep
```

### 3.2 Consume get_file API

To get uploaded file structures, use the following command:

```bash
curl -X POST \
-H "Content-Type: application/json" \
http://localhost:6010/v1/dataprep/get_file
```

Then you will get the response JSON like this:

```json
[
{
"name": "uploaded_file_1.txt",
"id": "uploaded_file_1.txt",
"type": "File",
"parent": ""
},
{
"name": "uploaded_file_2.txt",
"id": "uploaded_file_2.txt",
"type": "File",
"parent": ""
}
]
```

### 3.3 Consume delete_file API

To delete an uploaded file or link, use the following commands.

The `file_path` here should be the `id` obtained from the `/v1/dataprep/get_file` API.

```bash
# delete link
curl -X POST \
-H "Content-Type: application/json" \
-d '{"file_path": "https://www.ces.tech/.txt"}' \
http://localhost:6010/v1/dataprep/delete_file

# delete file
curl -X POST \
-H "Content-Type: application/json" \
-d '{"file_path": "uploaded_file_1.txt"}' \
http://localhost:6010/v1/dataprep/delete_file

# delete all files and links, will drop the entire db collection
curl -X POST \
-H "Content-Type: application/json" \
-d '{"file_path": "all"}' \
http://localhost:6010/v1/dataprep/delete_file
```
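
The same flow is available from Python; here is a minimal sketch that lists the uploaded files and then deletes each one by its `id` (the endpoint and port are placeholders):

```python
import requests

# Sketch: list uploaded files/links, then delete each one by its id.
# The endpoint and port are placeholders; adjust them to your deployment.
base = "http://localhost:6010/v1/dataprep"

files = requests.post(f"{base}/get_file").json()
for f in files:
    resp = requests.post(f"{base}/delete_file", json={"file_path": f["id"]})
    resp.raise_for_status()

# Or drop all files, links, and the whole collection in one call:
# requests.post(f"{base}/delete_file", json={"file_path": "all"})
```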

## 🚀4. Troubleshooting

1. If you get errors from the Mosec embedding endpoint such as `cannot find this task, maybe it has expired` while uploading files, try reducing the `chunk_size` in the curl command as shown below (the default `chunk_size` is 1500).

```bash
curl -X POST \
-H "Content-Type: multipart/form-data" \
-F "files=@./file.pdf" \
-F "chunk_size=500" \
http://localhost:6010/v1/dataprep
```
2 changes: 2 additions & 0 deletions comps/dataprep/minio/lancedb/langchain/__init__.py
@@ -0,0 +1,2 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0
26 changes: 26 additions & 0 deletions comps/dataprep/minio/lancedb/langchain/config.py
@@ -0,0 +1,26 @@
# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

import os

# Local Embedding model
LOCAL_EMBEDDING_MODEL = os.getenv("LOCAL_EMBEDDING_MODEL", "maidalun1020/bce-embedding-base_v1")
# TEI Embedding endpoints
TEI_EMBEDDING_ENDPOINT = os.getenv("TEI_EMBEDDING_ENDPOINT", "")
# LanceDB configuration
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "rag_milvus")
# MOSEC configuration
MOSEC_EMBEDDING_MODEL = os.environ.get("MOSEC_EMBEDDING_MODEL", "/home/user/bge-large-zh-v1.5")
MOSEC_EMBEDDING_ENDPOINT = os.environ.get("MOSEC_EMBEDDING_ENDPOINT", "")
MINIO_ENDPOINT = os.environ.get("MINIO_ENDPOINT", "minio:9000")
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin")
MINIO_SECURE = os.environ.get("MINIO_SECURE", "False").lower() == "true"
MINIO_DOCUMENT_BUCKET = os.environ.get("MINIO_DOCUMENT_BUCKET", "document")
MINIO_WAREHOUSE_BUCKET = os.environ.get("MINIO_WAREHOUSE_BUCKET", "warehouse")
os.environ["OPENAI_API_BASE"] = MOSEC_EMBEDDING_ENDPOINT
os.environ["OPENAI_API_KEY"] = "Dummy key"
os.environ["AWS_ENDPOINT"] = f"http://{MINIO_ENDPOINT}"
os.environ["AWS_ACCESS_KEY_ID"] = MINIO_ACCESS_KEY
os.environ["AWS_SECRET_ACCESS_KEY"] = MINIO_SECRET_KEY
os.environ["ALLOW_HTTP"] = str(MINIO_SECURE != "true").lower()