Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions comps/dataprep/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,20 @@ For details, please refer to this [readme](src/README_finance.md)
## Dataprep Microservice with MariaDB Vector

For details, please refer to this [readme](src/README_mariadb.md)

## Running in the air gapped environment

The following steps apply to all DB backends when running the dataprep microservice in an air-gapped environment (i.e., an environment with no internet access).

1. Download the following models into a local model data directory, e.g. with `huggingface-cli download --cache-dir <model data directory> <model>`:

- microsoft/table-transformer-structure-recognition
- timm/resnet18.a1_in1k
- unstructuredio/yolo_x_layout

2. Launch the `dataprep` microservice with the following settings:

- mount the `model data directory` as the `/data` directory within the `dataprep` container
- set environment variable `HF_HUB_OFFLINE` to 1 when launching the `dataprep` microservice

e.g. `docker run -d -v <model data directory>:/data -e HF_HUB_OFFLINE=1 ... ...`
52 changes: 52 additions & 0 deletions comps/dataprep/deployment/docker_compose/compose.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,26 @@ services:
minio:
condition: service_healthy

# Air-gapped variant of the dataprep-milvus service; it reuses that service's
# definition through `extends` and overrides only the settings below.
dataprep-milvus-offline:
extends: dataprep-milvus
# Start only after every backing service reports healthy.
depends_on:
tei-embedding-serving:
condition: service_healthy
standalone:
condition: service_healthy
etcd:
condition: service_healthy
minio:
condition: service_healthy
environment:
# Offline switch described in the dataprep README's air-gapped section.
HF_HUB_OFFLINE: 1
# Point the proxies at a non-existent endpoint to mimic an air-gapped
# environment; hosts listed in no_proxy (including ${offline_no_proxy},
# which the test scripts export) stay reachable.
no_proxy: localhost,127.0.0.1,${offline_no_proxy}
http_proxy: http://localhost:7777
https_proxy: http://localhost:7777
volumes:
# Host directory with the pre-downloaded models (./data when DATA_PATH is
# unset or empty), mounted at /data, which the dataprep image uses as
# HF_HUB_CACHE.
- "${DATA_PATH:-./data}:/data"

dataprep-multimodal-milvus:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-multimodal-milvus-server
Expand Down Expand Up @@ -242,6 +262,22 @@ services:
retries: 10
restart: unless-stopped

# Air-gapped variant of the dataprep-qdrant service; it reuses that service's
# definition through `extends` and overrides only the settings below.
dataprep-qdrant-offline:
extends: dataprep-qdrant
# Start only after the vector DB and the embedding server report healthy.
depends_on:
qdrant-vector-db:
condition: service_healthy
tei-embedding-serving:
condition: service_healthy
environment:
# Offline switch described in the dataprep README's air-gapped section.
HF_HUB_OFFLINE: 1
# Point the proxies at a non-existent endpoint to mimic an air-gapped
# environment; hosts listed in no_proxy (including ${offline_no_proxy},
# which the test scripts export) stay reachable.
no_proxy: localhost,127.0.0.1,${offline_no_proxy}
http_proxy: http://localhost:7777
https_proxy: http://localhost:7777
volumes:
# Host directory with the pre-downloaded models (./data when DATA_PATH is
# unset or empty), mounted at /data, which the dataprep image uses as
# HF_HUB_CACHE.
- "${DATA_PATH:-./data}:/data"

dataprep-redis:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-redis-server
Expand Down Expand Up @@ -271,6 +307,22 @@ services:
retries: 10
restart: unless-stopped

# Air-gapped variant of the dataprep-redis service; it reuses that service's
# definition through `extends` and overrides only the settings below.
dataprep-redis-offline:
extends: dataprep-redis
# Start only after the vector DB and the embedding server report healthy.
depends_on:
redis-vector-db:
condition: service_healthy
tei-embedding-serving:
condition: service_healthy
environment:
# Offline switch described in the dataprep README's air-gapped section.
HF_HUB_OFFLINE: 1
# Point the proxies at a non-existent endpoint to mimic an air-gapped
# environment; hosts listed in no_proxy (including ${offline_no_proxy},
# which the test scripts export) stay reachable.
no_proxy: localhost,127.0.0.1,${offline_no_proxy}
http_proxy: http://localhost:7777
https_proxy: http://localhost:7777
volumes:
# Host directory with the pre-downloaded models (./data when DATA_PATH is
# unset or empty), mounted at /data, which the dataprep image uses as
# HF_HUB_CACHE.
- "${DATA_PATH:-./data}:/data"

dataprep-multimodal-redis:
image: ${REGISTRY:-opea}/dataprep:${TAG:-latest}
container_name: dataprep-multimodal-redis-server
Expand Down
5 changes: 5 additions & 0 deletions comps/dataprep/src/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,14 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \
ENV PYTHONPATH=$PYTHONPATH:/home/user

RUN mkdir -p /home/user/comps/dataprep/src/uploaded_files && chown -R user /home/user/comps/dataprep/src/uploaded_files
RUN mkdir -p /data && chown -R user /data

USER user
ENV NLTK_DATA=/home/user/nltk_data
# air gapped support: predownload all needed nltk data
RUN mkdir -p /home/user/nltk_data && python -m nltk.downloader -d /home/user/nltk_data punkt_tab averaged_perceptron_tagger_eng stopwords
# air gapped support: set model cache dir
ENV HF_HUB_CACHE=/data

WORKDIR /home/user/comps/dataprep/src

Expand Down
4 changes: 4 additions & 0 deletions comps/dataprep/src/README_milvus.md
Original file line number Diff line number Diff line change
Expand Up @@ -207,3 +207,7 @@ curl -X POST \
-F "chunk_size=500" \
http://localhost:6010/v1/dataprep/ingest
```

## Running in the air gapped environment

Please follow the [common guide](../README.md#running-in-the-air-gapped-environment) to run the dataprep microservice in an air-gapped environment.
4 changes: 4 additions & 0 deletions comps/dataprep/src/README_qdrant.md
Original file line number Diff line number Diff line change
Expand Up @@ -72,3 +72,7 @@ curl -X POST \
-F "table_strategy=hq" \
http://localhost:6007/v1/dataprep/ingest
```

## Running in the air gapped environment

Please follow the [common guide](../README.md#running-in-the-air-gapped-environment) to run the dataprep microservice in an air-gapped environment.
4 changes: 4 additions & 0 deletions comps/dataprep/src/README_redis.md
Original file line number Diff line number Diff line change
Expand Up @@ -261,3 +261,7 @@ curl -X POST \
-d '{"file_path": "all", "index_name": "test_redis_1"}' \
http://localhost:6007/v1/dataprep/delete
```

## Running in the air gapped environment

Please follow the [common guide](../README.md#running-in-the-air-gapped-environment) to run the dataprep microservice in an air-gapped environment.
18 changes: 18 additions & 0 deletions tests/dataprep/dataprep_utils.sh
Original file line number Diff line number Diff line change
Expand Up @@ -224,3 +224,21 @@ function check_healthy() {
echo "$container_name did not become healthy in time."
return 1
}

# Hugging Face models that have to be present in the local cache before the
# dataprep microservice is started for the air-gapped tests.
DATAPREP_MODELS=(
  microsoft/table-transformer-structure-recognition
  timm/resnet18.a1_in1k
  unstructuredio/yolo_x_layout
)

#######################################
# Download every model in DATAPREP_MODELS into a local model directory.
# Globals:
#   DATAPREP_MODELS (read), HOME (read), PATH (read)
# Arguments:
#   $1 - directory to download the models into; a leading "~" or "~/" is
#        expanded to $HOME. Created if it does not exist.
# Returns:
#   0 when all models were downloaded; non-zero when the argument is missing,
#   the directory cannot be created/resolved, or any download fails.
#######################################
function prepare_dataprep_models() {
  local model_path=${1:-}
  local default_model_dir real_model_dir model
  local -a download_cmd=(huggingface-cli download)

  if [[ -z "${model_path}" ]]; then
    echo "prepare_dataprep_models: model path argument is required" >&2
    return 1
  fi

  # Expand a leading literal "~" once, so that mkdir, the comparison below and
  # --cache-dir all refer to the same directory.
  case "${model_path}" in
    "~") model_path=${HOME} ;;
    "~/"*) model_path="${HOME}/${model_path#"~/"}" ;;
  esac

  mkdir -p -- "${model_path}" || return 1

  # Best effort: a huggingface-cli that is already installed is still usable
  # when pip cannot install/upgrade it, so only warn here.
  python3 -m pip install 'huggingface_hub[cli]' --user \
    || echo "prepare_dataprep_models: pip install of huggingface_hub[cli] failed" >&2

  # Workaround for huggingface-cli reporting an error when --cache-dir is set
  # to the same directory as the default cache: only pass it when they differ.
  default_model_dir=$(readlink -m -- "${HOME}/.cache/huggingface/hub") || return 1
  real_model_dir=$(readlink -m -- "${model_path}") || return 1
  if [[ "${default_model_dir}" != "${real_model_dir}" ]]; then
    download_cmd+=(--cache-dir "${model_path}")
  fi

  for model in "${DATAPREP_MODELS[@]}"; do
    # `pip install --user` puts the CLI into ~/.local/bin, which may not be on
    # PATH. Stop at the first failed download so the caller can see it.
    PATH=~/.local/bin:${PATH} "${download_cmd[@]}" "${model}" || return 1
  done
}
28 changes: 24 additions & 4 deletions tests/dataprep/test_dataprep_milvus.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,20 +28,28 @@ function build_docker_images() {
}

#######################################
# Start the Milvus dataprep stack with docker compose and wait until the
# dataprep container is healthy.
# Globals:
#   ip_address, WORKPATH, LOG_PATH (read)
#   host_ip, TEI_EMBEDDER_PORT, EMBEDDING_MODEL_ID, MILVUS_HOST,
#   TEI_EMBEDDING_ENDPOINT, LOGFLAG (exported)
#   offline_no_proxy (exported in offline mode only)
#   service_name (written)
# Arguments:
#   $1 - "true" to start the air-gapped service (dataprep-milvus-offline);
#        omitted or any other value starts the regular dataprep-milvus.
# Exits:
#   1 when the compose directory cannot be entered or the health check fails.
#######################################
function start_service() {
  local offline=${1:-false}
  local compose_dir="${WORKPATH}/comps/dataprep/deployment/docker_compose/"

  export host_ip=${ip_address}
  export TEI_EMBEDDER_PORT=12005
  export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
  export MILVUS_HOST=${ip_address}
  export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}"
  export LOGFLAG=true

  # NOTE(review): service_name is deliberately left global and kept as a
  # space-separated string; other functions in this script may read it -
  # confirm before making it local or turning it into an array.
  if [[ "${offline}" == "true" ]]; then
    service_name="dataprep-milvus-offline tei-embedding-serving etcd minio standalone"
    # host_ip was just set to ip_address, so a single entry is enough.
    export offline_no_proxy="${ip_address}"
  else
    service_name="dataprep-milvus tei-embedding-serving etcd minio standalone"
  fi

  # Never run compose from the wrong directory.
  cd "${compose_dir}" || exit 1
  # Word splitting is intended: each service name becomes its own argument.
  # shellcheck disable=SC2086
  docker compose up ${service_name} -d > "${LOG_PATH}/start_services_with_compose.log"

  check_healthy "dataprep-milvus-server" || exit 1
}

function validate_microservice() {
local offline=${1:-false}
# test /v1/dataprep/delete
delete_all ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - del" '{"status":true}' dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
Expand Down Expand Up @@ -69,8 +77,10 @@ function validate_microservice() {
check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log

# test /v1/dataprep/ingest upload link
ingest_external_link ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
if [[ "$offline" != "true" ]]; then
ingest_external_link ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log
fi

# test /v1/dataprep/get
get_all ${ip_address} ${DATAPREP_PORT}
Expand All @@ -95,11 +105,21 @@ function main() {
stop_docker

build_docker_images
start_service
trap stop_docker EXIT

echo "Test normal env ..."
start_service
validate_microservice

stop_docker

if [[ -n "${DATA_PATH}" ]]; then
echo "Test air gapped env ..."
prepare_dataprep_models ${DATA_PATH}
start_service true
validate_microservice true
stop_docker
fi

echo y | docker system prune

}
Expand Down
33 changes: 29 additions & 4 deletions tests/dataprep/test_dataprep_qdrant.sh
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ function build_docker_images() {
}

function start_service() {
local offline=${1:-false}
export host_ip=${ip_address}
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export EMBED_MODEL=${EMBEDDING_MODEL_ID}
Expand All @@ -37,14 +38,20 @@ function start_service() {
export COLLECTION_NAME="rag-qdrant"
export QDRANT_HOST=$ip_address
export QDRANT_PORT=6360
service_name="qdrant-vector-db tei-embedding-serving dataprep-qdrant"
if [[ "$offline" == "true" ]]; then
service_name="qdrant-vector-db tei-embedding-serving dataprep-qdrant-offline"
export offline_no_proxy="${ip_address}"
else
service_name="qdrant-vector-db tei-embedding-serving dataprep-qdrant"
fi
cd $WORKPATH/comps/dataprep/deployment/docker_compose/
docker compose up ${service_name} -d

check_healthy "dataprep-qdrant-server" || exit 1
}

function validate_microservice() {
local offline=${1:-false}
# test /v1/dataprep/ingest upload file
ingest_doc ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log
Expand All @@ -68,8 +75,10 @@ function validate_microservice() {
check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log

# test /v1/dataprep/ingest upload link
ingest_external_link ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log
if [[ "$offline" != "true" ]]; then
ingest_external_link ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log
fi

}

Expand All @@ -78,14 +87,30 @@ function stop_docker() {
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

#######################################
# Best-effort teardown of the compose project used by this test; also
# installed as the EXIT trap, so it always returns 0.
# Globals:
#   WORKPATH (read)
# Side effects:
#   Changes the working directory to the compose directory.
#######################################
function stop_service() {
  local compose_dir="${WORKPATH}/comps/dataprep/deployment/docker_compose/"

  # Without the compose directory `docker compose down` would act on whatever
  # the current directory happens to be, so skip it instead.
  if ! cd "${compose_dir}"; then
    echo "stop_service: cannot enter ${compose_dir}; skipping 'docker compose down'" >&2
    return 0
  fi
  # Cleanup must not fail the test run.
  docker compose down || true
}

function main() {

stop_docker

build_docker_images
start_service
trap stop_service EXIT

echo "Test normal env ..."
start_service
validate_microservice
stop_service

if [[ -n "${DATA_PATH}" ]]; then
echo "Test air gapped env ..."
prepare_dataprep_models ${DATA_PATH}
start_service true
validate_microservice true
stop_service
fi

stop_docker
echo y | docker system prune
Expand Down
39 changes: 32 additions & 7 deletions tests/dataprep/test_dataprep_redis.sh
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ function build_docker_images() {
}

function start_service() {
local offline=${1:-false}

export host_ip=${ip_address}
export REDIS_HOST=$ip_address
Expand All @@ -38,14 +39,20 @@ function start_service() {
export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5"
export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${TEI_EMBEDDER_PORT}"
export INDEX_NAME="rag_redis"
service_name="redis-vector-db tei-embedding-serving dataprep-redis"
if [[ "$offline" == "true" ]]; then
service_name="redis-vector-db tei-embedding-serving dataprep-redis-offline"
export offline_no_proxy="${ip_address}"
else
service_name="redis-vector-db tei-embedding-serving dataprep-redis"
fi
cd $WORKPATH/comps/dataprep/deployment/docker_compose/
docker compose up ${service_name} -d

check_healthy "dataprep-redis-server" || exit 1
}

function validate_microservice() {
local offline=${1:-false}

# test /v1/dataprep/delete
delete_all ${ip_address} ${DATAPREP_PORT}
Expand Down Expand Up @@ -73,12 +80,14 @@ function validate_microservice() {
ingest_xlsx ${ip_address} ${DATAPREP_PORT} "redis"
check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log

# test /v1/dataprep/ingest upload link
ingest_external_link ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
# test /v1/dataprep/ingest upload link
if [[ "$offline" != "true" ]]; then
ingest_external_link ${ip_address} ${DATAPREP_PORT}
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log

ingest_external_link_with_chunk_parameters ${ip_address} ${DATAPREP_PORT} "rag_redis_test_link_params"
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
ingest_external_link_with_chunk_parameters ${ip_address} ${DATAPREP_PORT} "rag_redis_test_link_params"
check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
fi

ingest_txt_with_index_name ${ip_address} ${DATAPREP_PORT} rag_redis_test
check_result "dataprep - upload with index - txt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log
Expand Down Expand Up @@ -114,14 +123,30 @@ function stop_docker() {
if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi
}

#######################################
# Best-effort teardown of the compose project used by this test; also
# installed as the EXIT trap, so it always returns 0.
# Globals:
#   WORKPATH (read)
# Side effects:
#   Changes the working directory to the compose directory.
#######################################
function stop_service() {
  local compose_dir="${WORKPATH}/comps/dataprep/deployment/docker_compose/"

  # Without the compose directory `docker compose down` would act on whatever
  # the current directory happens to be, so skip it instead.
  if ! cd "${compose_dir}"; then
    echo "stop_service: cannot enter ${compose_dir}; skipping 'docker compose down'" >&2
    return 0
  fi
  # Cleanup must not fail the test run.
  docker compose down || true
}

function main() {

stop_docker

build_docker_images
start_service
trap stop_service EXIT

echo "Test normal env ..."
start_service
validate_microservice
stop_service

if [[ -n "${DATA_PATH}" ]]; then
echo "Test air gapped env ..."
prepare_dataprep_models ${DATA_PATH}
start_service true
validate_microservice true
stop_service
fi

stop_docker
echo y | docker system prune
Expand Down