diff --git a/comps/dataprep/README.md b/comps/dataprep/README.md index b7b6979406..ad11756009 100644 --- a/comps/dataprep/README.md +++ b/comps/dataprep/README.md @@ -64,3 +64,20 @@ For details, please refer to this [readme](src/README_finance.md) ## Dataprep Microservice with MariaDB Vector For details, please refer to this [readme](src/README_mariadb.md) + +## Running in the air gapped environment + +The following steps are common for running the dataprep microservice in an air gapped environment (a.k.a. environment with no internet access), for all DB backends. + +1. Download the following models, e.g. `huggingface-cli download --cache-dir <model data directory> <model>` + +- microsoft/table-transformer-structure-recognition +- timm/resnet18.a1_in1k +- unstructuredio/yolo_x_layout + +2. launch the `dataprep` microservice with the following settings: + +- mount the `model data directory` as the `/data` directory within the `dataprep` container +- set environment variable `HF_HUB_OFFLINE` to 1 when launching the `dataprep` microservice + +e.g. `docker run -d -v <model data directory>:/data -e HF_HUB_OFFLINE=1 ... 
...` diff --git a/comps/dataprep/deployment/docker_compose/compose.yaml b/comps/dataprep/deployment/docker_compose/compose.yaml index 481f63778c..d6a173bd0d 100644 --- a/comps/dataprep/deployment/docker_compose/compose.yaml +++ b/comps/dataprep/deployment/docker_compose/compose.yaml @@ -75,6 +75,26 @@ services: minio: condition: service_healthy + dataprep-milvus-offline: + extends: dataprep-milvus + depends_on: + tei-embedding-serving: + condition: service_healthy + standalone: + condition: service_healthy + etcd: + condition: service_healthy + minio: + condition: service_healthy + environment: + HF_HUB_OFFLINE: 1 + # Use non-existing proxy to mimic air gapped environment + no_proxy: localhost,127.0.0.1,${offline_no_proxy} + http_proxy: http://localhost:7777 + https_proxy: http://localhost:7777 + volumes: + - "${DATA_PATH:-./data}:/data" + dataprep-multimodal-milvus: image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} container_name: dataprep-multimodal-milvus-server @@ -242,6 +262,22 @@ services: retries: 10 restart: unless-stopped + dataprep-qdrant-offline: + extends: dataprep-qdrant + depends_on: + qdrant-vector-db: + condition: service_healthy + tei-embedding-serving: + condition: service_healthy + environment: + HF_HUB_OFFLINE: 1 + # Use non-existing proxy to mimic air gapped environment + no_proxy: localhost,127.0.0.1,${offline_no_proxy} + http_proxy: http://localhost:7777 + https_proxy: http://localhost:7777 + volumes: + - "${DATA_PATH:-./data}:/data" + dataprep-redis: image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} container_name: dataprep-redis-server @@ -271,6 +307,22 @@ services: retries: 10 restart: unless-stopped + dataprep-redis-offline: + extends: dataprep-redis + depends_on: + redis-vector-db: + condition: service_healthy + tei-embedding-serving: + condition: service_healthy + environment: + HF_HUB_OFFLINE: 1 + # Use non-existing proxy to mimic air gapped environment + no_proxy: localhost,127.0.0.1,${offline_no_proxy} + http_proxy: 
http://localhost:7777 + https_proxy: http://localhost:7777 + volumes: + - "${DATA_PATH:-./data}:/data" + dataprep-multimodal-redis: image: ${REGISTRY:-opea}/dataprep:${TAG:-latest} container_name: dataprep-multimodal-redis-server diff --git a/comps/dataprep/src/Dockerfile b/comps/dataprep/src/Dockerfile index df07d6f985..d3cd2bb0e1 100644 --- a/comps/dataprep/src/Dockerfile +++ b/comps/dataprep/src/Dockerfile @@ -46,9 +46,14 @@ RUN pip install --no-cache-dir --upgrade pip setuptools && \ ENV PYTHONPATH=$PYTHONPATH:/home/user RUN mkdir -p /home/user/comps/dataprep/src/uploaded_files && chown -R user /home/user/comps/dataprep/src/uploaded_files +RUN mkdir -p /data && chown -R user /data USER user ENV NLTK_DATA=/home/user/nltk_data +# air gapped support: predownload all needed nltk data +RUN mkdir -p /home/user/nltk_data && python -m nltk.downloader -d /home/user/nltk_data punkt_tab averaged_perceptron_tagger_eng stopwords +# air gapped support: set model cache dir +ENV HF_HUB_CACHE=/data WORKDIR /home/user/comps/dataprep/src diff --git a/comps/dataprep/src/README_milvus.md b/comps/dataprep/src/README_milvus.md index da3eeb9565..3908f82b5a 100644 --- a/comps/dataprep/src/README_milvus.md +++ b/comps/dataprep/src/README_milvus.md @@ -207,3 +207,7 @@ curl -X POST \ -F "chunk_size=500" \ http://localhost:6010/v1/dataprep/ingest ``` + +## Running in the air gapped environment + +Please follow the [common guide](../README.md#running-in-the-air-gapped-environment) to run dataprep microservice in the air gapped environment. 
diff --git a/comps/dataprep/src/README_qdrant.md b/comps/dataprep/src/README_qdrant.md index 36be4b3f43..435019beaf 100644 --- a/comps/dataprep/src/README_qdrant.md +++ b/comps/dataprep/src/README_qdrant.md @@ -72,3 +72,7 @@ curl -X POST \ -F "table_strategy=hq" \ http://localhost:6007/v1/dataprep/ingest ``` + +## Running in the air gapped environment + +Please follow the [common guide](../README.md#running-in-the-air-gapped-environment) to run dataprep microservice in the air gapped environment. diff --git a/comps/dataprep/src/README_redis.md b/comps/dataprep/src/README_redis.md index 8442ca39da..389507a84d 100644 --- a/comps/dataprep/src/README_redis.md +++ b/comps/dataprep/src/README_redis.md @@ -261,3 +261,7 @@ curl -X POST \ -d '{"file_path": "all", "index_name": "test_redis_1"}' \ http://localhost:6007/v1/dataprep/delete ``` + +## Running in the air gapped environment + +Please follow the [common guide](../README.md#running-in-the-air-gapped-environment) to run dataprep microservice in the air gapped environment. diff --git a/tests/dataprep/dataprep_utils.sh b/tests/dataprep/dataprep_utils.sh index bb959a665a..78570fa10c 100644 --- a/tests/dataprep/dataprep_utils.sh +++ b/tests/dataprep/dataprep_utils.sh @@ -224,3 +224,21 @@ function check_healthy() { echo "$container_name did not become healthy in time." 
return 1 } + +DATAPREP_MODELS=(microsoft/table-transformer-structure-recognition timm/resnet18.a1_in1k unstructuredio/yolo_x_layout) + +function prepare_dataprep_models() { + local model_path=$1 + mkdir -p ${model_path} + python3 -m pip install huggingface_hub[cli] --user + # Workaround for huggingface-cli reporting error when set --cache-dir to same as default + local extra_args="" + local default_model_dir=$(readlink -m ~/.cache/huggingface/hub) + local real_model_dir=$(echo ${model_path/#\~/$HOME} | xargs readlink -m ) + if [[ "${default_model_dir}" != "${real_model_dir}" ]]; then + extra_args="--cache-dir ${model_path}" + fi + for m in ${DATAPREP_MODELS[@]}; do + PATH=~/.local/bin:$PATH huggingface-cli download ${extra_args} $m + done +} diff --git a/tests/dataprep/test_dataprep_milvus.sh b/tests/dataprep/test_dataprep_milvus.sh index 958e44ddf6..6fcad88e97 100644 --- a/tests/dataprep/test_dataprep_milvus.sh +++ b/tests/dataprep/test_dataprep_milvus.sh @@ -28,6 +28,7 @@ function build_docker_images() { } function start_service() { + local offline=${1:-false} export host_ip=${ip_address} export TEI_EMBEDDER_PORT=12005 export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" @@ -35,6 +36,12 @@ function start_service() { export TEI_EMBEDDING_ENDPOINT="http://${host_ip}:${TEI_EMBEDDER_PORT}" export LOGFLAG=true + if [[ "$offline" == "true" ]]; then + service_name="dataprep-milvus-offline tei-embedding-serving etcd minio standalone" + export offline_no_proxy="${ip_address},${host_ip}" + else + service_name="dataprep-milvus tei-embedding-serving etcd minio standalone" + fi cd $WORKPATH/comps/dataprep/deployment/docker_compose/ docker compose up ${service_name} -d > ${LOG_PATH}/start_services_with_compose.log @@ -42,6 +49,7 @@ function start_service() { } function validate_microservice() { + local offline=${1:-false} # test /v1/dataprep/delete delete_all ${ip_address} ${DATAPREP_PORT} check_result "dataprep - del" '{"status":true}' dataprep-milvus-server 
${LOG_PATH}/dataprep_milvus.log @@ -69,8 +77,10 @@ function validate_microservice() { check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log # test /v1/dataprep/ingest upload link - ingest_external_link ${ip_address} ${DATAPREP_PORT} - check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log + if [[ "$offline" != "true" ]]; then + ingest_external_link ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-milvus-server ${LOG_PATH}/dataprep_milvus.log + fi # test /v1/dataprep/get get_all ${ip_address} ${DATAPREP_PORT} @@ -95,11 +105,21 @@ function main() { stop_docker build_docker_images - start_service + trap stop_docker EXIT + echo "Test normal env ..." + start_service validate_microservice - stop_docker + + if [[ -n "${DATA_PATH}" ]]; then + echo "Test air gapped env ..." + prepare_dataprep_models ${DATA_PATH} + start_service true + validate_microservice true + stop_docker + fi + echo y | docker system prune } diff --git a/tests/dataprep/test_dataprep_qdrant.sh b/tests/dataprep/test_dataprep_qdrant.sh index 086a399570..ac2678c844 100644 --- a/tests/dataprep/test_dataprep_qdrant.sh +++ b/tests/dataprep/test_dataprep_qdrant.sh @@ -29,6 +29,7 @@ function build_docker_images() { } function start_service() { + local offline=${1:-false} export host_ip=${ip_address} export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export EMBED_MODEL=${EMBEDDING_MODEL_ID} @@ -37,7 +38,12 @@ function start_service() { export COLLECTION_NAME="rag-qdrant" export QDRANT_HOST=$ip_address export QDRANT_PORT=6360 - service_name="qdrant-vector-db tei-embedding-serving dataprep-qdrant" + if [[ "$offline" == "true" ]]; then + service_name="qdrant-vector-db tei-embedding-serving dataprep-qdrant-offline" + export offline_no_proxy="${ip_address}" + else + service_name="qdrant-vector-db tei-embedding-serving 
dataprep-qdrant" + fi cd $WORKPATH/comps/dataprep/deployment/docker_compose/ docker compose up ${service_name} -d @@ -45,6 +51,7 @@ function start_service() { } function validate_microservice() { + local offline=${1:-false} # test /v1/dataprep/ingest upload file ingest_doc ${ip_address} ${DATAPREP_PORT} check_result "dataprep - upload - doc" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log @@ -68,8 +75,10 @@ function validate_microservice() { check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log # test /v1/dataprep/ingest upload link - ingest_external_link ${ip_address} ${DATAPREP_PORT} - check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log + if [[ "$offline" != "true" ]]; then + ingest_external_link ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-qdrant-server ${LOG_PATH}/dataprep-qdrant.log + fi } @@ -78,14 +87,30 @@ function stop_docker() { if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi } +function stop_service() { + cd $WORKPATH/comps/dataprep/deployment/docker_compose/ + docker compose down || true +} + function main() { stop_docker build_docker_images - start_service + trap stop_service EXIT + echo "Test normal env ..." + start_service validate_microservice + stop_service + + if [[ -n "${DATA_PATH}" ]]; then + echo "Test air gapped env ..." 
+ prepare_dataprep_models ${DATA_PATH} + start_service true + validate_microservice true + stop_service + fi stop_docker echo y | docker system prune diff --git a/tests/dataprep/test_dataprep_redis.sh b/tests/dataprep/test_dataprep_redis.sh index 952a4ed628..230808a24e 100644 --- a/tests/dataprep/test_dataprep_redis.sh +++ b/tests/dataprep/test_dataprep_redis.sh @@ -28,6 +28,7 @@ function build_docker_images() { } function start_service() { + local offline=${1:-false} export host_ip=${ip_address} export REDIS_HOST=$ip_address @@ -38,7 +39,12 @@ function start_service() { export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" export TEI_EMBEDDING_ENDPOINT="http://${ip_address}:${TEI_EMBEDDER_PORT}" export INDEX_NAME="rag_redis" - service_name="redis-vector-db tei-embedding-serving dataprep-redis" + if [[ "$offline" == "true" ]]; then + service_name="redis-vector-db tei-embedding-serving dataprep-redis-offline" + export offline_no_proxy="${ip_address}" + else + service_name="redis-vector-db tei-embedding-serving dataprep-redis" + fi cd $WORKPATH/comps/dataprep/deployment/docker_compose/ docker compose up ${service_name} -d @@ -46,6 +52,7 @@ function start_service() { } function validate_microservice() { + local offline=${1:-false} # test /v1/dataprep/delete delete_all ${ip_address} ${DATAPREP_PORT} @@ -73,12 +80,14 @@ function validate_microservice() { ingest_xlsx ${ip_address} ${DATAPREP_PORT} "redis" check_result "dataprep - upload - xlsx" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log - # test /v1/dataprep/ingest upload link - ingest_external_link ${ip_address} ${DATAPREP_PORT} - check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + # test /v1/dataprep/ingest upload link + if [[ "$offline" != "true" ]]; then + ingest_external_link ${ip_address} ${DATAPREP_PORT} + check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server 
${LOG_PATH}/dataprep_upload_file.log - ingest_external_link_with_chunk_parameters ${ip_address} ${DATAPREP_PORT} "rag_redis_test_link_params" - check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + ingest_external_link_with_chunk_parameters ${ip_address} ${DATAPREP_PORT} "rag_redis_test_link_params" + check_result "dataprep - upload - link" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log + fi ingest_txt_with_index_name ${ip_address} ${DATAPREP_PORT} rag_redis_test check_result "dataprep - upload with index - txt" "Data preparation succeeded" dataprep-redis-server ${LOG_PATH}/dataprep_upload_file.log @@ -114,14 +123,30 @@ function stop_docker() { if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid && sleep 1s; fi } +function stop_service() { + cd $WORKPATH/comps/dataprep/deployment/docker_compose/ + docker compose down || true +} + function main() { stop_docker build_docker_images - start_service + trap stop_service EXIT + echo "Test normal env ..." + start_service validate_microservice + stop_service + + if [[ -n "${DATA_PATH}" ]]; then + echo "Test air gapped env ..." + prepare_dataprep_models ${DATA_PATH} + start_service true + validate_microservice true + stop_service + fi stop_docker echo y | docker system prune