34 changes: 21 additions & 13 deletions helm-charts/common/data-prep/README.md
@@ -42,33 +42,40 @@ helm install data-prep . --set TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT}
# helm install data-prep . --set TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} --set global.HUGGINGFACEHUB_API_TOKEN=${HF_TOKEN} --set DATAPREP_BACKEND=${DATAPREP_BACKEND} --set QDRANT_HOST=${DB_HOST},QDRANT_PORT=6333,COLLECTION_NAME=rag_qdrant
```

### Install the microservice in air gapped(offline) mode
### Install the microservice in air gapped (offline) mode

To support running this microservice in an air gapped environment, users are required to download the offline data including the `nltk` data and model `unstructuredio/yolo_x_layout` to a shared storage. Below is an example for using node level local directory to download the offline data:
To support running this microservice in an air gapped environment, users must pre-download the following models to shared storage:

Assuming the `nltk` data is shared using node-local directory `/mnt/nltk_data`, and the model data is shared using node-local directory `/mnt/opea-models`.
- `microsoft/table-transformer-structure-recognition`
- `timm/resnet18.a1_in1k`
- `unstructuredio/yolo_x_layout`

Below is an example of using a node-level local directory for the offline data:

This example assumes the model data is shared via the node-local directory `/mnt/opea-models`.

```
# On every K8s node, run the following command:
export MODELDIR=/mnt/opea-models
export NLTKDATA=/mnt/nltk_data
# Download nltk data, assumes Python nltk module(s) are already installed
python -m nltk.downloader -d $NLTKDATA all && chmod -R a+r $NLTKDATA
# Download model, assumes Python huggingface_hub[cli] module are already installed
huggingface-cli download unstructuredio/yolo_x_layout --local-dir ${MODELDIR}/unstructuredio/yolo_x_layout && chmod -R a+r ${MODELDIR}/unstructuredio/yolo_x_layout
# Download the models, assuming the Python huggingface_hub[cli] module is already installed
DATAPREP_MODELS=(microsoft/table-transformer-structure-recognition timm/resnet18.a1_in1k unstructuredio/yolo_x_layout)
for model in "${DATAPREP_MODELS[@]}"; do
  huggingface-cli download --cache-dir "${MODELDIR}" "$model"
done
chmod -R a+r "${MODELDIR}"

# On K8s master node, run the following command:
# Install using Helm with the following additional parameters:
# helm install ... ... --set global.offline=true,global.modelUseHostPath=${MODELDIR},global.nltkDataUseHostPath=${NLTKDATA}
helm install ... ... --set global.offline=true,global.modelUseHostPath=${MODELDIR}
```
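
For illustration, here is a minimal sketch of a complete offline install against the default `REDIS` backend, following the same `--set` pattern as the examples above; `${TEI_EMBEDDING_ENDPOINT}` and `${DB_HOST}` are placeholders for services reachable from your cluster.

```
# Sketch: offline install with the default REDIS backend; endpoint values are placeholders
export MODELDIR=/mnt/opea-models
helm install data-prep . \
  --set TEI_EMBEDDING_ENDPOINT=${TEI_EMBEDDING_ENDPOINT} \
  --set DATAPREP_BACKEND=REDIS \
  --set REDIS_HOST=${DB_HOST} \
  --set global.offline=true,global.modelUseHostPath=${MODELDIR}
```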

Assuming we share the offline data on cluster level using a persistent volume(PV), first we need to create the persistent volume claim(PVC) with name `opea-model-pvc` to store model data, the PVC with name `opea-nltk-pvc` to store nltk data:
Assuming we share the offline data at the cluster level using a persistent volume (PV), first create the persistent volume claim (PVC) named `opea-model-pvc` to store the model data.

```
# Download nltk data and model into the root and `unstructuredio/yolo_x_layout` directory at the root of the corresponding PVs respectively
# Download the model data to the root directory of the corresponding PV
# ... ...
# Install using Helm with the following additional parameters:
# export MODELPVC=opea-model-pvc
# export NLTKPVC=opea-nltk-pvc
# helm install ... ... --set global.offline=true,global.modelUsePVC=${MOELPVC},global.nltkDataUsePVC=${NLTKPVC}
# helm install ... ... --set global.offline=true,global.modelUsePVC=${MODELPVC}
```
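
A minimal sketch of creating the `opea-model-pvc` claim and staging the models is shown below; the storage size, `ReadWriteMany` access mode, and reliance on a default StorageClass are assumptions to adjust for your cluster.

```
# Sketch: create the model PVC (size, access mode and StorageClass are assumptions)
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: opea-model-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 20Gi
EOF
# Mount the PVC in a temporary pod (or on a node that can reach the backing storage)
# and run the same huggingface-cli download loop as above, pointing --cache-dir at the
# PVC's mount path, then chmod -R a+r the downloaded files.
```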

## Verify
@@ -92,6 +99,7 @@ curl http://localhost:6007/v1/dataprep/ingest \
| ------------------------------- | ------ | --------- | ------------------------------------------------------------------------------------------------------- |
| service.port | string | `"6007"` | |
| global.HUGGINGFACEHUB_API_TOKEN | string | `""` | Your own Hugging Face API token |
| global.offline | bool | `false` | Whether to run the microservice in an air gapped (offline) environment |
| DATAPREP_BACKEND | string | `"REDIS"` | vector DB backend to use, one of "REDIS", "MILVUS", "QDRANT" |
| REDIS_HOST | string | `""` | Redis service URL host, only valid for Redis, please see `values.yaml` for other Redis configuration |
| MILVUS_HOST | string | `""` | Milvus service URL host, only valid for Milvus, please see `values.yaml` for other Milvus configuration |
5 changes: 4 additions & 1 deletion helm-charts/common/data-prep/templates/configmap.yaml
@@ -85,11 +85,14 @@ data:
http_proxy: {{ .Values.global.http_proxy | quote }}
https_proxy: {{ .Values.global.https_proxy | quote }}
{{- if and (and (not .Values.MILVUS_HOST ) (not .Values.REDIS_HOST)) (and (not .Values.TEI_EMBEDDING_ENDPOINT) (or .Values.global.http_proxy .Values.global.https_proxy)) }}
no_proxy: "{{ .Release.Name }}-tei,{{ .Release.Name }}-redis-vector-db,{{ .Release.Name }}-milvus,{{ .Values.global.no_proxy }}"
no_proxy: "{{ .Release.Name }}-tei,{{ .Release.Name }}-redis-vector-db,{{ .Release.Name }}-milvus,{{ .Release.Name }}-qdrant,{{ .Values.global.no_proxy }}"
{{- else }}
no_proxy: {{ .Values.global.no_proxy | quote }}
{{- end }}
LOGFLAG: {{ .Values.LOGFLAG | quote }}
NUMBA_CACHE_DIR: "/tmp/numba/cache"
XDG_CACHE_HOME: "/tmp/fontconfig/cache"
MPLCONFIGDIR: "/tmp/matplotlib"
{{- if .Values.global.offline }}
HF_HUB_OFFLINE: "1"
{{- end }}
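
As a quick sanity check that the offline toggle reaches the rendered ConfigMap, something like the following can be run from the chart directory; the release name and host path are placeholders, and the exact output depends on the chart's other defaults.

```
# Sketch: render only the ConfigMap with offline mode enabled and look for HF_HUB_OFFLINE
helm template data-prep . \
  --set global.offline=true,global.modelUseHostPath=/mnt/opea-models \
  -s templates/configmap.yaml | grep HF_HUB_OFFLINE
```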
20 changes: 1 addition & 19 deletions helm-charts/common/data-prep/templates/deployment.yaml
@@ -87,10 +87,7 @@ spec:
name: tmp
- mountPath: /home/user/comps/dataprep/src/uploaded_files
name: uploaded-files
- mountPath: /home/user/nltk_data
name: nltk-data
- mountPath: /home/user/comps/dataprep/src/unstructuredio/yolo_x_layout
subPath: unstructuredio/yolo_x_layout
- mountPath: /data
name: model-volume
- mountPath: /home/user/.config
name: user-config-data
@@ -130,21 +127,6 @@
{{- else }}
emptyDir: {}
{{- end }}
- name: nltk-data
{{- if .Values.global.offline }}
{{- if .Values.global.nltkDataUsePVC }}
persistentVolumeClaim:
claimName: {{ .Values.global.nltkDataUsePVC }}
{{- else if .Values.global.nltkDataUseHostPath }}
hostPath:
path: {{ .Values.global.nltkDataUseHostPath }}
type: Directory
{{- else }}
{{- fail "'global.nltkDataUsePVC' or 'global.nltkDataUseHostPath' must be set in offline mode" }}
{{- end }}
{{- else }}
emptyDir: {}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
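Once installed offline, one way to confirm the pre-downloaded cache is visible at the new `/data` mount is to list it from the running workload; the `deploy/data-prep` name is an assumption based on the release name used above.

```
# Sketch: list the model cache inside the pod; huggingface-cli download --cache-dir
# produces models--<org>--<name> directories, which should appear here.
kubectl exec deploy/data-prep -- ls /data
```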
13 changes: 3 additions & 10 deletions helm-charts/common/data-prep/values.yaml
@@ -142,21 +142,14 @@ global:
sharedSAName: ""

# Running service in air gapped(offline) mode
# If offline is enabled, user must set either modelUseHostPath or modelUsePVC and download model unstructuredio/yolo_x_layout out of band.
# If offline is enabled, user must set either nltkDataUseHostPath or nltkDataUsePVC and download nltk data out of band.
# If offline is enabled, user must set either modelUseHostPath or modelUsePVC and download the models `microsoft/table-transformer-structure-recognition`, `timm/resnet18.a1_in1k`, and `unstructuredio/yolo_x_layout` out of band.
offline: false
# To store offline model data in local directory for one node K8s environment, set modelUseHostPath
# Download offline model: huggingface-cli download unstructuredio/yolo_x_layout --local-dir <modelUseHostPath>/unstructuredio/yolo_x_layout && chmod a+r <modelUseHostPath>/unstructuredio/yolo_x_layout
# Download offline models: huggingface-cli download --cache-dir <modelUseHostPath> <model>
modelUseHostPath: ""
# To store offline data in persistent volume(PV) to be shared by multinode K8s environment, set modelUsePVC
# To store offline model data in a persistent volume (PV) shared by a multinode K8s environment, set modelUsePVC
# then follow the similar above steps to download the offline model to the root directory of that PV
modelUsePVC: ""
# To store offline nltk data in local directory for one node K8s environment, set nltkDataUseHostPath
# Download nltk data: python -m nltk.downloader -d <nltkDataUseHostPath> all && chmod a+r <nltkDataUseHostPath>
nltkDataUseHostPath: ""
# To store offline nltk data in persistent volume(PV) to be shared by multinode K8s environment, set nltkDataUsePVC
# and follow the similar above steps to download the nltk data to the root directory of that PV
nltkDataUsePVC: ""

# Install Prometheus serviceMonitors for service components
monitoring: false