37 changes: 37 additions & 0 deletions helm-charts/common/llm-uservice/README.md
@@ -60,6 +60,42 @@ helm install llm-uservice . --set TEXTGEN_BACKEND="BEDROCK" --set LLM_MODEL_ID=$
# helm install llm-uservice . --set image.repository="opea/llm-faqgen" --set FAQGEN_BACKEND="vLLM" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
```

### Install the microservice in air gapped (offline) mode

To run the `llm-docsum` microservice in an air-gapped (offline) environment, users must pre-download the following models to shared storage:

- gpt2
- the same model as the inference backend engine

Below is an example of using a node-level local directory for the model data, assuming the models are shared through the node-local directory `/mnt/opea-models`:

```
# On every K8s node, run the following command:
export MODEL_DIR=/mnt/opea-models
# Download the models; assumes the Python huggingface_hub[cli] package is already installed
huggingface-cli download --cache-dir "${MODEL_DIR}" gpt2
huggingface-cli download --cache-dir "${MODEL_DIR}" ${LLM_MODEL_ID}

# On K8s master node, run the following command:
# Install using Helm with the following additional parameters:
helm install ... ... --set global.offline=true,global.modelUseHostPath=${MODEL_DIR}

```

To share the offline model data at the cluster level using a persistent volume (PV), first create a persistent volume claim (PVC) named `opea-model-pvc` to store the model data.
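The chart expects this PVC to exist already. A minimal sketch of creating it is shown below; the access mode, storage size, and reliance on a default StorageClass are illustrative assumptions, so adjust them for your cluster:

```
# Hypothetical PVC manifest; size, access mode and StorageClass are assumptions.
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: opea-model-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 100Gi
EOF
```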

```
# Download model data at the root directory of the corresponding PV
# ... ...
# Install using Helm with the following additional parameters:
# export MODEL_PVC=opea-model-pvc
# helm install ... ... --set global.offline=true,global.modelUsePVC=${MODEL_PVC}
```
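The commented steps above leave the actual model download to the user. One possible approach, sketched below, is a temporary helper pod that mounts `opea-model-pvc` at `/data` and runs the same `huggingface-cli download` commands as in the node-local example. The pod name, container image, and inline `pip install` are illustrative assumptions; `LLM_MODEL_ID` must be exported in the shell running `kubectl`, and the pod must run while access to Hugging Face (or an `HF_ENDPOINT` mirror) is still available:

```
# Hypothetical one-off pod to populate the PV root with model data.
kubectl apply -f - <<EOF
apiVersion: v1
kind: Pod
metadata:
  name: model-downloader
spec:
  restartPolicy: Never
  containers:
    - name: downloader
      image: python:3.11-slim
      command: ["/bin/sh", "-c"]
      args:
        - |
          set -e
          pip install "huggingface_hub[cli]"
          huggingface-cli download --cache-dir /data gpt2
          huggingface-cli download --cache-dir /data ${LLM_MODEL_ID}
      volumeMounts:
        - mountPath: /data
          name: model-volume
  volumes:
    - name: model-volume
      persistentVolumeClaim:
        claimName: opea-model-pvc
EOF
```

Delete the pod once the download completes, e.g. `kubectl delete pod model-downloader`.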

No special step or setting is needed to run the `llm-textgen` or `llm-faqgen` microservices in an air-gapped environment.

## Verify

To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
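For the offline setup specifically, a quick spot check (a sketch only; the resource names below are placeholders that depend on your release name) is to confirm that the rendered ConfigMap no longer contains `HF_TOKEN` and that the `llm-docsum` pod mounts the model volume at `/data`:

```
kubectl get pod
# Locate the chart's ConfigMap, then confirm it has no HF_TOKEN entry
# when global.offline=true (no output from grep is the expected result):
kubectl get configmap
kubectl get configmap <llm-uservice-configmap-name> -o yaml | grep HF_TOKEN
# The llm-docsum container is expected to mount the model volume at /data:
kubectl describe pod <llm-uservice-pod-name> | grep -A 3 "Mounts:"
```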
@@ -99,6 +135,7 @@ curl http://localhost:9000/v1/faqgen \
| TEXTGEN_BACKEND | string | `"TGI"` | backend inference engine, only valid for llm-textgen image, one of "TGI", "vLLM", "BEDROCK" |
| DOCSUM_BACKEND | string | `"TGI"` | backend inference engine, only valid for llm-docsum image, one of "TGI", "vLLM" |
| FAQGEN_BACKEND | string | `"TGI"` | backend inference engine, only valid for llm-faqgen image, one of "TGI", "vLLM" |
| global.offline | bool | `false` | Whether to run the microservice in an air-gapped environment |
| global.monitoring | bool | `false` | Service usage metrics |
| bedrock.BEDROCK_REGION | string | `"us-east-1"` | The AWS Region to use when accessing the Bedrock service |
| bedrock.AWS_ACCESS_KEY_ID | string | `""` | The AWS Access Key to use when authenticating with the Bedrock service. If set, bedrock.AWS_SECRET_ACCESS_KEY must also be set |
2 changes: 2 additions & 0 deletions helm-charts/common/llm-uservice/templates/configmap.yaml
@@ -71,7 +71,9 @@ data:
LLM_MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }}
{{- end }}
HF_HOME: "/tmp/.cache/huggingface"
{{- if not .Values.global.offline }}
HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote }}
{{- end }}
{{- if .Values.global.HF_ENDPOINT }}
HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote }}
{{- end }}
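A quick way to see the effect of this conditional is to render the chart locally. This is only a sketch, assuming it is run from the `helm-charts/common/llm-uservice` directory with no other required values missing:

```
# With offline disabled (default), HF_TOKEN is rendered into the ConfigMap:
helm template . --set global.HUGGINGFACEHUB_API_TOKEN=dummy | grep HF_TOKEN
# With offline enabled, the HF_TOKEN line is expected to be absent (grep prints nothing):
helm template . --set global.offline=true --set global.modelUseHostPath=/mnt/opea-models | grep HF_TOKEN
```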
17 changes: 17 additions & 0 deletions helm-charts/common/llm-uservice/templates/deployment.yaml
@@ -81,6 +81,10 @@ spec:
volumeMounts:
- mountPath: /tmp
name: tmp
{{- if and .Values.global.offline (hasSuffix "llm-docsum" .Values.image.repository) }}
- mountPath: /data
name: model-volume
{{- end }}
{{- if .Values.livenessProbe }}
livenessProbe:
{{- toYaml .Values.livenessProbe | nindent 12 }}
@@ -98,6 +102,19 @@ spec:
volumes:
- name: tmp
emptyDir: {}
{{- if and .Values.global.offline (hasSuffix "llm-docsum" .Values.image.repository) }}
- name: model-volume
{{- if .Values.global.modelUsePVC }}
persistentVolumeClaim:
claimName: {{ .Values.global.modelUsePVC }}
{{- else if .Values.global.modelUseHostPath }}
hostPath:
path: {{ .Values.global.modelUseHostPath }}
type: Directory
{{- else }}
{{- fail "Either global.modelUsePVC or global.modelUseHostPath must be set in offline environment" }}
{{- end }}
{{- end }}
{{- with .Values.nodeSelector }}
nodeSelector:
{{- toYaml . | nindent 8 }}
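The `fail` guard above can be exercised directly with `helm template`; the sketch below (run from the chart directory) renders with offline mode on, an `llm-docsum` image, and neither model source set, which should abort with the error message defined in the template:

```
# Expected to fail with:
#   "Either global.modelUsePVC or global.modelUseHostPath must be set in offline environment"
helm template . \
  --set image.repository=opea/llm-docsum \
  --set global.offline=true
```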
9 changes: 9 additions & 0 deletions helm-charts/common/llm-uservice/values.yaml
@@ -122,6 +122,15 @@ global:
# If set, it will overwrite serviceAccount.name.
# If set, and serviceAccount.create is false, it will assume this service account is already created by others.
sharedSAName: ""
# Running the llm-docsum service in air-gapped (offline) mode
# If offline is enabled, the user must set either modelUseHostPath or modelUsePVC and download the `gpt2` model as well as the model used by the LLM inference backend.
offline: false
# To store offline model data in a local directory on a single-node K8s environment, set modelUseHostPath
# Download offline models: huggingface-cli download --cache-dir <modelUseHostPath> <model>
modelUseHostPath: ""
# To store offline model data in a persistent volume (PV) shared across a multi-node K8s environment, set modelUsePVC
# then download the offline models to the root directory of that PV, as above.
modelUsePVC: ""

# Install Prometheus serviceMonitor for service
monitoring: false
@@ -6,7 +6,7 @@ image:
tag: "latest"

DOCSUM_BACKEND: "vLLM"
LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3"
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
MAX_INPUT_TOKENS: 2048
MAX_TOTAL_TOKENS: 4096
retryTimeoutSeconds: 720
Expand All @@ -19,7 +19,7 @@ vllm:
image:
repository: opea/vllm-gaudi
tag: "latest"
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
OMPI_MCA_btl_vader_single_copy_mechanism: none
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq-len-to-capture","2048"]
resources:
4 changes: 2 additions & 2 deletions helm-charts/common/llm-uservice/vllm-gaudi-values.yaml
@@ -11,7 +11,7 @@ vllm:
image:
repository: opea/vllm-gaudi
tag: "latest"
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
OMPI_MCA_btl_vader_single_copy_mechanism: none
extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq-len-to-capture","2048"]
resources:
@@ -21,5 +21,5 @@ vllm:
failureThreshold: 360

TEXTGEN_BACKEND: "vLLM"
LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
retryTimeoutSeconds: 720