diff --git a/helm-charts/common/llm-uservice/README.md b/helm-charts/common/llm-uservice/README.md
index 570fc3af5..aed911c32 100644
--- a/helm-charts/common/llm-uservice/README.md
+++ b/helm-charts/common/llm-uservice/README.md
@@ -60,6 +60,42 @@ helm install llm-uservice . --set TEXTGEN_BACKEND="BEDROCK" --set LLM_MODEL_ID=$
 # helm install llm-uservice . --set image.repository="opea/llm-faqgen" --set FAQGEN_BACKEND="vLLM" --set LLM_ENDPOINT=${LLM_ENDPOINT} --set LLM_MODEL_ID=${LLM_MODEL_ID} --set global.HUGGINGFACEHUB_API_TOKEN=${HFTOKEN} --wait
 ```
 
+### Install the microservice in air gapped (offline) mode
+
+To run the `llm-docsum` microservice in an air gapped environment, users must pre-download the following models to shared storage:
+
+- gpt2
+- the same model used by the inference backend engine
+
+Below is an example of using a node-level local directory for the model data.
+
+Assuming the model data is shared via the node-local directory `/mnt/opea-models`:
+
+```
+# On every K8s node, run the following commands:
+export MODEL_DIR=/mnt/opea-models
+# Download the models (assumes the Python huggingface_hub[cli] package is already installed)
+huggingface-cli download --cache-dir "${MODEL_DIR}" gpt2
+huggingface-cli download --cache-dir "${MODEL_DIR}" ${LLM_MODEL_ID}
+
+# On the K8s master node, run the following command:
+# Install using Helm with the following additional parameters:
+helm install ... ... --set global.offline=true,global.modelUseHostPath=${MODEL_DIR}
+
+```
+
+Assuming instead that the offline data is shared at cluster level using a persistent volume (PV), first create a persistent volume claim (PVC) named `opea-model-pvc` to store the model data.
+
+```
+# Download the model data to the root directory of the corresponding PV
+# ... ...
+# Install using Helm with the following additional parameters:
+# export MODEL_PVC=opea-model-pvc
+# helm install ... ... --set global.offline=true,global.modelUsePVC=${MODEL_PVC}
+```
+
+No special steps or settings are needed to run the `llm-textgen` or `llm-faqgen` microservices in an air gapped environment.
+
 ## Verify
 
 To verify the installation, run the command `kubectl get pod` to make sure all pods are running.
@@ -99,6 +135,7 @@ curl http://localhost:9000/v1/faqgen \
 | TEXTGEN_BACKEND | string | `"TGI"` | backend inference engine, only valid for llm-textgen image, one of "TGI", "vLLM", "BEDROCK" |
 | DOCSUM_BACKEND | string | `"TGI"` | backend inference engine, only valid for llm-docsum image, one of "TGI", "vLLM" |
 | FAQGEN_BACKEND | string | `"TGI"` | backend inference engine, only valid for llm-faqgen image, one of "TGi", "vLLM" |
+| global.offline | bool | `false` | Whether to run the microservice in an air gapped environment |
 | global.monitoring | bool | `false` | Service usage metrics |
 | bedrock.BEDROCK_REGION | string | `"us-east-1"` | The AWS Region to use when accessing the Bedrock service |
 | bedrock.AWS_ACCESS_KEY_ID | string | `""` | The AWS Access Key to use when authenticating with the Bedrock service. If set, bedrock.AWS_SECRET_ACCESS_KEY must also be set |
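The PVC flow described in the README addition assumes `opea-model-pvc` already exists before `helm install`; the chart itself does not create it. A minimal sketch of creating such a claim follows; the access mode, size, and reliance on the cluster's default storage class are assumptions, not values mandated by this chart.

```
# Hypothetical PVC for the shared model store; adjust accessModes, storage size
# and (if needed) storageClassName to what the cluster actually provides.
kubectl apply -f - <<EOF
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: opea-model-pvc
spec:
  accessModes:
    - ReadWriteMany
  resources:
    requests:
      storage: 100Gi
EOF
```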
diff --git a/helm-charts/common/llm-uservice/templates/configmap.yaml b/helm-charts/common/llm-uservice/templates/configmap.yaml
index 33a6fe994..e0b90cba8 100644
--- a/helm-charts/common/llm-uservice/templates/configmap.yaml
+++ b/helm-charts/common/llm-uservice/templates/configmap.yaml
@@ -71,7 +71,9 @@ data:
   LLM_MODEL_ID: {{ .Values.LLM_MODEL_ID | quote }}
   {{- end }}
   HF_HOME: "/tmp/.cache/huggingface"
+  {{- if not .Values.global.offline }}
   HF_TOKEN: {{ .Values.global.HUGGINGFACEHUB_API_TOKEN | quote }}
+  {{- end }}
   {{- if .Values.global.HF_ENDPOINT }}
   HF_ENDPOINT: {{ .Values.global.HF_ENDPOINT | quote }}
   {{- end }}
diff --git a/helm-charts/common/llm-uservice/templates/deployment.yaml b/helm-charts/common/llm-uservice/templates/deployment.yaml
index e5d4acaaa..d9c65056f 100644
--- a/helm-charts/common/llm-uservice/templates/deployment.yaml
+++ b/helm-charts/common/llm-uservice/templates/deployment.yaml
@@ -81,6 +81,10 @@ spec:
           volumeMounts:
             - mountPath: /tmp
               name: tmp
+            {{- if and .Values.global.offline (hasSuffix "llm-docsum" .Values.image.repository) }}
+            - mountPath: /data
+              name: model-volume
+            {{- end }}
           {{- if .Values.livenessProbe }}
           livenessProbe:
             {{- toYaml .Values.livenessProbe | nindent 12 }}
@@ -98,6 +102,19 @@ spec:
       volumes:
         - name: tmp
           emptyDir: {}
+        {{- if and .Values.global.offline (hasSuffix "llm-docsum" .Values.image.repository) }}
+        - name: model-volume
+          {{- if .Values.global.modelUsePVC }}
+          persistentVolumeClaim:
+            claimName: {{ .Values.global.modelUsePVC }}
+          {{- else if .Values.global.modelUseHostPath }}
+          hostPath:
+            path: {{ .Values.global.modelUseHostPath }}
+            type: Directory
+          {{- else }}
+          {{- fail "Either global.modelUsePVC or global.modelUseHostPath must be set in offline environment" }}
+          {{- end }}
+        {{- end }}
       {{- with .Values.nodeSelector }}
      nodeSelector:
        {{- toYaml . | nindent 8 }}
diff --git a/helm-charts/common/llm-uservice/values.yaml b/helm-charts/common/llm-uservice/values.yaml
index fbb512119..fccd0ae8f 100644
--- a/helm-charts/common/llm-uservice/values.yaml
+++ b/helm-charts/common/llm-uservice/values.yaml
@@ -122,6 +122,15 @@ global:
   # If set, it will overwrite serviceAccount.name.
   # If set, and serviceAccount.create is false, it will assume this service account is already created by others.
   sharedSAName: ""
+  # Run the llm-docsum service in air gapped (offline) mode
+  # If offline is enabled, the user must set either modelUseHostPath or modelUsePVC and download the `gpt2` model as well as the model used by the LLM inference backend.
+  offline: false
+  # To store offline model data in a local directory for a single-node K8s environment, set modelUseHostPath
+  # Download offline models: huggingface-cli download --cache-dir <modelUseHostPath> <model_id>
+  modelUseHostPath: ""
+  # To store offline model data in a persistent volume (PV) shared by a multi-node K8s environment, set modelUsePVC,
+  # then download the offline models to the root directory of that PV, as above.
+  modelUsePVC: ""
 
   # Install Prometheus serviceMonitor for service
   monitoring: false
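A quick way to check the offline wiring above is to render the chart with the offline values and inspect the output. The command below is only a sketch: the release name, the `opea/llm-docsum` image repository, and the assumption that chart dependencies have already been fetched (`helm dependency update`) are illustrative; the MAX token values mirror the DocSum values file in this change.

```
# Render the chart in offline hostPath mode and confirm that HF_TOKEN is
# omitted from the ConfigMap while the model-volume mount at /data is present.
helm template llm-uservice . \
  --set image.repository="opea/llm-docsum" \
  --set DOCSUM_BACKEND="vLLM" \
  --set LLM_ENDPOINT=${LLM_ENDPOINT} \
  --set LLM_MODEL_ID=${LLM_MODEL_ID} \
  --set MAX_INPUT_TOKENS=2048 \
  --set MAX_TOTAL_TOKENS=4096 \
  --set global.offline=true \
  --set global.modelUseHostPath=/mnt/opea-models \
  | grep -E "HF_TOKEN|model-volume|/data"
```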
diff --git a/helm-charts/common/llm-uservice/vllm-docsum-gaudi-values.yaml b/helm-charts/common/llm-uservice/vllm-docsum-gaudi-values.yaml
index 04201ad58..ba93ff61b 100644
--- a/helm-charts/common/llm-uservice/vllm-docsum-gaudi-values.yaml
+++ b/helm-charts/common/llm-uservice/vllm-docsum-gaudi-values.yaml
@@ -6,7 +6,7 @@ image:
   tag: "latest"
 
 DOCSUM_BACKEND: "vLLM"
-LLM_MODEL_ID: "Intel/neural-chat-7b-v3-3"
+LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
 MAX_INPUT_TOKENS: 2048
 MAX_TOTAL_TOKENS: 4096
 retryTimeoutSeconds: 720
@@ -19,7 +19,7 @@ vllm:
   image:
     repository: opea/vllm-gaudi
     tag: "latest"
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
   OMPI_MCA_btl_vader_single_copy_mechanism: none
   extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq-len-to-capture","2048"]
   resources:
diff --git a/helm-charts/common/llm-uservice/vllm-gaudi-values.yaml b/helm-charts/common/llm-uservice/vllm-gaudi-values.yaml
index 7fa2f9213..10ce7074d 100644
--- a/helm-charts/common/llm-uservice/vllm-gaudi-values.yaml
+++ b/helm-charts/common/llm-uservice/vllm-gaudi-values.yaml
@@ -11,7 +11,7 @@ vllm:
   image:
     repository: opea/vllm-gaudi
     tag: "latest"
-  LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+  LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
   OMPI_MCA_btl_vader_single_copy_mechanism: none
   extraCmdArgs: ["--tensor-parallel-size","1","--block-size","128","--max-num-seqs","256","--max-seq-len-to-capture","2048"]
   resources:
@@ -21,5 +21,5 @@ vllm:
     failureThreshold: 360
 
 TEXTGEN_BACKEND: "vLLM"
-LLM_MODEL_ID: Intel/neural-chat-7b-v3-3
+LLM_MODEL_ID: meta-llama/Meta-Llama-3-8B-Instruct
 retryTimeoutSeconds: 720
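One operational note on the default model change in the two Gaudi values files: `meta-llama/Meta-Llama-3-8B-Instruct` is a gated model on Hugging Face, so the pre-download step used for air gapped deployments needs a token from an account that has been granted access. A sketch, reusing the `MODEL_DIR` and `HFTOKEN` variables from the README examples:

```
# Authenticate before fetching the gated Llama 3 model into the shared model store.
export MODEL_DIR=/mnt/opea-models
huggingface-cli download --token "${HFTOKEN}" --cache-dir "${MODEL_DIR}" meta-llama/Meta-Llama-3-8B-Instruct
```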