diff --git a/ChatQnA/falcon_vllm_v2.yaml b/ChatQnA/falcon_vllm_v2.yaml new file mode 100644 index 0000000000..de7cb3e2b8 --- /dev/null +++ b/ChatQnA/falcon_vllm_v2.yaml @@ -0,0 +1,523 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL: "/data/falcon-three-10b" + LLM_MODEL_ID: "/data/falcon-three-10b" + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + + LOG_FALG: "true" + VLLM_LOGGING_LEVEL: DEBUG + VLLM_SKIP_WARMUP: "true" + LLM_SERVER_PORT: '9009' + RERANK_SERVER_PORT: '8808' + EMBEDDING_SERVER_PORT: '6006' + +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-template1024:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - "--model" + - "/data/falcon-three-10b" + - "--tensor-parallel-size" + - "1" + - "--gpu-memory-utilization" + - "0.5" + - "--max-num-seqs" + - "256" + - "--block-size" + - "128" + - "--max-seq-len-to-capture" + - "2048" + - "--host" + - "0.0.0.0" + - "--port" + - "80" + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: opea/vllm-hpu:latest + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:1.5.0 + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- + diff --git a/ChatQnA/falcon_vllm_v2_7b.yaml b/ChatQnA/falcon_vllm_v2_7b.yaml new file mode 100644 index 0000000000..d1bc4b1a33 --- /dev/null +++ b/ChatQnA/falcon_vllm_v2_7b.yaml @@ -0,0 +1,521 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL: "/data/falcon-three-7b" + LLM_MODEL_ID: "/data/falcon-three-7b" + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + + VLLM_SKIP_WARMUP: "true" + LLM_SERVER_PORT: '9009' + RERANK_SERVER_PORT: '8808' + EMBEDDING_SERVER_PORT: '6006' + +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna-template1024:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - "--model" + - "/data/falcon-three-7b" + - "--tensor-parallel-size" + - "1" + - "--gpu-memory-utilization" + - "0.5" + - "--max-num-seqs" + - "256" + - "--block-size" + - "128" + - "--max-seq-len-to-capture" + - "2048" + - "--host" + - "0.0.0.0" + - "--port" + - "80" + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: opea/vllm-hpu:latest + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:1.5.0 + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- + diff --git a/ChatQnA/neuralchat_vllm.yaml b/ChatQnA/neuralchat_vllm.yaml new file mode 100644 index 0000000000..e547fae665 --- /dev/null +++ b/ChatQnA/neuralchat_vllm.yaml @@ -0,0 +1,520 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: v1 +data: + EMBEDDING_MODEL_ID: BAAI/bge-base-en-v1.5 + EMBEDDING_SERVER_HOST_IP: embedding-dependency-svc + HUGGINGFACEHUB_API_TOKEN: ${HF_TOKEN} + INDEX_NAME: rag-redis + LLM_MODEL_ID: Intel/neural-chat-7b-v3-3 + LLM_MODEL: Intel/neural-chat-7b-v3-3 + LLM_SERVER_HOST_IP: llm-dependency-svc + NODE_SELECTOR: opea + REDIS_URL: redis://vector-db.default.svc.cluster.local:6379 + RERANK_MODEL_ID: BAAI/bge-reranker-base + RERANK_SERVER_HOST_IP: reranking-dependency-svc + RETRIEVER_SERVICE_HOST_IP: retriever-svc + TEI_EMBEDDING_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_ENDPOINT: http://embedding-dependency-svc.default.svc.cluster.local:6006 + TEI_RERANKING_ENDPOINT: http://reranking-dependency-svc.default.svc.cluster.local:8808 + TGI_LLM_ENDPOINT: http://llm-dependency-svc.default.svc.cluster.local:9009 + VLLM_SKIP_WARMUP: "true" + LLM_SERVER_PORT: '9009' + RERANK_SERVER_PORT: '8808' + EMBEDDING_SERVER_PORT: '6006' + +kind: ConfigMap +metadata: + name: qna-config + namespace: default +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: chatqna-backend-server-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: chatqna-backend-server-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: chatqna-backend-server-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/chatqna:latest + imagePullPolicy: IfNotPresent + name: chatqna-backend-server-deploy + ports: + - containerPort: 8888 + resources: + limits: + cpu: 8 + memory: 8000Mi + requests: + cpu: 8 + memory: 8000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: chatqna-backend-server-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: chatqna-backend-server-svc + namespace: default +spec: + ports: + - name: service + nodePort: 30888 + port: 8888 + targetPort: 8888 + selector: + app: chatqna-backend-server-deploy + type: NodePort +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: dataprep-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: dataprep-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: dataprep-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/dataprep-redis:latest + imagePullPolicy: IfNotPresent + name: dataprep-deploy + ports: + - containerPort: 6007 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: dataprep-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: dataprep-svc + namespace: default +spec: + ports: + - name: port1 + port: 6007 + targetPort: 6007 + selector: + app: dataprep-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: embedding-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: embedding-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: embedding-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(EMBEDDING_MODEL_ID) + - --auto-truncate + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 + imagePullPolicy: IfNotPresent + name: embedding-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + cpu: 80 + memory: 20000Mi + requests: + cpu: 80 + memory: 20000Mi + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: embedding-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: embedding-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 6006 + targetPort: 80 + selector: + app: embedding-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: llm-dependency-deploy + namespace: default +spec: + replicas: 7 + selector: + matchLabels: + app: llm-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: llm-dependency-deploy + spec: + containers: + - args: + - "--model" + - "Intel/neural-chat-7b-v3-3" + - "--tensor-parallel-size" + - "1" + - "--gpu-memory-utilization" + - "0.5" + - "--max-num-seqs" + - "256" + - "--block-size" + - "128" + - "--max-seq-len-to-capture" + - "2048" + - "--host" + - "0.0.0.0" + - "--port" + - "80" + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + envFrom: + - configMapRef: + name: qna-config + image: opea/vllm-hpu:latest + imagePullPolicy: IfNotPresent + name: llm-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + securityContext: + capabilities: + add: + - SYS_NICE + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: llm-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: llm-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 9009 + targetPort: 80 + selector: + app: llm-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: reranking-dependency-deploy + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: reranking-dependency-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: reranking-dependency-deploy + spec: + containers: + - args: + - --model-id + - $(RERANK_MODEL_ID) + - --auto-truncate + env: + - name: OMPI_MCA_btl_vader_single_copy_mechanism + value: none + - name: PT_HPU_ENABLE_LAZY_COLLECTIVES + value: 'true' + - name: runtime + value: habana + - name: HABANA_VISIBLE_DEVICES + value: all + - name: HF_TOKEN + value: ${HF_TOKEN} + - name: MAX_WARMUP_SEQUENCE_LENGTH + value: '512' + envFrom: + - configMapRef: + name: qna-config + image: ghcr.io/huggingface/tei-gaudi:1.5.0 + imagePullPolicy: IfNotPresent + name: reranking-dependency-deploy + ports: + - containerPort: 80 + resources: + limits: + habana.ai/gaudi: 1 + volumeMounts: + - mountPath: /data + name: model-volume + - mountPath: /dev/shm + name: shm + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: reranking-dependency-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway + volumes: + - hostPath: + path: /mnt/models + type: Directory + name: model-volume + - emptyDir: + medium: Memory + sizeLimit: 1Gi + name: shm +--- +apiVersion: v1 +kind: Service +metadata: + name: reranking-dependency-svc + namespace: default +spec: + ports: + - name: service + port: 8808 + targetPort: 80 + selector: + app: reranking-dependency-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: retriever-deploy + namespace: default +spec: + replicas: 2 + selector: + matchLabels: + app: retriever-deploy + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: retriever-deploy + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: opea/retriever-redis:latest + imagePullPolicy: IfNotPresent + name: retriever-deploy + ports: + - containerPort: 7000 + resources: + requests: + cpu: 4 + memory: 4000Mi + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: retriever-deploy + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: retriever-svc + namespace: default +spec: + ports: + - name: service + port: 7000 + targetPort: 7000 + selector: + app: retriever-deploy + type: ClusterIP +--- +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vector-db + namespace: default +spec: + replicas: 1 + selector: + matchLabels: + app: vector-db + template: + metadata: + annotations: + sidecar.istio.io/rewriteAppHTTPProbers: 'true' + labels: + app: vector-db + spec: + containers: + - envFrom: + - configMapRef: + name: qna-config + image: redis/redis-stack:7.2.0-v9 + imagePullPolicy: IfNotPresent + name: vector-db + ports: + - containerPort: 6379 + - containerPort: 8001 + hostIPC: true + nodeSelector: + node-type: chatqna-opea + serviceAccountName: default + topologySpreadConstraints: + - labelSelector: + matchLabels: + app: vector-db + maxSkew: 1 + topologyKey: kubernetes.io/hostname + whenUnsatisfiable: ScheduleAnyway +--- +apiVersion: v1 +kind: Service +metadata: + name: vector-db + namespace: default +spec: + ports: + - name: vector-db-service + port: 6379 + targetPort: 6379 + - name: vector-db-insight + port: 8001 + targetPort: 8001 + selector: + app: vector-db + type: ClusterIP +--- +