Add TRT-LLM Gen. AI Autoscaling & Load Balancing Guide

This change adds a guide for deploying autoscaling and load balancing of TensorRT-LLM generative AI models. It includes: guidance; a Helm chart with multiple example model values files; the YAML files necessary for setting up a Kubernetes cluster; build files for the required container images; and a Grafana dashboard configuration JSON file.
triton-inference-server · May 28, 2024 · a01c20f · a01c20f
1 parent b3759c8
commit a01c20f
Show file tree

Hide file tree

Showing 40 changed files with 4,930 additions and 1 deletion.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -65,7 +65,7 @@ repos:
   - id: check-json
   - id: check-toml
   - id: check-yaml
-    exclude: ^deploy(\/[^\/]+)*\/templates\/.*$
+    exclude: ^Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/.+$
   - id: check-shebang-scripts-are-executable
   - id: end-of-file-fixer
     types_or: [c, c++, cuda, proto, textproto, java, python]

diff --git a/Deployment/Kubernetes/README.md b/Deployment/Kubernetes/README.md
@@ -0,0 +1,3 @@
+# Kubernetes Deployment of Triton Server Guides
+
+* [TensorRT-LLM Gen. AI Autoscaling &amp; Load Balancing](./TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md)
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/README.md
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/Chart.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v2
+appVersion: "0.1.0"  # quoted so it is always handled as a string
+description: Triton + TensorRT-LLM autoscaling and load balancing example.
+icon: https://www.nvidia.com/content/dam/en-zz/Solutions/about-nvidia/logo-and-brand/01-nvidia-logo-vert-500x200-2c50-d@2x.png
+name: triton_trt-llm_aslb-example
+version: 0.1.0
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/gpt2_values.yaml
@@ -0,0 +1,20 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - Tesla-T4
+  - Tesla-V100-SXM2-16GB
+
+model:
+  name: gpt2  # model identifier consumed by the chart
diff --git a/.../Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml b/.../Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b-chat_values.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - Tesla-V100-SXM2-16GB
+
+model:
+  name: llama-2-7b-chat
+  pullSecret: hf-model-pull  # presumably names the secret used to pull the model; confirm
+  tensorrtLlm:
+    parallelism:
+      tensor: 2  # tensor-parallel degree; presumably one GPU per shard, confirm
+
+autoscaling:
+  metric:
+    value: '1500m'  # if read as a Kubernetes quantity this is 1.5; confirm in values.yaml
diff --git a/...yment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml b/...yment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-2-7b_values.yaml
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - NVIDIA-A10G
+  - NVIDIA-A100-SXM4-40GB
+
+model:
+  name: llama-2-7b
+  pullSecret: hf-model-pull  # presumably names the secret used to pull the model; confirm
diff --git a/...rnetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml b/...rnetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-70b-instruct_values.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - NVIDIA-A100-SXM4-40GB
+
+model:
+  name: llama-3-70b-instruct
+  pullSecret: hf-model-pull  # presumably names the secret used to pull the model; confirm
+  tensorrtLlm:
+    parallelism:
+      tensor: 8  # tensor-parallel degree; presumably one GPU per shard, confirm
+
+autoscaling:
+  metric:
+    value: '3500m'  # if read as a Kubernetes quantity this is 3.5; confirm in values.yaml
diff --git a/...ernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml b/...ernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b-instruct_values.yaml
@@ -0,0 +1,29 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - Tesla-V100-SXM2-16GB
+
+model:
+  name: llama-3-8b-instruct
+  pullSecret: hf-model-pull  # presumably names the secret used to pull the model; confirm
+  tensorrtLlm:
+    parallelism:
+      tensor: 2  # tensor-parallel degree; presumably one GPU per shard, confirm
+
+autoscaling:
+  metric:
+    value: '1500m'  # if read as a Kubernetes quantity this is 1.5; confirm in values.yaml
diff --git a/...yment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml b/...yment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/llama-3-8b_values.yaml
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - NVIDIA-A10G
+  - NVIDIA-A100-SXM4-40GB
+
+model:
+  name: llama-3-8b
+  pullSecret: hf-model-pull  # presumably names the secret used to pull the model; confirm
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/opt125m_values.yaml
@@ -0,0 +1,23 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# See values.yaml for reference values.
+
+gpu:  # presumably the GPU types this model may be deployed on; confirm in values.yaml
+  - Tesla-V100-SXM2-16GB
+  - Tesla-T4
+
+model:
+  name: opt125m
+  pullSecret: hf-model-pull  # presumably names the secret used to pull the model; confirm
diff --git a/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt b/Deployment/Kubernetes/TensorRT-LLM_Autoscaling_and_Load_Balancing/chart/templates/NOTES.txt
@@ -0,0 +1,12 @@
+{{- /* Post-install summary: identifies the release and lists follow-up commands. */ -}}{{ .Chart.Name }} ({{ .Chart.Version }}) installation complete.
+
+Release Name: {{ .Release.Name }}
+Namespace: {{ .Release.Namespace }}
+Deployment Name: {{ .Release.Name }}
+Service Name: {{ .Release.Name }}
+
+Helpful commands:
+
+  $ helm status --namespace={{ .Release.Namespace }} {{ .Release.Name }}
+  $ helm get --namespace={{ .Release.Namespace }} all {{ .Release.Name }}
+  $ kubectl get --namespace={{ .Release.Namespace }} --selector='app={{ .Release.Name }}' deployments,pods,hpa,services,podmonitors