[Cherry Pick] Multi node implementation (#434)
* Multi-Node Inference Implementation (kserve#3972)

Signed-off-by: jooho lee <[email protected]>

* fix lint and unit test for odh

Signed-off-by: jooho lee <[email protected]>

---------

Signed-off-by: jooho lee <[email protected]>
Jooho authored Nov 7, 2024
1 parent 89922f6 commit ee9a342
Showing 43 changed files with 8,394 additions and 348 deletions.
@@ -3218,7 +3218,9 @@ spec:
additionalProperties:
type: string
type: object
- size:
+ pipelineParallelSize:
type: integer
+ tensorParallelSize:
+ type: integer
tolerations:
items:
6 changes: 4 additions & 2 deletions config/crd/full/serving.kserve.io_inferenceservices.yaml
@@ -16027,6 +16027,8 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
+ pipelineParallelSize:
+ type: integer
preemptionPolicy:
type: string
priority:
@@ -16163,10 +16165,10 @@ spec:
type: boolean
shareProcessNamespace:
type: boolean
- size:
- type: integer
subdomain:
type: string
+ tensorParallelSize:
+ type: integer
terminationGracePeriodSeconds:
format: int64
type: integer
4 changes: 3 additions & 1 deletion config/crd/full/serving.kserve.io_servingruntimes.yaml
@@ -3218,7 +3218,9 @@ spec:
additionalProperties:
type: string
type: object
- size:
+ pipelineParallelSize:
type: integer
+ tensorParallelSize:
+ type: integer
tolerations:
items:
180 changes: 180 additions & 0 deletions config/runtimes/kserve-huggingfaceserver-multinode.yaml
@@ -0,0 +1,180 @@
apiVersion: serving.kserve.io/v1alpha1
kind: ClusterServingRuntime
metadata:
name: kserve-huggingfaceserver-multinode
spec:
annotations:
prometheus.kserve.io/port: "8080"
prometheus.kserve.io/path: "/metrics"
supportedModelFormats:
- name: huggingface
version: "1"
autoSelect: true
priority: 2
protocolVersions:
- v2
- v1
containers:
- name: kserve-container
image: kserve/huggingfaceserver:latest
command: ["bash", "-c"]
args:
- |
ray start --head --disable-usage-stats --include-dashboard false
# Wait until all worker nodes have joined the Ray cluster
until [[ $(ray status | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do
echo "Waiting..."
sleep 1
done
ray status
# Prefer a local model directory over the Hub model id when one is set
export MODEL=${MODEL_ID}
if [[ -n ${MODEL_DIR} ]]
then
MODEL=${MODEL_DIR}
fi
python3 -m huggingfaceserver --model_name=${MODEL_NAME} --model_dir=${MODEL} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE}
env:
- name: RAY_PORT
value: "6379"
- name: RAY_ADDRESS
value: 127.0.0.1:6379
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: VLLM_CONFIG_ROOT
value: /tmp
- name: HF_HUB_CACHE
value: /tmp
resources:
requests:
cpu: "2"
memory: 6Gi
limits:
cpu: "4"
memory: 12Gi
volumeMounts:
- name: shm
mountPath: /dev/shm
livenessProbe:
failureThreshold: 3
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
initialDelaySeconds: 10
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py liveness
readinessProbe:
failureThreshold: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
initialDelaySeconds: 10
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py readiness ${PIPELINE_PARALLEL_SIZE} http://localhost:8080
startupProbe:
failureThreshold: 40
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
initialDelaySeconds: 5
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py startup
volumes:
- name: shm
emptyDir:
medium: Memory
sizeLimit: 3Gi
workerSpec:
pipelineParallelSize: 2
tensorParallelSize: 1
containers:
- name: worker-container
image: kserve/huggingfaceserver:latest
command: ["bash", "-c"]
args:
- |
# Wait for the head node's Global Control Service (GCS) to accept connections;
# suppress health-check output for the first 120 seconds, then surface it to aid debugging.
SECONDS=0
while true; do
if (( SECONDS <= 120 )); then
if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then
echo "Global Control Service (GCS) is ready."
break
fi
echo "$SECONDS seconds elapsed: Waiting for Global Control Service (GCS) to be ready."
else
if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then
echo "Global Control Service (GCS) is ready. Any error messages above can be safely ignored."
break
fi
echo "$SECONDS seconds elapsed: Still waiting for Global Control Service (GCS) to be ready."
fi
sleep 5
done
RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"
echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..."
ray start --address="$RAY_HEAD_ADDRESS" --block
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
resources:
requests:
cpu: "2"
memory: 6Gi
limits:
cpu: "4"
memory: 12Gi
volumeMounts:
- name: shm
mountPath: /dev/shm
livenessProbe:
failureThreshold: 3
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 5
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py registered_nodes ${PIPELINE_PARALLEL_SIZE}
startupProbe:
failureThreshold: 12
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 5
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py startup
volumes:
- name: shm
emptyDir:
medium: Memory
sizeLimit: 3Gi
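
For orientation, a minimal sketch of an InferenceService that selects this runtime (the service name and PVC path are hypothetical; per the validation rules added in pkg/apis/serving/v1beta1 below, multi-node serving requires a pvc:// storageUri and the external autoscaler):

apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-llama3
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      runtime: kserve-huggingfaceserver-multinode
      storageUri: pvc://llama-model-pvc/llama-3-8b-instruct
    workerSpec:
      pipelineParallelSize: 2  # total Ray nodes; the head waits until this many have joined
      tensorParallelSize: 1    # GPUs per node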
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
@@ -12,6 +12,7 @@ resources:
- kserve-lgbserver.yaml
- kserve-torchserve.yaml
- kserve-huggingfaceserver.yaml
+ - kserve-huggingfaceserver-multinode.yaml

images:
# SMS Only Runtimes
15 changes: 13 additions & 2 deletions pkg/apis/serving/v1alpha1/servingruntime_types.go
@@ -271,9 +271,16 @@ type SupportedRuntime struct {
type WorkerSpec struct {
ServingRuntimePodSpec `json:",inline"`

- // Configure the number of replicas in the worker set, each worker set represents the unit of scaling
+ // PipelineParallelSize defines the number of parallel workers.
+ // It specifies the number of model partitions across multiple devices, allowing large models to be split and processed concurrently across these partitions.
+ // It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit.
// +optional
- Size int `json:"size,omitempty"`
+ PipelineParallelSize *int `json:"pipelineParallelSize,omitempty"`
+
+ // TensorParallelSize specifies the number of GPUs to be used per node.
+ // It indicates the degree of parallelism for tensor computations across the available GPUs.
+ // +optional
+ TensorParallelSize *int `json:"tensorParallelSize,omitempty"`
}

func init() {
@@ -289,6 +296,10 @@ func (srSpec *ServingRuntimeSpec) IsMultiModelRuntime() bool {
return srSpec.MultiModel != nil && *srSpec.MultiModel
}

+ func (srSpec *ServingRuntimeSpec) IsMultiNodeRuntime() bool {
+ 	return srSpec.WorkerSpec != nil
+ }

func (srSpec *ServingRuntimeSpec) IsProtocolVersionSupported(modelProtocolVersion constants.InferenceServiceProtocol) bool {
if len(modelProtocolVersion) == 0 || srSpec.ProtocolVersions == nil || len(srSpec.ProtocolVersions) == 0 {
return true
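
A hedged migration sketch for existing manifests (values illustrative): the former workerSpec.size field is replaced by the two parallelism fields, now pointers so that an omitted value stays nil and can be defaulted rather than being read as an explicit 0:

# before
workerSpec:
  size: 2
# after
workerSpec:
  pipelineParallelSize: 2  # worker-set replicas / model partitions
  tensorParallelSize: 1    # GPUs per node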
10 changes: 10 additions & 0 deletions pkg/apis/serving/v1alpha1/zz_generated.deepcopy.go

(Generated deepcopy code; diff not rendered by default.)

28 changes: 19 additions & 9 deletions pkg/apis/serving/v1beta1/component.go
@@ -30,15 +30,25 @@ import (

// Known error messages
const (
- MinReplicasShouldBeLessThanMaxError = "'MinReplicas' cannot be greater than MaxReplicas"
- MinReplicasLowerBoundExceededError = "'MinReplicas' cannot be less than 0"
- MaxReplicasLowerBoundExceededError = "'MaxReplicas' cannot be less than 0"
- ParallelismLowerBoundExceededError = "parallelism cannot be less than 0"
- UnsupportedStorageURIFormatError = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported"
- UnsupportedStorageSpecFormatError = "storage.spec.type, must be one of: [%s]. storage.spec.type [%s] is not supported"
- InvalidLoggerType = "invalid logger type"
- InvalidISVCNameFormatError = "the InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')"
- InvalidProtocol = "invalid protocol %s. Must be one of [%s]"
+ MinReplicasShouldBeLessThanMaxError = "'MinReplicas' cannot be greater than MaxReplicas"
+ MinReplicasLowerBoundExceededError = "'MinReplicas' cannot be less than 0"
+ MaxReplicasLowerBoundExceededError = "'MaxReplicas' cannot be less than 0"
+ ParallelismLowerBoundExceededError = "parallelism cannot be less than 0"
+ UnsupportedStorageURIFormatError = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported"
+ UnsupportedStorageSpecFormatError = "storage.spec.type, must be one of: [%s]. storage.spec.type [%s] is not supported"
+ InvalidLoggerType = "invalid logger type"
+ InvalidISVCNameFormatError = "the InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')"
+ InvalidProtocol = "invalid protocol %s. Must be one of [%s]"
+ MissingStorageURI = "the InferenceService %q is invalid: StorageURI must be set for multinode enabled"
+ InvalidAutoScalerError = "the InferenceService %q is invalid: Multinode only supports 'external' autoscaler(%s)"
+ InvalidNotSupportedStorageURIProtocolError = "the InferenceService %q is invalid: Multinode only supports 'pvc' Storage Protocol(%s)"
+ InvalidCustomGPUTypesAnnotationFormatError = "the InferenceService %q is invalid: invalid format for %s annotation: must be a valid JSON array"
+ InvalidUnknownGPUTypeError = "the InferenceService %q is invalid: Unknown GPU resource type. Set 'serving.kserve.io/gpu-resource-types' annotation to use custom gpu resource type"
+ InvalidWorkerSpecPipelineParallelSizeValueError = "the InferenceService %q is invalid: WorkerSpec.PipelineParallelSize cannot be less than 2(%s)"
+ InvalidWorkerSpecTensorParallelSizeValueError = "the InferenceService %q is invalid: WorkerSpec.TensorParallelSize cannot be less than 1(%s)"
+ DisallowedMultipleContainersInWorkerSpecError = "the InferenceService %q is invalid: setting multiple containers in workerSpec is not allowed"
+ DisallowedWorkerSpecPipelineParallelSizeEnvError = "the InferenceService %q is invalid: setting PIPELINE_PARALLEL_SIZE in environment variables is not allowed"
+ DisallowedWorkerSpecTensorParallelSizeEnvError = "the InferenceService %q is invalid: setting TENSOR_PARALLEL_SIZE in environment variables is not allowed"
)

// Constants
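
To illustrate the two new bounds checks, a hypothetical workerSpec that the webhook would reject with the InvalidWorkerSpec*ValueError messages above:

workerSpec:
  pipelineParallelSize: 1  # rejected: cannot be less than 2
  tensorParallelSize: 0    # rejected: cannot be less than 1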
