[Cherry Pick] Multi node implementation (#434)
* Multi-Node Inference Implementation (kserve#3972)

Signed-off-by: jooho lee <[email protected]>

* fix lint and unit test for odh

Signed-off-by: jooho lee <[email protected]>

---------

Signed-off-by: jooho lee <[email protected]>
Jooho authored Nov 7, 2024
1 parent 89922f6 commit ee9a342
Showing 43 changed files with 8,394 additions and 348 deletions.
@@ -3218,7 +3218,9 @@ spec:
additionalProperties:
type: string
type: object
- size:
+ pipelineParallelSize:
type: integer
+ tensorParallelSize:
+ type: integer
tolerations:
items:
6 changes: 4 additions & 2 deletions config/crd/full/serving.kserve.io_inferenceservices.yaml
@@ -16027,6 +16027,8 @@ spec:
pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$
x-kubernetes-int-or-string: true
type: object
+ pipelineParallelSize:
+ type: integer
preemptionPolicy:
type: string
priority:
@@ -16163,10 +16165,10 @@ spec:
type: boolean
shareProcessNamespace:
type: boolean
- size:
- type: integer
subdomain:
type: string
+ tensorParallelSize:
+ type: integer
terminationGracePeriodSeconds:
format: int64
type: integer
4 changes: 3 additions & 1 deletion config/crd/full/serving.kserve.io_servingruntimes.yaml
@@ -3218,7 +3218,9 @@ spec:
additionalProperties:
type: string
type: object
- size:
+ pipelineParallelSize:
type: integer
+ tensorParallelSize:
+ type: integer
tolerations:
items:
180 changes: 180 additions & 0 deletions config/runtimes/kserve-huggingfaceserver-multinode.yaml
@@ -0,0 +1,180 @@
apiVersion: serving.kserve.io/v1alpha1
kind: ClusterServingRuntime
metadata:
name: kserve-huggingfaceserver-multinode
spec:
annotations:
prometheus.kserve.io/port: "8080"
prometheus.kserve.io/path: "/metrics"
supportedModelFormats:
- name: huggingface
version: "1"
autoSelect: true
priority: 2
protocolVersions:
- v2
- v1
containers:
- name: kserve-container
image: kserve/huggingfaceserver:latest
command: ["bash", "-c"]
args:
- |
ray start --head --disable-usage-stats --include-dashboard false
# Wait until all worker nodes have joined the Ray cluster
until [[ $(ray status | grep -c node_) -eq ${PIPELINE_PARALLEL_SIZE} ]]; do
echo "Waiting..."
sleep 1
done
ray status
# Prefer a local model directory over the Hub model id when one is set
export MODEL=${MODEL_ID}
if [[ -n ${MODEL_DIR} ]]
then
MODEL=${MODEL_DIR}
fi
python3 -m huggingfaceserver --model_name=${MODEL_NAME} --model_dir=${MODEL} --tensor-parallel-size=${TENSOR_PARALLEL_SIZE} --pipeline-parallel-size=${PIPELINE_PARALLEL_SIZE}
env:
- name: RAY_PORT
value: "6379"
- name: RAY_ADDRESS
value: 127.0.0.1:6379
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: VLLM_CONFIG_ROOT
value: /tmp
- name: HF_HUB_CACHE
value: /tmp
resources:
requests:
cpu: "2"
memory: 6Gi
limits:
cpu: "4"
memory: 12Gi
volumeMounts:
- name: shm
mountPath: /dev/shm
livenessProbe:
failureThreshold: 3
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
initialDelaySeconds: 10
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py liveness
readinessProbe:
failureThreshold: 2
periodSeconds: 10
successThreshold: 1
timeoutSeconds: 5
initialDelaySeconds: 10
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py readiness ${PIPELINE_PARALLEL_SIZE} http://localhost:8080
startupProbe:
failureThreshold: 40
periodSeconds: 30
successThreshold: 1
timeoutSeconds: 5
initialDelaySeconds: 5
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py startup
volumes:
- name: shm
emptyDir:
medium: Memory
sizeLimit: 3Gi
workerSpec:
pipelineParallelSize: 2
tensorParallelSize: 1
containers:
- name: worker-container
image: kserve/huggingfaceserver:latest
command: ["bash", "-c"]
args:
- |
# Wait for the head node's Global Control Service (GCS) to accept connections;
# suppress health-check output for the first 120 seconds, then surface it to aid debugging.
SECONDS=0
while true; do
if (( SECONDS <= 120 )); then
if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379" > /dev/null 2>&1; then
echo "Global Control Service (GCS) is ready."
break
fi
echo "$SECONDS seconds elapsed: Waiting for Global Control Service (GCS) to be ready."
else
if ray health-check --address "${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"; then
echo "Global Control Service (GCS) is ready. Any error messages above can be safely ignored."
break
fi
echo "$SECONDS seconds elapsed: Still waiting for Global Control Service (GCS) to be ready."
fi
sleep 5
done
RAY_HEAD_ADDRESS="${HEAD_SVC}.${POD_NAMESPACE}.svc.cluster.local:6379"
echo "Attempting to connect to Ray cluster at $RAY_HEAD_ADDRESS ..."
ray start --address="$RAY_HEAD_ADDRESS" --block
env:
- name: POD_NAME
valueFrom:
fieldRef:
fieldPath: metadata.name
- name: POD_NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
resources:
requests:
cpu: "2"
memory: 6Gi
limits:
cpu: "4"
memory: 12Gi
volumeMounts:
- name: shm
mountPath: /dev/shm
livenessProbe:
failureThreshold: 3
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 5
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py registered_nodes ${PIPELINE_PARALLEL_SIZE}
startupProbe:
failureThreshold: 12
periodSeconds: 5
successThreshold: 1
timeoutSeconds: 5
exec:
command:
- bash
- -c
- |
./huggingfaceserver/health_check.py startup
volumes:
- name: shm
emptyDir:
medium: Memory
sizeLimit: 3Gi
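
For orientation, a minimal sketch of an InferenceService that selects this runtime (the service name and PVC path are hypothetical; per the validation rules added in pkg/apis/serving/v1beta1 below, multi-node serving requires a pvc:// storageUri and the external autoscaler):

apiVersion: serving.kserve.io/v1beta1
kind: InferenceService
metadata:
  name: huggingface-llama3
spec:
  predictor:
    model:
      modelFormat:
        name: huggingface
      runtime: kserve-huggingfaceserver-multinode
      storageUri: pvc://llama-model-pvc/llama-3-8b-instruct
    workerSpec:
      pipelineParallelSize: 2  # total Ray nodes; the head waits until this many have joined
      tensorParallelSize: 1    # GPUs per node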
1 change: 1 addition & 0 deletions config/runtimes/kustomization.yaml
@@ -12,6 +12,7 @@ resources:
- kserve-lgbserver.yaml
- kserve-torchserve.yaml
- kserve-huggingfaceserver.yaml
+ - kserve-huggingfaceserver-multinode.yaml

images:
# SMS Only Runtimes
15 changes: 13 additions & 2 deletions pkg/apis/serving/v1alpha1/servingruntime_types.go
@@ -271,9 +271,16 @@ type SupportedRuntime struct {
type WorkerSpec struct {
ServingRuntimePodSpec `json:",inline"`

- // Configure the number of replicas in the worker set, each worker set represents the unit of scaling
+ // PipelineParallelSize defines the number of parallel workers.
+ // It specifies the number of model partitions across multiple devices, allowing large models to be split and processed concurrently across these partitions.
+ // It also represents the number of replicas in the worker set, where each worker set serves as a scaling unit.
// +optional
- Size int `json:"size,omitempty"`
+ PipelineParallelSize *int `json:"pipelineParallelSize,omitempty"`
+
+ // TensorParallelSize specifies the number of GPUs to be used per node.
+ // It indicates the degree of parallelism for tensor computations across the available GPUs.
+ // +optional
+ TensorParallelSize *int `json:"tensorParallelSize,omitempty"`
}

func init() {
@@ -289,6 +296,10 @@ func (srSpec *ServingRuntimeSpec) IsMultiModelRuntime() bool {
return srSpec.MultiModel != nil && *srSpec.MultiModel
}

+ func (srSpec *ServingRuntimeSpec) IsMultiNodeRuntime() bool {
+ 	return srSpec.WorkerSpec != nil
+ }

func (srSpec *ServingRuntimeSpec) IsProtocolVersionSupported(modelProtocolVersion constants.InferenceServiceProtocol) bool {
if len(modelProtocolVersion) == 0 || srSpec.ProtocolVersions == nil || len(srSpec.ProtocolVersions) == 0 {
return true
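
A hedged migration sketch for existing manifests (values illustrative): the former workerSpec.size field is replaced by the two parallelism fields, now pointers so that an omitted value stays nil and can be defaulted rather than being read as an explicit 0:

# before
workerSpec:
  size: 2
# after
workerSpec:
  pipelineParallelSize: 2  # worker-set replicas / model partitions
  tensorParallelSize: 1    # GPUs per node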
10 changes: 10 additions & 0 deletions pkg/apis/serving/v1alpha1/zz_generated.deepcopy.go

(Generated deepcopy code; diff not rendered by default.)

28 changes: 19 additions & 9 deletions pkg/apis/serving/v1beta1/component.go
@@ -30,15 +30,25 @@ import (

// Known error messages
const (
- MinReplicasShouldBeLessThanMaxError = "'MinReplicas' cannot be greater than MaxReplicas"
- MinReplicasLowerBoundExceededError = "'MinReplicas' cannot be less than 0"
- MaxReplicasLowerBoundExceededError = "'MaxReplicas' cannot be less than 0"
- ParallelismLowerBoundExceededError = "parallelism cannot be less than 0"
- UnsupportedStorageURIFormatError = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported"
- UnsupportedStorageSpecFormatError = "storage.spec.type, must be one of: [%s]. storage.spec.type [%s] is not supported"
- InvalidLoggerType = "invalid logger type"
- InvalidISVCNameFormatError = "the InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')"
- InvalidProtocol = "invalid protocol %s. Must be one of [%s]"
+ MinReplicasShouldBeLessThanMaxError = "'MinReplicas' cannot be greater than MaxReplicas"
+ MinReplicasLowerBoundExceededError = "'MinReplicas' cannot be less than 0"
+ MaxReplicasLowerBoundExceededError = "'MaxReplicas' cannot be less than 0"
+ ParallelismLowerBoundExceededError = "parallelism cannot be less than 0"
+ UnsupportedStorageURIFormatError = "storageUri, must be one of: [%s] or match https://{}.blob.core.windows.net/{}/{} or be an absolute or relative local path. StorageUri [%s] is not supported"
+ UnsupportedStorageSpecFormatError = "storage.spec.type, must be one of: [%s]. storage.spec.type [%s] is not supported"
+ InvalidLoggerType = "invalid logger type"
+ InvalidISVCNameFormatError = "the InferenceService \"%s\" is invalid: a InferenceService name must consist of lower case alphanumeric characters or '-', and must start with alphabetical character. (e.g. \"my-name\" or \"abc-123\", regex used for validation is '%s')"
+ InvalidProtocol = "invalid protocol %s. Must be one of [%s]"
+ MissingStorageURI = "the InferenceService %q is invalid: StorageURI must be set for multinode enabled"
+ InvalidAutoScalerError = "the InferenceService %q is invalid: Multinode only supports 'external' autoscaler(%s)"
+ InvalidNotSupportedStorageURIProtocolError = "the InferenceService %q is invalid: Multinode only supports 'pvc' Storage Protocol(%s)"
+ InvalidCustomGPUTypesAnnotationFormatError = "the InferenceService %q is invalid: invalid format for %s annotation: must be a valid JSON array"
+ InvalidUnknownGPUTypeError = "the InferenceService %q is invalid: Unknown GPU resource type. Set 'serving.kserve.io/gpu-resource-types' annotation to use custom gpu resource type"
+ InvalidWorkerSpecPipelineParallelSizeValueError = "the InferenceService %q is invalid: WorkerSpec.PipelineParallelSize cannot be less than 2(%s)"
+ InvalidWorkerSpecTensorParallelSizeValueError = "the InferenceService %q is invalid: WorkerSpec.TensorParallelSize cannot be less than 1(%s)"
+ DisallowedMultipleContainersInWorkerSpecError = "the InferenceService %q is invalid: setting multiple containers in workerSpec is not allowed"
+ DisallowedWorkerSpecPipelineParallelSizeEnvError = "the InferenceService %q is invalid: setting PIPELINE_PARALLEL_SIZE in environment variables is not allowed"
+ DisallowedWorkerSpecTensorParallelSizeEnvError = "the InferenceService %q is invalid: setting TENSOR_PARALLEL_SIZE in environment variables is not allowed"
)

// Constants
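
To illustrate the two new bounds checks, a hypothetical workerSpec that the webhook would reject with the InvalidWorkerSpec*ValueError messages above:

workerSpec:
  pipelineParallelSize: 1  # rejected: cannot be less than 2
  tensorParallelSize: 0    # rejected: cannot be less than 1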
