From 1d4f5b160b17693b8192dc446524c9e0eab46855 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=8A=92=E6=83=85=E7=86=8A?= <2669184984@qq.com>
Date: Sun, 20 Jul 2025 21:54:30 +0800
Subject: [PATCH] add ecosystem.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 抒情熊 <2669184984@qq.com>
---
 content/en/docs/MXNet_on_volcano.md    |  49 ++++++++++
 content/en/docs/argo_on_volcano.md     | 113 +++++++++++++++++++++
 content/en/docs/cromwell_on_volcano.md |  52 ++++++++++
 content/en/docs/horovod_on_volcano.md  | 111 +++++++++++++++++++++
 content/en/docs/kubeflow_on_volcano.md | 130 +++++++++++++++++++++----
 content/en/docs/pytorch_on_volcano.md  |  76 +++++++++++++++
 content/en/docs/ray_on_volcano.md      |  90 +++++++++++++++++
 content/zh/docs/MXNet_on_volcano.md    |  52 ++++++++++
 content/zh/docs/argo_on_volcano.md     | 113 +++++++++++++++++++++
 content/zh/docs/cromwell_on_volcano.md |  52 ++++++++++
 content/zh/docs/horovod_on_volcano.md  | 111 +++++++++++++++++++++
 content/zh/docs/kubeflow_on_volcano.md | 130 +++++++++++++++++++++----
 content/zh/docs/pytorch_on_volcano.md  |  76 +++++++++++++++
 content/zh/docs/ray_on_volcano.md      |  94 ++++++++++++++++++
 14 files changed, 1209 insertions(+), 40 deletions(-)
 create mode 100644 content/en/docs/MXNet_on_volcano.md
 create mode 100644 content/en/docs/argo_on_volcano.md
 create mode 100644 content/en/docs/cromwell_on_volcano.md
 create mode 100644 content/en/docs/horovod_on_volcano.md
 create mode 100644 content/en/docs/pytorch_on_volcano.md
 create mode 100644 content/en/docs/ray_on_volcano.md
 create mode 100644 content/zh/docs/MXNet_on_volcano.md
 create mode 100644 content/zh/docs/argo_on_volcano.md
 create mode 100644 content/zh/docs/cromwell_on_volcano.md
 create mode 100644 content/zh/docs/horovod_on_volcano.md
 create mode 100644 content/zh/docs/pytorch_on_volcano.md
 create mode 100644 content/zh/docs/ray_on_volcano.md

diff --git a/content/en/docs/MXNet_on_volcano.md b/content/en/docs/MXNet_on_volcano.md
new file mode 100644
index 00000000..6f37c21c
--- /dev/null
+++ b/content/en/docs/MXNet_on_volcano.md
@@ -0,0 +1,49 @@
++++
+title = "MXNet on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "MXNet"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### MXNet Introduction
+
+MXNet is an open-source deep learning framework designed for efficient and flexible training and deployment of deep neural networks. It supports seamless scaling from a single GPU to multiple GPUs, and further to distributed multi-machine multi-GPU setups.
+
+### MXNet on Volcano
+
+Combining MXNet with Volcano allows you to fully leverage Kubernetes' container orchestration capabilities and Volcano's batch scheduling functionality to achieve efficient distributed training.
+
+Click [here](https://github.com/apache/mxnet/blob/master/example/distributed_training-horovod/gluon_mnist.py) to view the example provided by the MXNet team. The example directory contains the following files:
+
+- Dockerfile: Builds the standalone worker image.
+- Makefile: Used to build the above image.
+- train-mnist-cpu.yaml: Volcano Job specification.
+
+To run the example, edit the image name and version in `train-mnist-cpu.yaml`. Then run:
+
+```
+kubectl apply -f train-mnist-cpu.yaml -n ${NAMESPACE}
+```
+
+to create the Job.
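+
+Once the Job is created, you can list its Pods as a quick sanity check. This command is illustrative: `volcano.sh/job-name` is the label Volcano attaches to a Job's Pods, and `mxnet-job` is the Job name assumed from the example manifest:
+
+```
+kubectl get pods -n ${NAMESPACE} -l volcano.sh/job-name=mxnet-job
+```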
+
+Then use:
+
+```
+kubectl -n ${NAMESPACE} describe job.batch.volcano.sh mxnet-job
+```
+
+to view the status.
diff --git a/content/en/docs/argo_on_volcano.md b/content/en/docs/argo_on_volcano.md
new file mode 100644
index 00000000..934dd5fe
--- /dev/null
+++ b/content/en/docs/argo_on_volcano.md
@@ -0,0 +1,113 @@
++++
+title = "Argo on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Argo"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### Argo Introduction
+
+Argo is an open-source Kubernetes-native workflow engine that allows users to define and execute containerized workflows. The Argo project includes multiple components, with Argo Workflows being the core component used for orchestrating parallel jobs on Kubernetes, supporting DAG (Directed Acyclic Graph) and step templates.
+
+### Argo on Volcano
+
+By integrating Argo Workflow with Volcano, you can combine the advantages of both: Argo provides powerful workflow orchestration capabilities, while Volcano provides advanced scheduling features.
+
+#### Integration Method
+
+Argo resource templates allow the creation, deletion, or updating of any type of Kubernetes resource (including CRDs). We can use resource templates to integrate Volcano Jobs into Argo Workflow, thereby adding job dependency management and DAG flow control capabilities to Volcano.
+
+#### Configuring RBAC Permissions
+
+Before integration, ensure that Argo Workflow has sufficient permissions to manage Volcano resources:
+
+1. Argo Workflow needs to specify a serviceAccount, which can be done as follows:
+
+   ```
+   argo submit --serviceaccount
+   ```
+
+2. Add Volcano resource management permissions to the serviceAccount:
+
+   ```yaml
+   - apiGroups:
+       - batch.volcano.sh
+     resources:
+       - "*"
+     verbs:
+       - "*"
+   ```
+
+#### Example
+
+Here is an example YAML for creating a Volcano Job using Argo Workflow:
+
+```yaml
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: volcano-job-
+spec:
+  entrypoint: nginx-tmpl
+  serviceAccountName: argo # Specify service account
+  templates:
+    - name: nginx-tmpl
+      activeDeadlineSeconds: 120 # Limit workflow execution time
+      resource: # Indicates this is a resource template
+        action: create # kubectl operation type
+        successCondition: status.state.phase = Completed
+        failureCondition: status.state.phase = Failed
+        manifest: |
+          apiVersion: batch.volcano.sh/v1alpha1
+          kind: Job
+          metadata:
+            generateName: test-job-
+            ownerReferences: # Add owner references to ensure resource lifecycle management
+              - apiVersion: argoproj.io/v1alpha1
+                blockOwnerDeletion: true
+                kind: Workflow
+                name: "{{workflow.name}}"
+                uid: "{{workflow.uid}}"
+          spec:
+            minAvailable: 1
+            schedulerName: volcano
+            policies:
+              - event: PodEvicted
+                action: RestartJob
+            plugins:
+              ssh: []
+              env: []
+              svc: []
+            maxRetry: 5
+            queue: default
+            tasks:
+              - replicas: 2
+                name: "default-nginx"
+                template:
+                  metadata:
+                    name: web
+                  spec:
+                    containers:
+                      - image: nginx:latest
+                        imagePullPolicy: IfNotPresent
+                        name: nginx
+                        resources:
+                          requests:
+                            cpu: "100m"
+                    restartPolicy: OnFailure
+```
+
+For more information and advanced configurations, see the [Volcano Argo integration examples](https://github.com/volcano-sh/volcano/tree/master/example/integrations/argo).
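+
+As a usage sketch (assuming the workflow above is saved as `volcano-job-workflow.yaml` and the RBAC rule above is bound to the `argo` service account), the workflow could be submitted and watched with:
+
+```
+argo submit --serviceaccount argo --watch volcano-job-workflow.yaml
+```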
\ No newline at end of file
diff --git a/content/en/docs/cromwell_on_volcano.md b/content/en/docs/cromwell_on_volcano.md
new file mode 100644
index 00000000..fcf487ce
--- /dev/null
+++ b/content/en/docs/cromwell_on_volcano.md
@@ -0,0 +1,52 @@
++++
+title = "Cromwell on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Cromwell"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### Cromwell Introduction
+
+Cromwell is a workflow management system designed for scientific workflows.
+
+### Cromwell on Volcano
+
+Cromwell can be integrated with Volcano to efficiently schedule and execute bioinformatics workflows in Kubernetes environments.
+
+To make Cromwell interact with a Volcano cluster and dispatch jobs to it, you can use the following basic configuration:
+
+```hocon
+Volcano {
+  actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
+  config {
+    runtime-attributes = """
+    Int runtime_minutes = 600
+    Int cpus = 2
+    Int requested_memory_mb_per_core = 8000
+    String queue = "short"
+    """
+
+    submit = """
+    vcctl job run -f ${script}
+    """
+    kill = "vcctl job delete -N ${job_id}"
+    check-alive = "vcctl job view -N ${job_id}"
+    job-id-regex = "(\\d+)"
+  }
+}
+```
+
+Please note that this configuration example is community-contributed and therefore not officially supported.
\ No newline at end of file
diff --git a/content/en/docs/horovod_on_volcano.md b/content/en/docs/horovod_on_volcano.md
new file mode 100644
index 00000000..051b1d05
--- /dev/null
+++ b/content/en/docs/horovod_on_volcano.md
@@ -0,0 +1,111 @@
++++
+title = "Horovod on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Horovod"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### Horovod Introduction
+
+Horovod is a distributed deep learning training framework compatible with PyTorch, TensorFlow, Keras, and Apache MXNet. With Horovod, existing training scripts can be scaled to run on hundreds of GPUs with just a few lines of Python code, achieving near-linear performance improvements on large-scale GPU clusters.
+
+### Horovod on Volcano
+
+Volcano, as a cloud-native batch system, provides native support for Horovod distributed training jobs. Through Volcano's scheduling capabilities, users can easily deploy and manage Horovod training tasks on Kubernetes clusters.
+
+Below is an example configuration for running Horovod on Volcano:
+
+```yaml
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: lm-horovod-job
+  labels:
+    "volcano.sh/job-type": Horovod
+spec:
+  minAvailable: 4
+  schedulerName: volcano
+  plugins:
+    ssh: []
+    svc: []
+  policies:
+    - event: PodEvicted
+      action: RestartJob
+  tasks:
+    - replicas: 1
+      name: master
+      policies:
+        - event: TaskCompleted
+          action: CompleteJob
+      template:
+        spec:
+          containers:
+            - command:
+                - /bin/sh
+                - -c
+                - |
+                  WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`;
+                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
+                  mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py;
+              image: volcanosh/horovod-tf-mnist:0.5
+              name: master
+              ports:
+                - containerPort: 22
+                  name: job-port
+              resources:
+                requests:
+                  cpu: "500m"
+                  memory: "1024Mi"
+                limits:
+                  cpu: "500m"
+                  memory: "1024Mi"
+          restartPolicy: OnFailure
+          imagePullSecrets:
+            - name: default-secret
+    - replicas: 3
+      name: worker
+      template:
+        spec:
+          containers:
+            - command:
+                - /bin/sh
+                - -c
+                - |
+                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
+              image: volcanosh/horovod-tf-mnist:0.5
+              name: worker
+              ports:
+                - containerPort: 22
+                  name: job-port
+              resources:
+                requests:
+                  cpu: "1000m"
+                  memory: "2048Mi"
+                limits:
+                  cpu: "1000m"
+                  memory: "2048Mi"
+          restartPolicy: OnFailure
+          imagePullSecrets:
+            - name: default-secret
+```
+
+In this configuration, we define a Horovod distributed training job with the following key components:
+
+1. Task structure: Consists of 1 master node and 3 worker nodes, totaling 4 Pods
+2. Communication mechanism: Utilizes Volcano's SSH plugin for inter-node communication
+3. Resource allocation: Master node is allocated fewer resources (500m CPU/1Gi memory), while worker nodes receive more resources (1000m CPU/2Gi memory)
+4. Fault tolerance: When a Pod is evicted, the entire job restarts
+5. Job completion policy: When the master task completes, the entire job is marked as complete
diff --git a/content/en/docs/kubeflow_on_volcano.md b/content/en/docs/kubeflow_on_volcano.md
index 442bf617..69974197 100644
--- a/content/en/docs/kubeflow_on_volcano.md
+++ b/content/en/docs/kubeflow_on_volcano.md
@@ -2,7 +2,7 @@
 title = "Kubeflow on Volcano"
 
 date = 2021-06-29
-lastmod = 2021-06-29
+lastmod = 2025-07-20
 
 draft = false # Is this a draft? true/false
 toc = true # Show table of contents? true/false
@@ -18,8 +18,6 @@ linktitle = "Kubeflow"
 
 
-
-
 ### Kubeflow introduction
 
 Kubernetes has become the de facto standard for cloud native application choreography and management, and more and more applications are migrating to Kubernetes. The field of artificial intelligence and machine learning naturally contains a large number of computation-intensive tasks. Developers are very willing to build an AI platform based on Kubernetes and make full use of the resource management, application scheduling, operation and maintenance monitoring capabilities provided by Kubernetes. However, it is a very complicated and tedious process to build an end-to-end AI computing platform based on Kubernetes, which needs to deal with many links. In addition to the model training we are familiar with, it also includes data collection, preprocessing, resource management, feature extraction, data verification, model management, model release, monitoring and other links. For an AI algorithm engineer, if he wants to do model training, he has to build a set of AI computing platform. This process is time-consuming and laborious, and requires a lot of knowledge accumulation[1].
@@ -39,9 +37,11 @@ What scenarios can we use Kubeflow for:
 
 ### Kubeflow on Volcano
 
+#### TFJob
+
 Volcano is an enhanced high performance computing task batch processing system built on Kubernetes. As a platform for high performance computing scenarios, it makes up for Kubernetes' lack of basic capabilities in machine learning, deep learning, HPC, and big data computing scenarios, including gang-schedule scheduling capability, computational task queue management, task-topology, and GPU affinity scheduling. In addition, Volcano has enhanced the batch creation and life cycle management of computing tasks, fair-share, binpack scheduling and other aspects on the basis of the native Kubernetes capability. Volcano has fully solved the problem of distributed training in Kubeflow mentioned above.
 
-#### download kfctl
+##### download kfctl
 
 First of all, you need to download kfctl, you can choose the appropriate compressed package file according to the system [1].
 
@@ -50,9 +50,7 @@
 $ tar -xvf kfctl_v1.0.2-0-ga476281_linux.tar.gz
 $ sudo mv ./kfctl /usr/local/bin/kfctl
 ```
 
-
-
-#### Configure environment variables
+##### Configure environment variables
 
 ```
 $ export PATH= $PATH:""
@@ -62,9 +60,7 @@ $ export KF_DIR=${BASE_DIR}/${KF_NAME}
 $ export CONFIG_URI="https://raw.githubusercontent.com/kubeflow/manifests/v1.0-branch/kfdef/kfctl_k8s_istio.v1.0.2.yaml"
 ```
 
-
-
-#### Install Kubeflow
+##### Install Kubeflow
 
 ```
 $ mkdir -p ${KF_DIR}
@@ -78,9 +74,7 @@ Confirm the installation results with the following instructions.
 $ kubectl -n kubeflow get all
 ```
 
-
-
-#### deploy Mnist
+##### deploy Mnist
 
 Download the official test set provided by Kubuflow.
 
@@ -88,9 +82,7 @@ Download the official test set provided by Kubuflow.
 git clone https://github.com/kubeflow/examples.git
 ```
 
-
-
-#### Start using Notebook
+##### Start using Notebook
 
 External interface service is provided, where the nodes under the cluster need to be bound to public network IP. If Notebook is not installed, please use pip3 to install it first.
 
@@ -106,9 +98,7 @@ $ jupyter notebook --allow-root
 
 Access your-IP:30200/,Enter the configuration password to enter the Notebook.
 
-
-
-#### Run the official instance on the Notebook[2]
+##### Run the official instance on the Notebook
 
 1.Open Notebook and deploy TFJob。Open the notebook `mnist/mnist_vanilla_k8s.ipynb` ,Follow the guidelines to deploy a distributed TF Job.
 
@@ -243,4 +233,104 @@
 
 ```
 kubectl apply -f mnist.yaml
-```
\ No newline at end of file
+```
+
+#### Gang Scheduling
+
+Gang Scheduling is a scheduling strategy primarily used for distributed/parallel tasks. It ensures that a group of Pods (typically belonging to the same distributed training task) either starts together or does not start at all, avoiding situations where only some of the Pods run, which could lead to training failures or wasted resources.
+
+Kubeflow supports gang scheduling through Volcano. You must first install the Volcano scheduler in your cluster as an auxiliary scheduler for Kubernetes and configure the Operator to select the scheduler name for gang scheduling, as shown below:
+
+- training-operator
+
+```diff
+...
+ spec:
+ containers:
+ - command:
+ - /manager
++ - --gang-scheduler-name=volcano
+ image: kubeflow/training-operator
+ name: training-operator
+...
+```
+
+- mpi-operator
+
+```diff
+...
+ spec: + containers: + - args: ++ - --gang-scheduling=volcano + - -alsologtostderr + - --lock-namespace=mpi-operator + image: mpioperator/mpi-operator:0.4.0 + name: mpi-operator +... +``` + +Note: The Volcano scheduler implements gang-scheduling with Kubeflow Operators through [PodGroup](https://volcano.sh/en/docs/podgroup/), and the Operator automatically creates the corresponding PodGroup for the job. + +For more detailed information, please check the [link](https://www.kubeflow.org/docs/components/trainer/legacy-v1/user-guides/job-scheduling/). + +#### Using Arena + +Arena is a command-line tool that simplifies the submission and management of AI training and batch jobs on Kubernetes (including Volcano). + +##### Submit a Volcano Job + +``` +$ arena submit volcanojob --name=demo + +configmap/demo-volcanojob created +configmap/demo-volcanojob labeled +job.batch.volcano.sh/demo created +INFO[0003] The Job demo has been submitted successfully +INFO[0003] You can run `arena get demo --type volcanojob` to check the job status +``` + +You can specify more parameters: + +``` +$ arena submit volcanojob --name demo12 --taskImages busybox,busybox --taskReplicas 2 +``` + +##### Get Volcano Job Details + +``` +arena get --type volcanojob demo12 + +STATUS: SUCCEEDED +NAMESPACE: default +TRAINING DURATION: 2m + +NAME STATUS TRAINER AGE INSTANCE NODE +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-0-0 11.245.101.184 +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-0-1 11.245.101.184 +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-1-0 11.245.101.184 +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-1-1 11.245.101.184 +``` + +It created two tasks, each with 2 replicas. + +##### Delete a Volcano Job + +``` +$ arena delete --type=volcanojob demo + +job.batch.volcano.sh "demo" deleted +configmap "demo-volcanojob" deleted +INFO[0000] The Job demo has been deleted successfully +``` + +##### View All Volcano Jobs + +``` +$ arena list + +NAME STATUS TRAINER AGE NODE +demo RUNNING VOLCANOJOB 2m 11.245.101.184 +``` + +For more detailed information, please check the [link](https://github.com/kubeflow/arena/blob/master/docs/training/volcanojob/volcanojob.md). \ No newline at end of file diff --git a/content/en/docs/pytorch_on_volcano.md b/content/en/docs/pytorch_on_volcano.md new file mode 100644 index 00000000..23402ff6 --- /dev/null +++ b/content/en/docs/pytorch_on_volcano.md @@ -0,0 +1,76 @@ ++++ +title = "Pytorch on Volcano" + +date = 2021-06-29 +lastmod = 2021-06-29 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. +linktitle = "Pytorch" +[menu.docs] + parent = "zoology" + weight = 6 + ++++ + +### PyTorch introduction + +PyTorch is an open-source machine learning framework developed by Facebook (now Meta) AI Research team. It is known for its dynamic computation graph and intuitive Python interface, enabling researchers and developers to build and train deep learning models with greater flexibility. PyTorch provides powerful GPU acceleration capabilities, supports distributed training, and has a rich ecosystem of tools. + +### PyTorch on Volcano + +Volcano's support for PyTorch is implemented through the PyTorch plugin, which not only allows users to write less YAML configuration but also ensures the proper functioning of PyTorch jobs. 
+
+The PyTorch plugin accomplishes three tasks:
+
+- Opens the ports used by PyTorch for all containers in the job
+- Enforces the `svc` plugin
+- Automatically adds the environment variables required for PyTorch distributed training to containers, such as `MASTER_ADDR`, `MASTER_PORT`, `WORLD_SIZE`, `RANK`, etc.
+
+#### Parameter List
+
+| No. | Name   | Type   | Default | Required | Description                      | Example         |
+| --- | ------ | ------ | ------- | -------- | -------------------------------- | --------------- |
+| 1   | master | string | master  | No       | Name of the PyTorch master node  | --master=master |
+| 2   | worker | string | worker  | No       | Name of the PyTorch worker node  | --worker=worker |
+| 3   | port   | string | 23456   | No       | Port to be opened for containers | --port=23456    |
+
+#### Example
+
+```yaml
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: pytorch-job
+spec:
+  minAvailable: 1
+  schedulerName: volcano
+  plugins:
+    pytorch: ["--master=master","--worker=worker","--port=23456"] # PyTorch plugin registration
+  tasks:
+    - replicas: 1
+      name: master
+      policies:
+        - event: TaskCompleted
+          action: CompleteJob
+      template:
+        spec:
+          containers:
+            - image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
+              imagePullPolicy: IfNotPresent
+              name: master
+          restartPolicy: OnFailure
+    - replicas: 2
+      name: worker
+      template:
+        spec:
+          containers:
+            - image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
+              imagePullPolicy: IfNotPresent
+              name: worker
+              workingDir: /home
+          restartPolicy: OnFailure
+```
diff --git a/content/en/docs/ray_on_volcano.md b/content/en/docs/ray_on_volcano.md
new file mode 100644
index 00000000..66402c2b
--- /dev/null
+++ b/content/en/docs/ray_on_volcano.md
@@ -0,0 +1,90 @@
++++
+title = "Ray on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Ray"
+[menu.docs]
+  parent = "zoology"
+  weight = 6
+
++++
+
+### Ray Introduction
+
+Today, machine learning workloads are becoming increasingly compute-intensive. Single-node development environments (such as laptops) are convenient but cannot scale to meet these demands.
+
+Ray is a unified approach to scaling Python and AI applications from laptops to clusters.
+
+With Ray, you can seamlessly scale the same code from your laptop to a cluster. Ray is designed to be general-purpose, meaning it can efficiently run any type of workload. If your application is written in Python, you can scale it with Ray without additional infrastructure.
+
+KubeRay is a powerful open-source Kubernetes operator that simplifies the deployment and management of Ray applications on Kubernetes.
+
+### Ray on Volcano
+
+KubeRay's Volcano integration enables more efficient scheduling of Ray Pods in multi-tenant Kubernetes environments.
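+
+At the cluster level, the integration is driven by labels on the RayCluster resource. The sketch below shows the relevant metadata; the cluster name and queue name are illustrative, and `volcano.sh/queue-name` is only needed when you target a specific queue:
+
+```yaml
+apiVersion: ray.io/v1
+kind: RayCluster
+metadata:
+  name: test-cluster-0
+  labels:
+    ray.io/scheduler-name: volcano            # hand Pod scheduling to Volcano
+    volcano.sh/queue-name: kuberay-test-queue # optional: submit to this queue
+```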
+
+#### Installing KubeRay Operator
+
+To deploy the KubeRay Operator with Volcano batch scheduling support, you can use one of the following two methods:
+
+##### Method 1: Using a values.yaml Configuration File
+
+Set Volcano as the batch scheduler in your values.yaml file:
+
+```yaml
+# values.yaml file
+batchScheduler:
+  name: volcano
+```
+
+Then install the Helm chart using this configuration file:
+
+```bash
+helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 -f values.yaml
+```
+
+##### Method 2: Using Command Line Parameters
+
+Specify the batch scheduler directly in the Helm installation command using the `--set` parameter:
+
+```bash
+helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 --set batchScheduler.name=volcano
+```
+
+#### Installing RayCluster with Volcano Scheduling
+
+To manage a RayCluster using the Volcano scheduler, follow these steps:
+
+##### Basic Installation
+
+1. Download the RayCluster example configuration that supports Volcano scheduling:
+
+   ```bash
+   curl -LO https://raw.githubusercontent.com/ray-project/kuberay/v1.4.2/ray-operator/config/samples/ray-cluster.volcano-scheduler.yaml
+   ```
+
+2. Apply the configuration to create the RayCluster:
+
+   ```bash
+   kubectl apply -f ray-cluster.volcano-scheduler.yaml
+   ```
+
+3. Verify the cluster status:
+
+   ```bash
+   kubectl get pod -l ray.io/cluster=test-cluster-0
+   ```
+
+   After successful deployment, you should see output similar to:
+
+   ```
+   NAME                        READY   STATUS    RESTARTS   AGE
+   test-cluster-0-head-jj9bg   1/1     Running   0          36s
+   ```
+
+Now Volcano and KubeRay can work together. For more detailed information, please check the [link](https://docs.ray.io/en/master/cluster/kubernetes/k8s-ecosystem/volcano.html).
\ No newline at end of file
diff --git a/content/zh/docs/MXNet_on_volcano.md b/content/zh/docs/MXNet_on_volcano.md
new file mode 100644
index 00000000..1743e9db
--- /dev/null
+++ b/content/zh/docs/MXNet_on_volcano.md
@@ -0,0 +1,52 @@
++++
+title = "MXNet on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "MXNet"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### MXNet简介
+
+MXNet是一个开源的深度学习框架,它设计用于高效且灵活地训练和部署深度神经网络,支持从单GPU到多GPU,再到分布式多机多GPU的无缝扩展。
+
+### MXNet on Volcano
+
+将MXNet与Volcano结合使用,可以充分利用Kubernetes的容器编排能力和Volcano的批处理调度功能,实现高效的分布式训练。
+
+点击[这里](https://github.com/apache/mxnet/blob/master/example/distributed_training-horovod/gluon_mnist.py)查看MXNet团队给出的示例。示例目录包含以下文件:
+
+- Dockerfile:构建独立工作器镜像。
+- Makefile:用于构建上述镜像。
+- train-mnist-cpu.yaml:Volcano Job 规范。
+
+要运行示例,请编辑`train-mnist-cpu.yaml`中镜像的名称和版本。然后运行
+
+```
+kubectl apply -f train-mnist-cpu.yaml -n ${NAMESPACE}
+```
+
+来创建Job。
+
+然后使用
+
+```
+kubectl -n ${NAMESPACE} describe job.batch.volcano.sh mxnet-job
+```
+
+查看状态。
+
diff --git a/content/zh/docs/argo_on_volcano.md b/content/zh/docs/argo_on_volcano.md
new file mode 100644
index 00000000..b92484a5
--- /dev/null
+++ b/content/zh/docs/argo_on_volcano.md
@@ -0,0 +1,113 @@
++++
+title = "Argo on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Argo"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### Argo简介
+
+Argo是一个开源的Kubernetes原生工作流引擎,它允许用户定义和执行容器化的工作流。Argo项目包含多个组件,其中Argo Workflows是核心组件,用于在Kubernetes上编排并行作业,支持DAG(有向无环图)和步骤模板。
+
+### Argo on Volcano
+
+通过将Argo Workflow与Volcano集成,可以结合两者的优势:Argo提供强大的工作流编排能力,而Volcano提供高级调度功能。
+
+#### 集成方式
+
+Argo资源模板允许创建、删除或更新任何类型的Kubernetes资源(包括CRD)。我们可以使用资源模板将Volcano Jobs集成到Argo Workflow中,从而为Volcano添加作业依赖管理和DAG流程控制功能。
+
+#### 配置RBAC权限
+
+集成前需要确保Argo Workflow有足够的权限来管理Volcano资源:
+
+1. Argo Workflow需要指定serviceAccount,可通过以下方式指定:
+
+   ```
+   argo submit --serviceaccount
+   ```
+
+2. 为serviceAccount添加Volcano资源的管理权限:
+
+   ```yaml
+   - apiGroups:
+       - batch.volcano.sh
+     resources:
+       - "*"
+     verbs:
+       - "*"
+   ```
+
+#### 示例
+
+以下是使用Argo Workflow创建Volcano Job的示例YAML:
+
+```yaml
+apiVersion: argoproj.io/v1alpha1
+kind: Workflow
+metadata:
+  generateName: volcano-job-
+spec:
+  entrypoint: nginx-tmpl
+  serviceAccountName: argo # 指定服务账户
+  templates:
+    - name: nginx-tmpl
+      activeDeadlineSeconds: 120 # 限制工作流执行时间
+      resource: # 表示这是一个资源模板
+        action: create # kubectl操作类型
+        successCondition: status.state.phase = Completed
+        failureCondition: status.state.phase = Failed
+        manifest: |
+          apiVersion: batch.volcano.sh/v1alpha1
+          kind: Job
+          metadata:
+            generateName: test-job-
+            ownerReferences: # 添加所有者引用,确保资源生命周期管理
+              - apiVersion: argoproj.io/v1alpha1
+                blockOwnerDeletion: true
+                kind: Workflow
+                name: "{{workflow.name}}"
+                uid: "{{workflow.uid}}"
+          spec:
+            minAvailable: 1
+            schedulerName: volcano
+            policies:
+              - event: PodEvicted
+                action: RestartJob
+            plugins:
+              ssh: []
+              env: []
+              svc: []
+            maxRetry: 5
+            queue: default
+            tasks:
+              - replicas: 2
+                name: "default-nginx"
+                template:
+                  metadata:
+                    name: web
+                  spec:
+                    containers:
+                      - image: nginx:latest
+                        imagePullPolicy: IfNotPresent
+                        name: nginx
+                        resources:
+                          requests:
+                            cpu: "100m"
+                    restartPolicy: OnFailure
+```
+
+如果要查看更多信息和高级配置,请查看[链接](https://github.com/volcano-sh/volcano/tree/master/example/integrations/argo)了解更多。
\ No newline at end of file
diff --git a/content/zh/docs/cromwell_on_volcano.md b/content/zh/docs/cromwell_on_volcano.md
new file mode 100644
index 00000000..9b969db8
--- /dev/null
+++ b/content/zh/docs/cromwell_on_volcano.md
@@ -0,0 +1,52 @@
++++
+title = "Cromwell on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Cromwell"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### Cromwell简介
+
+Cromwell 是一个面向科学工作流程的工作流管理系统。
+
+### Cromwell on Volcano
+
+Cromwell可以与Volcano集成,以便在Kubernetes环境中高效地调度和执行生物信息学工作流。
+
+要使Cromwell与Volcano集群交互并向其分派作业,可以使用以下基本配置:
+
+```hocon
+Volcano {
+  actor-factory = "cromwell.backend.impl.sfs.config.ConfigBackendLifecycleActorFactory"
+  config {
+    runtime-attributes = """
+    Int runtime_minutes = 600
+    Int cpus = 2
+    Int requested_memory_mb_per_core = 8000
+    String queue = "short"
+    """
+
+    submit = """
+    vcctl job run -f ${script}
+    """
+    kill = "vcctl job delete -N ${job_id}"
+    check-alive = "vcctl job view -N ${job_id}"
+    job-id-regex = "(\\d+)"
+  }
+}
+```
+
+需要注意,这个配置示例是社区贡献的,因此不受官方支持。
\ No newline at end of file
diff --git a/content/zh/docs/horovod_on_volcano.md b/content/zh/docs/horovod_on_volcano.md
new file mode 100644
index 00000000..4bc563af
--- /dev/null
+++ b/content/zh/docs/horovod_on_volcano.md
@@ -0,0 +1,111 @@
++++
+title = "Horovod on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Horovod"
+[menu.docs]
+  parent = "zoology"
+  weight = 3
+
++++
+
+### Horovod简介
+
+Horovod 是一个适用于 PyTorch、TensorFlow、Keras 和 Apache MXNet 的分布式深度学习训练框架。使用 Horovod,现有的训练脚本只需几行 Python 代码即可扩展至在数百个 GPU 上运行,并能在大规模 GPU 集群上获得接近线性的性能提升。
+
+### Horovod on Volcano
+
+Volcano 作为云原生批处理系统,提供了对 Horovod 分布式训练作业的原生支持。通过 Volcano 的调度能力,用户可以轻松地在 Kubernetes 集群上部署和管理 Horovod 训练任务。
+
+以下是一个 Horovod 在 Volcano 上运行的示例配置:
+
+```yaml
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: lm-horovod-job
+  labels:
+    "volcano.sh/job-type": Horovod
+spec:
+  minAvailable: 4
+  schedulerName: volcano
+  plugins:
+    ssh: []
+    svc: []
+  policies:
+    - event: PodEvicted
+      action: RestartJob
+  tasks:
+    - replicas: 1
+      name: master
+      policies:
+        - event: TaskCompleted
+          action: CompleteJob
+      template:
+        spec:
+          containers:
+            - command:
+                - /bin/sh
+                - -c
+                - |
+                  WORKER_HOST=`cat /etc/volcano/worker.host | tr "\n" ","`;
+                  mkdir -p /var/run/sshd; /usr/sbin/sshd;
+                  mpiexec --allow-run-as-root --host ${WORKER_HOST} -np 3 python tensorflow_mnist_lm.py;
+              image: volcanosh/horovod-tf-mnist:0.5
+              name: master
+              ports:
+                - containerPort: 22
+                  name: job-port
+              resources:
+                requests:
+                  cpu: "500m"
+                  memory: "1024Mi"
+                limits:
+                  cpu: "500m"
+                  memory: "1024Mi"
+          restartPolicy: OnFailure
+          imagePullSecrets:
+            - name: default-secret
+    - replicas: 3
+      name: worker
+      template:
+        spec:
+          containers:
+            - command:
+                - /bin/sh
+                - -c
+                - |
+                  mkdir -p /var/run/sshd; /usr/sbin/sshd -D;
+              image: volcanosh/horovod-tf-mnist:0.5
+              name: worker
+              ports:
+                - containerPort: 22
+                  name: job-port
+              resources:
+                requests:
+                  cpu: "1000m"
+                  memory: "2048Mi"
+                limits:
+                  cpu: "1000m"
+                  memory: "2048Mi"
+          restartPolicy: OnFailure
+          imagePullSecrets:
+            - name: default-secret
+```
+
+在这个配置中,我们定义了一个 Horovod 分布式训练作业,包含以下关键内容:
+
+1. 任务结构:由1个master节点和3个worker节点组成,总共4个Pod
+2. 通信机制:利用Volcano的SSH插件实现节点间通信
+3. 资源分配:master节点分配较少资源(500m CPU/1Gi内存),worker节点分配更多资源(1000m CPU/2Gi内存)
+4. 容错机制:当Pod被驱逐时,整个作业会重启
+5. 作业完成策略:当master任务完成时,整个作业被标记为完成
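+
+作为参考,假设上述清单已保存为 `horovod-job.yaml`,可以用如下命令提交作业并跟踪其状态(`vcjob` 是 Volcano Job 资源的简称):
+
+```
+kubectl apply -f horovod-job.yaml
+kubectl get vcjob lm-horovod-job -w
+```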
diff --git a/content/zh/docs/kubeflow_on_volcano.md b/content/zh/docs/kubeflow_on_volcano.md
index 127b6590..fdc8d00a 100644
--- a/content/zh/docs/kubeflow_on_volcano.md
+++ b/content/zh/docs/kubeflow_on_volcano.md
@@ -2,7 +2,7 @@
 title = "Kubeflow on Volcano"
 
 date = 2021-04-07
-lastmod = 2021-04-07
+lastmod = 2025-07-20
 
 draft = false # Is this a draft? true/false
 toc = true # Show table of contents? true/false
@@ -41,7 +41,9 @@ Kubeflow诞生于2017年,Kubeflow项目是基于容器和Kubernetes构建,
 
 Volcano是一款构建于Kubernetes之上的增强型高性能计算任务批量处理系统。作为一个面向高性能计算场景的平台,它弥补了kubernetes在机器学习、深度学习、HPC、大数据计算等场景下的基本能力缺失,其中包括gang-schedule的调度能力、计算任务队列管理、task-topology和GPU亲和性调度。另外,Volcano在原生kubernetes能力基础上对计算任务的批量创建及生命周期管理、fair-share、binpack调度等方面做了增强。Volcano充分解决了上文提到的Kubeflow分布式训练面临的问题。
 
-#### 下载kfctl
+#### TFJob
+
+##### 下载kfctl
 
 首先需要下载kfctl,可以根据系统来选择合适的压缩包文件[1]。
 
@@ -50,9 +52,7 @@
 $ tar -xvf kfctl_v1.0.2-0-ga476281_linux.tar.gz
 $ sudo mv ./kfctl /usr/local/bin/kfctl
 ```
 
-
-
-#### 配置环境变量
+##### 配置环境变量
 
 ```
 $ export PATH= $PATH:""
@@ -62,9 +62,7 @@ $ export KF_DIR=${BASE_DIR}/${KF_NAME}
 $ export CONFIG_URI="https://raw.githubusercontent.com/kubeflow/manifests/v1.0-branch/kfdef/kfctl_k8s_istio.v1.0.2.yaml"
 ```
 
-
-
-#### 安装kubeflow
+##### 安装kubeflow
 
 ```
 $ mkdir -p ${KF_DIR}
@@ -78,9 +76,7 @@ $ Kfctl apply -V -f ${CONFIG_URI}
 $ kubectl -n kubeflow get all
 ```
 
-
-
-#### 部署Mnist示例
+##### 部署Mnist示例
 
 首先下载kubuflow官方提供的测试集。
 
@@ -88,16 +84,12 @@
 git clone https://github.com/kubeflow/examples.git
 ```
 
-
-
 ```
 pip3 install jupyter notebook
 jupyter notebook --allow-root ##启动jupyter
 ```
 
-
-
-#### 启动使用notebook
+##### 启动使用notebook
 
 提供对外接口服务,这里需要将集群下的节点绑定公网IP。如果没有安装notebook请先使用pip3安装。
 
@@ -113,9 +105,7 @@
 访问公网IP:30200,输入配置密码即可进入notebook。
 
-
-
-#### 在notebook上运行官方实例[2]
+##### 在notebook上运行官方实例
 
 1.打开notebook进行TFJob的部署。Open the notebook `mnist/mnist_vanilla_k8s.ipynb` ,根据指引来进行分布式Tf Job的部署。
 
@@ -250,4 +240,104 @@
 
 ```
 kubectl apply -f mnist.yaml
-```
\ No newline at end of file
+```
+
+#### 群组调度
+
+群组调度(Gang Scheduling)是一种调度策略,主要用于分布式/并行任务。它保证一组 Pod(通常属于同一个分布式训练任务)要么一起启动,要么都不启动,以避免只有部分 Pod 运行而导致训练失败或资源浪费。
+
+Kubeflow 支持通过 Volcano 实现群组调度。您必须首先在集群中安装 Volcano 调度器,作为 Kubernetes 的辅助调度器,并配置 Operator 以选择用于群组调度的调度器名称,如下所示:
+
+- training-operator
+
+```diff
+...
+ spec:
+ containers:
+ - command:
+ - /manager
++ - --gang-scheduler-name=volcano
+ image: kubeflow/training-operator
+ name: training-operator
+...
+```
+
+- mpi-operator
+
+```diff
+...
+ spec:
+ containers:
+ - args:
++ - --gang-scheduling=volcano
+ - -alsologtostderr
+ - --lock-namespace=mpi-operator
+ image: mpioperator/mpi-operator:0.4.0
+ name: mpi-operator
+...
+``` + +说明: Volcano 调度器与 Kubeflow 中的 Operator 是通过[PodGroup](https://volcano.sh/en/docs/podgroup/)实现 gang-scheduling 的,Operator 会自动创建作业对应的 PodGroup。 + +如果要查看详细信息,请查看[链接](https://www.kubeflow.org/docs/components/trainer/legacy-v1/user-guides/job-scheduling/)了解更多。 + +#### 通过Arena + +Arena 是一个命令行工具,简化了 AI 训练和批量作业在 Kubernetes(包括 Volcano)上的提交和管理流程。 + +##### 提交volcanojob + +``` +$ arena submit volcanojob --name=demo + +configmap/demo-volcanojob created +configmap/demo-volcanojob labeled +job.batch.volcano.sh/demo created +INFO[0003] The Job demo has been submitted successfully +INFO[0003] You can run `arena get demo --type volcanojob` to check the job status +``` + +可以指定更多的参数: + +``` +$ arena submit volcanojob --name demo12 --taskImages busybox,busybox --taskReplicas 2 +``` + +##### 获取volcanojob详细信息 + +``` +arena get --type volcanojob demo12 + +STATUS: SUCCEEDED +NAMESPACE: default +TRAINING DURATION: 2m + +NAME STATUS TRAINER AGE INSTANCE NODE +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-0-0 11.245.101.184 +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-0-1 11.245.101.184 +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-1-0 11.245.101.184 +demo12 SUCCEEDED VOLCANOJOB 2m demo12-task-1-1 11.245.101.184 +``` + +它创建了两个任务,每个任务有 2 个副本。 + +##### 删除volcanojob + +``` +$ arena delete --type=volcanojob demo + +job.batch.volcano.sh "demo" deleted +configmap "demo-volcanojob" deleted +INFO[0000] The Job demo has been deleted successfully +``` + +##### 查看所有volcanojob + +``` +$ arena list + +NAME STATUS TRAINER AGE NODE +demo RUNNING VOLCANOJOB 2m 11.245.101.184 +``` + +如果要查看详细信息,请查看[链接](https://github.com/kubeflow/arena/blob/master/docs/training/volcanojob/volcanojob.md)了解更多。 \ No newline at end of file diff --git a/content/zh/docs/pytorch_on_volcano.md b/content/zh/docs/pytorch_on_volcano.md new file mode 100644 index 00000000..eea2edca --- /dev/null +++ b/content/zh/docs/pytorch_on_volcano.md @@ -0,0 +1,76 @@ ++++ +title = "Pytorch on Volcano" + +date = 2021-06-29 +lastmod = 2021-06-29 + +draft = false # Is this a draft? true/false +toc = true # Show table of contents? true/false +type = "docs" # Do not modify. + +# Add menu entry to sidebar. 
+linktitle = "Pytorch"
+[menu.docs]
+  parent = "zoology"
+  weight = 6
+
++++
+
+### PyTorch简介
+
+PyTorch 是一个开源的机器学习框架,由 Facebook(现 Meta)AI 研究团队开发。它以其动态计算图和直观的 Python 接口而闻名,使研究人员和开发者能够更灵活地构建和训练深度学习模型。PyTorch 提供了强大的 GPU 加速功能,支持分布式训练,并拥有丰富的工具生态系统。
+
+### PyTorch on Volcano
+
+Volcano 对 PyTorch 的支持通过 PyTorch 插件实现,它不仅允许用户编写更少的 YAML 配置,还能确保 PyTorch 作业的正常运行。
+
+PyTorch 插件将完成三项任务:
+
+- 为作业的所有容器开放 PyTorch 使用的端口
+- 强制启用 `svc` 插件
+- 自动为容器添加 PyTorch 分布式训练所需的环境变量,如 `MASTER_ADDR`、`MASTER_PORT`、`WORLD_SIZE`、`RANK` 等
+
+#### 参数列表
+
+| 序号 | 名称   | 类型   | 默认值 | 是否必需 | 描述                   | 示例            |
+| ---- | ------ | ------ | ------ | -------- | ---------------------- | --------------- |
+| 1    | master | 字符串 | master | 否       | PyTorch 主节点的名称   | --master=master |
+| 2    | worker | 字符串 | worker | 否       | PyTorch 工作节点的名称 | --worker=worker |
+| 3    | port   | 字符串 | 23456  | 否       | 为容器开放的端口       | --port=23456    |
+
+#### 示例
+
+```yaml
+apiVersion: batch.volcano.sh/v1alpha1
+kind: Job
+metadata:
+  name: pytorch-job
+spec:
+  minAvailable: 1
+  schedulerName: volcano
+  plugins:
+    pytorch: ["--master=master","--worker=worker","--port=23456"] # PyTorch 插件注册
+  tasks:
+    - replicas: 1
+      name: master
+      policies:
+        - event: TaskCompleted
+          action: CompleteJob
+      template:
+        spec:
+          containers:
+            - image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
+              imagePullPolicy: IfNotPresent
+              name: master
+          restartPolicy: OnFailure
+    - replicas: 2
+      name: worker
+      template:
+        spec:
+          containers:
+            - image: gcr.io/kubeflow-ci/pytorch-dist-sendrecv-test:1.0
+              imagePullPolicy: IfNotPresent
+              name: worker
+              workingDir: /home
+          restartPolicy: OnFailure
+```
diff --git a/content/zh/docs/ray_on_volcano.md b/content/zh/docs/ray_on_volcano.md
new file mode 100644
index 00000000..157b4547
--- /dev/null
+++ b/content/zh/docs/ray_on_volcano.md
@@ -0,0 +1,94 @@
++++
+title = "Ray on Volcano"
+
+date = 2025-07-20
+lastmod = 2025-07-20
+
+draft = false # Is this a draft? true/false
+toc = true # Show table of contents? true/false
+type = "docs" # Do not modify.
+
+# Add menu entry to sidebar.
+linktitle = "Ray"
+[menu.docs]
+  parent = "zoology"
+  weight = 6
+
++++
+
+### Ray简介
+
+如今,机器学习工作负载的计算密集度日益提升。单节点开发环境(例如笔记本电脑)虽然便捷,但无法扩展以满足这些需求。
+
+Ray 是一种将 Python 和 AI 应用程序从笔记本电脑扩展到集群的统一方法。
+
+使用 Ray,您可以将同一段代码从笔记本电脑无缝扩展到集群。Ray 的设计目标是通用,这意味着它可以高效地运行任何类型的工作负载。如果您的应用程序是用 Python 编写的,您可以使用 Ray 进行扩展,无需其他基础架构。
+
+KubeRay 是一个功能强大的开源 Kubernetes Operator,可简化 Ray 应用程序在 Kubernetes 上的部署和管理。
+
+### Ray on Volcano
+
+KubeRay 的 Volcano 集成能够在多租户 Kubernetes 环境中更高效地调度 Ray Pod。
+
+#### 安装 KubeRay Operator
+
+要部署 KubeRay Operator 并启用 Volcano 批量调度支持,可以通过以下两种方式实现:
+
+##### 方式一:使用 values.yaml 配置文件
+
+在 values.yaml 文件中设置 Volcano 作为批量调度器:
+
+```yaml
+# values.yaml 文件
+batchScheduler:
+  name: volcano
+```
+
+然后使用此配置文件安装 Helm chart:
+
+```bash
+helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 -f values.yaml
+```
+
+##### 方式二:使用命令行参数
+
+直接在 Helm 安装命令中通过 `--set` 参数指定批量调度器:
+
+```bash
+helm install kuberay-operator kuberay/kuberay-operator --version 1.4.2 --set batchScheduler.name=volcano
+```
+
+#### 安装 RayCluster 并配置 Volcano 调度
+
+要使用 Volcano 调度器管理 RayCluster,请按照以下步骤操作:
+
+##### 基本安装
+
+1. 下载支持 Volcano 调度的 RayCluster 示例配置:
+
+   ```bash
+   curl -LO https://raw.githubusercontent.com/ray-project/kuberay/v1.4.2/ray-operator/config/samples/ray-cluster.volcano-scheduler.yaml
+   ```
+
+2. 应用配置创建 RayCluster:
+
+   ```bash
+   kubectl apply -f ray-cluster.volcano-scheduler.yaml
+   ```
+
+3. 验证集群状态:
+
+   ```bash
+   kubectl get pod -l ray.io/cluster=test-cluster-0
+   ```
+
+   成功部署后应显示如下输出:
+
+   ```
+   NAME                        READY   STATUS    RESTARTS   AGE
+   test-cluster-0-head-jj9bg   1/1     Running   0          36s
+   ```
+
+接下来,Volcano 和 KubeRay 即可协同工作。如果要查看详细信息,请查看[链接](https://docs.ray.io/en/master/cluster/kubernetes/k8s-ecosystem/volcano.html)了解更多。
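+
+作为补充,如果希望把 RayCluster 提交到指定的 Volcano 队列,可以先创建一个队列(以下字段取值仅作示意),再通过 RayCluster 上的 `volcano.sh/queue-name` 标签引用它:
+
+```yaml
+apiVersion: scheduling.volcano.sh/v1beta1
+kind: Queue
+metadata:
+  name: kuberay-test-queue
+spec:
+  weight: 1     # 队列间按权重分享集群资源
+  capability:   # 该队列可使用的资源上限
+    cpu: 4
+    memory: 6Gi
+```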