diff --git a/README.md b/README.md index a88c74f..cfbbe4a 100644 --- a/README.md +++ b/README.md @@ -1,53 +1,38 @@ -# Running RDMA (remote direct memory access) GPU workloads on OKE using GPU Operator and Network Operator - +# Running RDMA (remote direct memory access) GPU workloads on OKE Oracle Cloud Infrastructure Container Engine for Kubernetes (OKE) is a fully-managed, scalable, and highly available service that you can use to deploy your containerized applications to the cloud. Please visit the [OKE documentation page](https://docs.oracle.com/en-us/iaas/Content/ContEng/Concepts/contengoverview.htm) for more information. -This guide has the instructions for deploying an OKE cluster using H100 & A100 bare metal nodes with RDMA connectivity using the [GPU Operator](https://github.com/NVIDIA/gpu-operator) and [Network Operator](https://github.com/Mellanox/network-operator). - -> [!IMPORTANT] -> Currently, creating SR-IOV Virtual Functions is supported in limited regions. For H100, all regions with H100s are supported. For A100s, Phoenix (PHX) and Osaka (KIX) regions are supported. For other regions, please contact your sales representative. - -### What is NVIDIA GPU Operator? -Kubernetes provides access to special hardware resources such as NVIDIA GPUs, NICs, Infiniband adapters and other devices through the device plugin framework. However, configuring and managing nodes with these hardware resources requires configuration of multiple software components such as drivers, container runtimes or other libraries which are difficult and prone to errors. The NVIDIA GPU Operator uses the operator framework within Kubernetes to automate the management of all NVIDIA software components needed to provision GPU. These components include the NVIDIA drivers (to enable CUDA), Kubernetes device plugin for GPUs, the NVIDIA Container Runtime, automatic node labelling, DCGM based monitoring and others. - -### What is NVIDIA Network Operator? 
-NVIDIA Network Operator leverages Kubernetes CRDs and Operator SDK to manage Networking related Components in order to enable Fast networking, RDMA and GPUDirect for workloads in a Kubernetes cluster. - -The Goal of Network Operator is to manage all networking related components to enable execution of RDMA and GPUDirect RDMA workloads in a kubernetes cluster. - ### Supported Operating Systems -For the A100 and H100 shapes (BM.GPU.H100.8, BM.GPU.A100-v2.8, BM.GPU4.8), Oracle Linux 8 with the Red Hat Compatible Kernel (RHCK) is supported. +For the A100 and H100 shapes (BM.GPU.H100.8, BM.GPU.A100-v2.8, BM.GPU4.8, BM.GPU.B4.8), Ubuntu 22.04 is supported. ### Required policies -The Terraform deployment template uses the [Self Managed Nodes](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengworkingwithselfmanagednodes.htm) functionality of OKE. +The OCI Resource Manager stack template uses the [Self Managed Nodes](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengworkingwithselfmanagednodes.htm) functionality of OKE. -You must create the necessary OKE policies: +The policies below are required. The OCI Resource Manager stack will create them for you if you have the necessary permissions. If you don't have the permissions, please find more information about the policies below. - [Policy Configuration for Cluster Creation and Deployment](https://docs.oracle.com/en-us/iaas/Content/ContEng/Concepts/contengpolicyconfig.htm) - [Creating a Dynamic Group and a Policy for Self-Managed Nodes](https://docs.oracle.com/en-us/iaas/Content/ContEng/Tasks/contengdynamicgrouppolicyforselfmanagednodes.htm) ## Instructions for deploying an OKE cluster with GPUs and RDMA connectivity +You will need a CPU pool and a GPU pool. The OCI Resource Manager stack deploys an operational worker pool by default and you can choose to deploy additional CPU/GPU worker pools. -You will need a CPU and a GPU pool. 
The Terraform template deploys an operational/system worker pool (CPU) and a GPU worker pool. - -The GPU pool requires you to use an image provided by the Oracle HPC team, you can find the import link below. This image included the OFED drivers and necessary packages configured for RDMA. - -For the non-GPU worker pools, you can use the default OKE images (no need to specify them in the Terraform template). +You can use the below image for both CPU and GPU pools. > [!NOTE] -> The GPU image has the GPU drivers pre-installed (GPU driver version 535.154.05 with CUDA 12.2). Deploying the GPU driver as a container with the GPU Operator is currently not supported. +> The GPU image has the GPU drivers pre-installed (GPU driver version 535.154.05 with CUDA 12.2). #### Image to import and use for the H100 and A100 nodes -[OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-OKE-2024.02.12-0](https://objectstorage.us-ashburn-1.oraclecloud.com/p/f6mKO0d_OG7gL4EyE5rvOWObL6LBgQ1XXtpM2H67SYmFHQ-tBwxyg7Wmii94VYc8/n/hpc_limited_availability/b/images/o/OracleLinux-8-OCA-RHCK-OFED-5.8-3.0.7.0-GPU-535-OKE-2024.02.12-0) +You can use the instructions [here](https://docs.oracle.com/en-us/iaas/Content/Compute/Tasks/imageimportexport.htm#Importing) for importing the below image to your tenancy. + +[Image to import](https://objectstorage.ca-toronto-1.oraclecloud.com/p/oXC6BcCkB0lXhycxV-0UuDqGGnVtFWfLOkwuJWA5WbsBDb4FkHwnsOHa_ElRcfL2/n/hpc_limited_availability/b/images/o/Ubuntu-22-OCA-OFED-23.10-2.1.3.1-GPU-535-CUDA-12.2-2024.03.15-0) -### Deploy the cluster using the Terraform template -You can find the template in the [terraform directory](./terraform/). +### Deploy the cluster using the Oracle Cloud Resource Manager template +You can easily deploy the cluster using the **Deploy to Oracle Cloud** button below. -Make sure to update the variables in the `worker pools` blocks. 
+[![Deploy to Oracle Cloud](https://oci-resourcemanager-plugin.plugins.oci.oraclecloud.com/latest/deploy-to-oracle-cloud.svg)](https://cloud.oracle.com/resourcemanager/stacks/create?zipUrl=https://github.com/oracle-quickstart/oci-hpc-oke/releases/download/v24.6.0/oke-rdma-quickstart-v24.6.0.zip) -You can find more information on setting up Terraform for OCI [here](https://docs.oracle.com/en-us/iaas/developer-tutorials/tutorials/tf-provider/01-summary.htm). +For the image ID, use the ID of the image that you imported in the previous step. The template will deploy a `bastion` instance and an `operator` instance. The `operator` instance will have access to the OKE cluster. You can connect to the `operator` instance via SSH with `ssh -J opc@ opc@`. @@ -61,136 +46,67 @@ NAME STATUS ROLES AGE VERSION 10.0.127.206 Ready node 2d3h v1.25.6 10.0.127.32 Ready node 2d3h v1.25.6 10.0.83.93 Ready 2d23h v1.25.6 -10.0.96.81 Ready node 2d23h v1.25.6 -``` - -### Get the latest Helm 3 version -```sh -curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 -chmod 700 get_helm.sh -./get_helm.sh -``` - -### Add Helm repos for Network Operator and GPU Operator -```sh -helm repo add nvidia https://helm.ngc.nvidia.com/nvidia -helm repo update -``` - -### Deploy GPU Operator -``` -helm install --wait \ - -n gpu-operator --create-namespace \ - gpu-operator nvidia/gpu-operator \ - --version v23.9.1 \ - --set driver.enabled=false \ - --set operator.defaultRuntime=crio \ - --set toolkit.version=v1.14.5-ubi8 \ - --set driver.rdma.enabled=true \ - --set driver.rdma.useHostMofed=true -``` - -Wait until all network operator pods are running with `kubectl get pods -n gpu-operator`. - -### Deploy Network Operator - -> [!IMPORTANT] -> The device name you will use when deploying the Network Operator is different between A100 and H100 shapes. Please make sure that you are running the correct command based on your shape. 
- -#### A100 shapes (BM.GPU.A100-v2.8, BM.GPU4.8) -``` -helm install --wait \ - -n network-operator --create-namespace \ - network-operator nvidia/network-operator \ - --version v23.10.0 \ - --set deployCR=true \ - --set nfd.enabled=false \ - --set rdmaSharedDevicePlugin.deploy=false \ - --set nvPeerDriver.deploy=true \ - --set sriovDevicePlugin.deploy=true \ - --set secondaryNetwork.ipamPlugin.deploy=false \ - --set nvIpam.deploy=true \ - --set-json sriovDevicePlugin.resources='[{"name": "sriov_rdma_vf", "drivers": ["mlx5_core"], "devices": ["101a"], "isRdma": [true]}]' -``` - -#### H100 shapes (BM.GPU.H100.8) -``` -helm install --wait \ - -n network-operator --create-namespace \ - network-operator nvidia/network-operator \ - --version v23.10.0 \ - --set deployCR=true \ - --set nfd.enabled=false \ - --set rdmaSharedDevicePlugin.deploy=false \ - --set nvPeerDriver.deploy=true \ - --set sriovDevicePlugin.deploy=true \ - --set secondaryNetwork.ipamPlugin.deploy=false \ - --set nvIpam.deploy=true \ - --set-json sriovDevicePlugin.resources='[{"name": "sriov_rdma_vf", "drivers": ["mlx5_core"], "devices": ["101e"], "isRdma": [true]}]' -``` - -### Deploy SR-IOV CNI +10.0.96.82 Ready node 2d23h v1.25.6 ``` -kubectl apply -f https://raw.githubusercontent.com/openshift/sriov-cni/master/images/k8s-v1.16/sriov-cni-daemonset.yaml -``` - -### Deploy RDMA CNI -``` -kubectl apply -f https://raw.githubusercontent.com/k8snetworkplumbingwg/rdma-cni/master/deployment/rdma-cni-daemonset.yaml -``` - -Wait until all network operator pods are running with `kubectl get pods -n network-operator`. 
- -### Deploy the Virtual Function Configuration daemonset -``` -kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/vf-config.yaml -``` -### Create Network Attachment Definition -```sh -kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/network-attachment-definition.yaml -``` - -### Create the IP Pool for Nvidia IPAM -``` -kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/ip-pool.yaml -``` - -### Create the topology ConfigMap -This step creates a ConfigMap that can be used as the NCCL topology file when running your jobs that use NCCL as the backend. +### Using the host RDMA network interfaces in manifests +In order to use the RDMA interfaces on the host in your pods, you should have the below sections in your manifests: -You can find the topology files in the [topology directory](https://github.com/oracle-quickstart/oci-hpc-oke/tree/main/manifests/topology) in this repo. Please make sure you use the correct topology file based on your shape when creating the ConfigMap. - -``` -SHAPE= - -curl -s -o ./topo.xml https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/topology/$SHAPE.xml - -kubectl create configmap nccl-topology --from-file ./topo.xml +```yaml +spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} ``` -### Confirm that the GPUs are Virtual Functions (VFs) are correctly exposed -Once the Network Operator pods are deployed, the GPU nodes with RDMA NICs will start reporting `nvidia.com/sriov_rdma_vf` as an available resource. You can request that resource in your pod manifests for assigning RDMA VFs to pods. - -By default, we create one Virtual Function per Physical Function. 
So for the H100 and A100 bare metal shapes, you will see 16 VFs per node exposed as a resource. - +```yaml +securityContext: + privileged: true + capabilities: + add: [ "IPC_LOCK" ] ``` -kubectl get nodes -l 'node.kubernetes.io/instance-type in (BM.GPU.H100.8, BM.GPU.A100-v2.8, BM.GPU4.8, BM.GPU.B4.8)' --sort-by=.status.capacity."nvidia\.com/gpu" -o=custom-columns='NODE:metadata.name,GPUs:status.capacity.nvidia\.com/gpu,RDMA-VFs:status.capacity.nvidia\.com/sriov_rdma_vf' - -NODE GPUs RDMA-VFs -10.79.148.115 8 16 -10.79.151.167 8 16 -10.79.156.205 8 16 +```yaml + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } ``` - -### Requesting VFs in manifests -Network Operator exposes the RDMA Virtual Functions (VFs) as allocatable resources. To use them, you need to add the following annotation to your manifests. The next step in this guide has an example for running the NCCL test, you can use that manifest as an example. +Here's a simple example. 
You can also look at the NCCL test manifests in the repo [here.](../manifests/) ```yaml - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov +apiVersion: v1 +kind: Pod +metadata: + name: rdma-test-pod-1 +spec: + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + restartPolicy: OnFailure + containers: + - image: oguzpastirmaci/mofed-perftest:5.4-3.6.8.1-ubuntu20.04-amd64 + name: mofed-test-ctr + securityContext: + privileged: true + capabilities: + add: [ "IPC_LOCK" ] + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + resources: + requests: + cpu: 8 + ephemeral-storage: 32Gi + memory: 2Gi + command: + - sh + - -c + - | + ls -l /dev/infiniband /sys/class/net + sleep 1000000 ``` ### Optional - Deploy Volcano and run the NCCL test @@ -207,16 +123,26 @@ kubectl create rolebinding default-view --namespace default --serviceaccount def #### Run the NCCL test > [!IMPORTANT] -> The NCCL parameters are different between the H100 and A100 shapes. Please make sure that you are using the correct manifest. +> The NCCL parameters are different between the H100 and A100 shapes. Please make sure that you are using the correct manifest for your bare metal GPU shapes. 
+ +##### BM.GPU.H100 +``` +kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/BM.GPU.H100.8-nccl-test.yaml +``` + +##### BM.GPU.A100-v2.8 +``` +kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/BM.GPU.A100-v2.8-nccl-test.yaml +``` -##### H100 +##### BM.GPU4.8 ``` -kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/h100-nccl-test.yaml +kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/BM.GPU4.8-nccl-test.yaml ``` -##### A100 +##### BM.GPU.B4.8 ``` -kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/a100-nccl-test.yaml +kubectl apply -f https://raw.githubusercontent.com/oracle-quickstart/oci-hpc-oke/main/manifests/BM.GPU.B4.8-nccl-test.yaml ``` The initial pull of the container will take long. Once the master pod `nccl-allreduce-job0-mpimaster-0` starts running, you can check it logs for the NCCL test result. 
diff --git a/manifests/BM.GPU.A100-v2.8-nccl-test.yaml b/manifests/BM.GPU.A100-v2.8-nccl-test.yaml new file mode 100644 index 0000000..21d2ff2 --- /dev/null +++ b/manifests/BM.GPU.A100-v2.8-nccl-test.yaml @@ -0,0 +1,118 @@ +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + annotations: + name: nccl-allreduce-job0 +spec: + minAvailable: 0 + plugins: + ssh: [] + svc: [] + queue: default + schedulerName: volcano + tasks: + - name: mpimaster + policies: + - action: CompleteJob + event: TaskCompleted + replicas: 1 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - | + set -e -o pipefail; trap 'exit=1' SIGINT + NUM_GPUS=8 + NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host) + NP=$(($NUM_HOSTS*$NUM_GPUS)) + mpirun --allow-run-as-root \ + -mca coll ^hcoll -mca plm_rsh_args "-p 2222" \ + -mca coll_hcoll_enable 0 \ + -np $NP -npernode $NUM_GPUS --bind-to numa \ + -hostfile /etc/volcano/mpiworker.host \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TIMEOUT=22 \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=eth0 \ + /workspace/nccl-tests/build/all_reduce_perf -b 1G -f 2 -g 1 -e 4G -c 1 + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpimaster + resources: + limits: + ephemeral-storage: 32Gi + requests: + cpu: 8 + ephemeral-storage: 32Gi + memory: 2Gi + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + 
restartPolicy: OnFailure + terminationGracePeriodSeconds: 2 + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - minAvailable: 0 + name: mpiworker + replicas: 2 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999; + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpiworker + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + resources: + limits: + ephemeral-storage: 32Gi + nvidia.com/gpu: 8 + requests: + cpu: 64 + ephemeral-storage: 32Gi + memory: 512Gi + nvidia.com/gpu: 8 + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 15 + tolerations: + - { key: nvidia.com/gpu, operator: Exists } + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} \ No newline at end of file diff --git a/manifests/BM.GPU.B4.8-nccl-test.yaml b/manifests/BM.GPU.B4.8-nccl-test.yaml new file mode 100644 index 0000000..21d2ff2 --- /dev/null +++ b/manifests/BM.GPU.B4.8-nccl-test.yaml @@ -0,0 +1,118 @@ +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + annotations: + name: nccl-allreduce-job0 +spec: + minAvailable: 0 + plugins: + ssh: [] + svc: [] + queue: default + schedulerName: volcano + tasks: + - name: mpimaster + policies: + - action: CompleteJob + event: TaskCompleted + replicas: 1 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - | + set -e -o pipefail; trap 'exit=1' SIGINT + NUM_GPUS=8 + NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host) + 
NP=$(($NUM_HOSTS*$NUM_GPUS)) + mpirun --allow-run-as-root \ + -mca coll ^hcoll -mca plm_rsh_args "-p 2222" \ + -mca coll_hcoll_enable 0 \ + -np $NP -npernode $NUM_GPUS --bind-to numa \ + -hostfile /etc/volcano/mpiworker.host \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_HCA==mlx5_1,mlx5_2,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_14,mlx5_15,mlx5_16,mlx5_17,mlx5_9,mlx5_10,mlx5_11,mlx5_12 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TIMEOUT=22 \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=eth0 \ + /workspace/nccl-tests/build/all_reduce_perf -b 1G -f 2 -g 1 -e 4G -c 1 + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpimaster + resources: + limits: + ephemeral-storage: 32Gi + requests: + cpu: 8 + ephemeral-storage: 32Gi + memory: 2Gi + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 2 + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - minAvailable: 0 + name: mpiworker + replicas: 2 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999; + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpiworker + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + resources: + limits: + ephemeral-storage: 32Gi + nvidia.com/gpu: 8 + requests: + cpu: 64 + ephemeral-storage: 32Gi + memory: 512Gi + nvidia.com/gpu: 8 + 
securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 15 + tolerations: + - { key: nvidia.com/gpu, operator: Exists } + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} \ No newline at end of file diff --git a/manifests/BM.GPU.H100.8-nccl-test.yaml b/manifests/BM.GPU.H100.8-nccl-test.yaml new file mode 100644 index 0000000..b433f57 --- /dev/null +++ b/manifests/BM.GPU.H100.8-nccl-test.yaml @@ -0,0 +1,124 @@ +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + annotations: + name: nccl-allreduce-job0 +spec: + minAvailable: 0 + plugins: + ssh: [] + svc: [] + queue: default + schedulerName: volcano + tasks: + - name: mpimaster + policies: + - action: CompleteJob + event: TaskCompleted + replicas: 1 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - | + set -e -o pipefail; trap 'exit=1' SIGINT + NUM_GPUS=8 + NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host) + NP=$(($NUM_HOSTS*$NUM_GPUS)) + mpirun --allow-run-as-root \ + -mca coll ^hcoll -mca plm_rsh_args "-p 2222" \ + -mca coll_hcoll_enable 0 \ + -np $NP -npernode $NUM_GPUS --bind-to numa \ + -hostfile /etc/volcano/mpiworker.host \ + -x NCCL_CROSS_NIC=2 \ + -x NCCL_SOCKET_NTHREADS=16 \ + -x NCCL_DEBUG=WARN \ + -x NCCL_CUMEM_ENABLE=0 \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ + -x NCCL_IB_QPS_PER_CONNECTION=16 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_HCA==mlx5_0,mlx5_1,mlx5_3,mlx5_4,mlx5_5,mlx5_6,mlx5_7,mlx5_8,mlx5_9,mlx5_10,mlx5_12,mlx5_13,mlx5_14,mlx5_15,mlx5_16,mlx5_17 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TIMEOUT=22 \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=eth0 \ + -x 
RX_QUEUE_LEN=8192 \ + -x IB_RX_QUEUE_LEN=8192 \ + -x NCCL_SOCKET_IFNAME=eth0 \ + -x NCCL_IGNORE_CPU_AFFINITY=1 \ + /workspace/nccl-tests/build/alltoall_perf -b 8 -f 2 -g 1 -e 4G -c 1 + while :; do { [[ $exit ]] && break; }; sleep 1; done + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpimaster + resources: + limits: + ephemeral-storage: 32Gi + requests: + cpu: 128 + ephemeral-storage: 32Gi + memory: 512Gi + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 2 + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - minAvailable: 0 + name: mpiworker + replicas: 2 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999; + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpiworker + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + resources: + limits: + ephemeral-storage: 32Gi + nvidia.com/gpu: 8 + requests: + cpu: 100 + ephemeral-storage: 32Gi + memory: 512Gi + nvidia.com/gpu: 8 + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 15 + tolerations: + - { key: nvidia.com/gpu, operator: Exists } + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { 
medium: Memory, sizeLimit: 32Gi }} \ No newline at end of file diff --git a/manifests/BM.GPU4.8-nccl-test.yaml b/manifests/BM.GPU4.8-nccl-test.yaml new file mode 100644 index 0000000..c9d78d0 --- /dev/null +++ b/manifests/BM.GPU4.8-nccl-test.yaml @@ -0,0 +1,118 @@ +apiVersion: batch.volcano.sh/v1alpha1 +kind: Job +metadata: + annotations: + name: nccl-allreduce-job0 +spec: + minAvailable: 0 + plugins: + ssh: [] + svc: [] + queue: default + schedulerName: volcano + tasks: + - name: mpimaster + policies: + - action: CompleteJob + event: TaskCompleted + replicas: 1 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - | + set -e -o pipefail; trap 'exit=1' SIGINT + NUM_GPUS=8 + NUM_HOSTS=$(sed -n '$=' /etc/volcano/mpiworker.host) + NP=$(($NUM_HOSTS*$NUM_GPUS)) + mpirun --allow-run-as-root \ + -mca coll ^hcoll -mca plm_rsh_args "-p 2222" \ + -mca coll_hcoll_enable 0 \ + -np $NP -npernode $NUM_GPUS --bind-to numa \ + -hostfile /etc/volcano/mpiworker.host \ + -x NCCL_DEBUG=WARN \ + -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ + -x NCCL_IB_QPS_PER_CONNECTION=4 \ + -x NCCL_IB_GID_INDEX=3 \ + -x NCCL_IB_HCA==mlx5_0,mlx5_2,mlx5_6,mlx5_8,mlx5_10,mlx5_12,mlx5_14,mlx5_16,mlx5_1,mlx5_3,mlx5_7,mlx5_9,mlx5_11,mlx5_13,mlx5_15,mlx5_17 \ + -x NCCL_IB_TC=41 \ + -x NCCL_IB_SL=0 \ + -x NCCL_IB_TIMEOUT=22 \ + -x HCOLL_ENABLE_MCAST_ALL=0 \ + -x UCX_TLS=tcp \ + -x UCX_NET_DEVICES=eth0 \ + /workspace/nccl-tests/build/all_reduce_perf -b 1G -f 2 -g 1 -e 4G -c 1 + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpimaster + resources: + limits: + ephemeral-storage: 32Gi + requests: + cpu: 8 + ephemeral-storage: 32Gi + memory: 2Gi + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: 
ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 2 + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} + - minAvailable: 0 + name: mpiworker + replicas: 2 + template: + metadata: + spec: + containers: + - command: + - /bin/bash + - -c + - mkdir -p /var/run/sshd; /usr/sbin/sshd -D -p 2222 || sleep 999999999; + image: ord.ocir.io/hpc_limited_availability/nccl-tests:pytorch-24.02-nccl-2.20.5-1 + name: mpiworker + ports: + - { name: mpijob-port, containerPort: 2222, protocol: TCP } + resources: + limits: + ephemeral-storage: 32Gi + nvidia.com/gpu: 8 + requests: + cpu: 64 + ephemeral-storage: 32Gi + memory: 512Gi + nvidia.com/gpu: 8 + securityContext: + privileged: true + capabilities: + add: + - IPC_LOCK + - CAP_SYS_ADMIN + volumeMounts: + - { mountPath: /dev/infiniband, name: devinf } + - { mountPath: /dev/shm, name: shm } + workingDir: /workspace + dnsPolicy: ClusterFirstWithHostNet + hostNetwork: true + restartPolicy: OnFailure + terminationGracePeriodSeconds: 15 + tolerations: + - { key: nvidia.com/gpu, operator: Exists } + volumes: + - { name: devinf, hostPath: { path: /dev/infiniband }} + - { name: shm, emptyDir: { medium: Memory, sizeLimit: 32Gi }} \ No newline at end of file diff --git a/manifests/a100-nccl-test.yaml b/manifests/a100-nccl-test.yaml deleted file mode 100644 index e046f50..0000000 --- a/manifests/a100-nccl-test.yaml +++ /dev/null @@ -1,144 +0,0 @@ -apiVersion: batch.volcano.sh/v1alpha1 -kind: Job -metadata: - name: nccl-allreduce-job0 -spec: - minAvailable: 1 - schedulerName: volcano - plugins: - ssh: [] - svc: [] - queue: default - tasks: - - replicas: 1 - name: mpimaster - policies: - - event: TaskCompleted - action: CompleteJob - template: - spec: - volumes: - - name: topo - configMap: - name: nccl-topology - items: - - key: topo.xml - path: topo.xml - - name: root - hostPath: - path: / - type: Directory - 
initContainers: - - command: - - /bin/bash - - -c - - | - until [[ "$(kubectl get pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker -o json | jq '.items | length')" != 0 ]]; do - echo "Waiting for MPI worker pods..." - sleep 3 - done - echo "Waiting for MPI worker pods to be ready..." - kubectl wait pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker --for=condition=Ready --timeout=600s && sleep 2 - image: aga.ocir.io/hpc_limited_availability/oke/kubectl:latest - name: wait-for-workers - serviceAccount: mpi-worker-view - terminationGracePeriodSeconds: 2 - tolerations: - - key: nvidia.com/gpu - operator: Exists - containers: - - command: - - /bin/bash - - -c - - | - MPI_HOST=$(cat /etc/volcano/mpiworker.host | tr "\n" ",") - mkdir -p /var/run/sshd; /usr/sbin/sshd - mpirun --allow-run-as-root \ - -np 16 -npernode 8 --bind-to numa \ - -hostfile /etc/volcano/mpiworker.host \ - --mca pml ucx -mca coll ^hcoll \ - -x HCOLL_ENABLE_MCAST_ALL=0 \ - -x coll_hcoll_enable=0 \ - -x UCX_NET_DEVICES=eth0 \ - -x NCCL_CROSS_NIC=1 \ - -x NCCL_IB_GID_INDEX=3 \ - -x NCCL_SOCKET_IFNAME==eth0 \ - -x NCCL_IB_QPS_PER_CONNECTION=4 \ - -x NCCL_IB_TC=41 \ - -x NCCL_IB_SL=0 \ - -x NCCL_IB_HCA=mlx5 \ - -x NCCL_TOPO_FILE=/topo/topo.xml \ - /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 8G -c 1; sleep 3600 - image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1 - volumeMounts: - - { mountPath: /topo, name: topo } - - { mountPath: /host, name: root } - securityContext: - capabilities: - add: ["IPC_LOCK"] - name: mpimaster - ports: - - containerPort: 22 - name: mpijob-port - workingDir: /workspace - resources: - requests: - cpu: 2 - memory: 128Mi - ephemeral-storage: 16Gi - restartPolicy: OnFailure - - replicas: 2 - minAvailable: 2 - name: mpiworker - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: 
oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov - spec: - containers: - - name: mpiworker - command: - - /bin/bash - - -c - - mkdir -p /var/run/sshd; /usr/sbin/sshd -D; - image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1 - securityContext: - capabilities: - add: ["IPC_LOCK"] - ports: - - containerPort: 22 - name: mpijob-port - workingDir: /workspace - resources: - requests: - nvidia.com/gpu: 8 - nvidia.com/sriov_rdma_vf: 16 - ephemeral-storage: 1Gi - limits: - nvidia.com/gpu: 8 - nvidia.com/sriov_rdma_vf: 16 - ephemeral-storage: 1Gi - volumeMounts: - - { mountPath: /topo, name: topo } - - mountPath: /dev/shm - name: shm - restartPolicy: OnFailure - terminationGracePeriodSeconds: 15 - tolerations: - - key: nvidia.com/gpu - operator: Exists - volumes: - - name: topo - configMap: - name: nccl-topology - items: - - key: topo.xml - path: topo.xml - - name: root - hostPath: - path: / - type: Directory - - name: shm - emptyDir: - medium: Memory - sizeLimit: 8Gi \ No newline at end of file diff --git a/manifests/h100-nccl-test.yaml b/manifests/h100-nccl-test.yaml deleted file mode 100644 index 3d1da0b..0000000 --- a/manifests/h100-nccl-test.yaml +++ /dev/null @@ -1,153 +0,0 @@ -apiVersion: batch.volcano.sh/v1alpha1 -kind: Job -metadata: - name: nccl-allreduce-job0 -spec: - minAvailable: 1 - schedulerName: volcano - plugins: - ssh: [] - svc: [] - queue: default - tasks: - - replicas: 1 - name: mpimaster - policies: - - event: TaskCompleted - action: CompleteJob - template: - spec: - volumes: - - name: topo - configMap: - name: nccl-topology - items: - - key: topo.xml - path: topo.xml - - name: root - hostPath: - path: / - type: Directory - initContainers: - - command: - - /bin/bash - - -c - - | - until [[ "$(kubectl get pod -l 
volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker -o json | jq '.items | length')" != 0 ]]; do - echo "Waiting for MPI worker pods..." - sleep 3 - done - echo "Waiting for MPI worker pods to be ready..." - kubectl wait pod -l volcano.sh/job-name=nccl-allreduce-job0,volcano.sh/task-spec=mpiworker --for=condition=Ready --timeout=600s && sleep 2 - image: aga.ocir.io/hpc_limited_availability/oke/kubectl:latest - name: wait-for-workers - serviceAccount: mpi-worker-view - terminationGracePeriodSeconds: 2 - tolerations: - - key: nvidia.com/gpu - operator: Exists - containers: - - command: - - /bin/bash - - -c - - | - MPI_HOST=$(cat /etc/volcano/mpiworker.host | tr "\n" ",") - mkdir -p /var/run/sshd; /usr/sbin/sshd - mpirun --allow-run-as-root \ - -np 16 -npernode 8 --bind-to numa \ - -hostfile /etc/volcano/mpiworker.host \ - -x NCCL_CROSS_NIC=1 \ - -x NCCL_SOCKET_NTHREADS=16 \ - -x NCCL_DEBUG=WARN \ - -x NCCL_CUMEM_ENABLE=0 \ - -x NCCL_IB_SPLIT_DATA_ON_QPS=0 \ - -x NCCL_IB_QPS_PER_CONNECTION=16 \ - -x NCCL_IB_GID_INDEX=3 \ - -x NCCL_IB_TC=41 \ - -x NCCL_IB_SL=0 \ - -x NCCL_IB_TIMEOUT=22 \ - -x NCCL_NET_PLUGIN=none \ - -x HCOLL_ENABLE_MCAST_ALL=0 \ - -x coll_hcoll_enable=0 \ - -x UCX_TLS=tcp \ - -x UCX_NET_DEVICES=eth0 \ - -x RX_QUEUE_LEN=8192 \ - -x IB_RX_QUEUE_LEN=8192 \ - -x NCCL_SOCKET_IFNAME=eth0 \ - -x NCCL_IGNORE_CPU_AFFINITY=1 \ - -x NCCL_TOPO_FILE=/topo/topo.xml \ - -mca coll_hcoll_enable 0 -mca coll ^hcoll \ - /workspace/nccl-tests/build/all_reduce_perf -b 8 -f 2 -g 1 -e 8G -c 1; sleep 3600 - image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1 - volumeMounts: - - { mountPath: /topo, name: topo } - - { mountPath: /host, name: root } - securityContext: - capabilities: - add: ["IPC_LOCK"] - name: mpimaster - ports: - - containerPort: 22 - name: mpijob-port - workingDir: /workspace - resources: - requests: - cpu: 2 - memory: 128Mi - ephemeral-storage: 16Gi - restartPolicy: OnFailure - - replicas: 2 - minAvailable: 2 
- name: mpiworker - template: - metadata: - annotations: - k8s.v1.cni.cncf.io/networks: oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov,oci-rdma-sriov - spec: - containers: - - name: mpiworker - command: - - /bin/bash - - -c - - mkdir -p /var/run/sshd; /usr/sbin/sshd -D; - image: iad.ocir.io/hpc_limited_availability/nccl-tests:pytorch-23.10-nccl-2.19.3-1 - securityContext: - capabilities: - add: ["IPC_LOCK"] - ports: - - containerPort: 22 - name: mpijob-port - workingDir: /workspace - resources: - requests: - nvidia.com/gpu: 8 - nvidia.com/sriov_rdma_vf: 16 - ephemeral-storage: 1Gi - limits: - nvidia.com/gpu: 8 - nvidia.com/sriov_rdma_vf: 16 - ephemeral-storage: 1Gi - volumeMounts: - - { mountPath: /topo, name: topo } - - mountPath: /dev/shm - name: shm - restartPolicy: OnFailure - terminationGracePeriodSeconds: 15 - tolerations: - - key: nvidia.com/gpu - operator: Exists - volumes: - - name: topo - configMap: - name: nccl-topology - items: - - key: topo.xml - path: topo.xml - - name: root - hostPath: - path: / - type: Directory - - name: shm - emptyDir: - medium: Memory - sizeLimit: 8Gi \ No newline at end of file diff --git a/manifests/ip-pool.yaml b/manifests/ip-pool.yaml deleted file mode 100644 index 8c9503a..0000000 --- a/manifests/ip-pool.yaml +++ /dev/null @@ -1,9 +0,0 @@ -apiVersion: nv-ipam.nvidia.com/v1alpha1 -kind: IPPool -metadata: - name: default - namespace: network-operator -spec: - subnet: 192.168.0.0/16 - perNodeBlockSize: 100 - gateway: 192.168.0.1 \ No newline at end of file diff --git a/manifests/network-attachment-definition.yaml b/manifests/network-attachment-definition.yaml deleted file mode 100644 index 9a26d9f..0000000 --- a/manifests/network-attachment-definition.yaml +++ /dev/null @@ -1,37 +0,0 @@ ---- -apiVersion: k8s.cni.cncf.io/v1 -kind: 
NetworkAttachmentDefinition -metadata: - annotations: - k8s.v1.cni.cncf.io/resourceName: nvidia.com/sriov_rdma_vf - name: oci-rdma-sriov - namespace: default -spec: - config: |- - { - "cniVersion": "0.3.1", - "name": "oci-rdma-sriov", - "plugins": [ - { - "type": "sriov", - "name": "sriov-network", - "spoofchk": "off", - "ipam": { - "type": "nv-ipam", - "poolName": "default" - } - }, - { "type": "tuning", - "sysctl": { - "net.ipv4.conf.all.arp_announce": "2", - "net.ipv4.conf.all.arp_filter": "1", - "net.ipv4.conf.all.arp_ignore": "1", - "net.ipv4.conf.all.rp_filter": "0", - "net.ipv4.conf.all.accept_local": "1" - }, - "mtu": 4220 - }, - { "type": "rdma" }, - { "type": "sbr" } - ] - } \ No newline at end of file diff --git a/manifests/nvidia-smi.yaml b/manifests/nvidia-smi.yaml deleted file mode 100644 index 2b68f67..0000000 --- a/manifests/nvidia-smi.yaml +++ /dev/null @@ -1,13 +0,0 @@ -apiVersion: v1 -kind: Pod -metadata: - name: nvidia-version-check -spec: - restartPolicy: OnFailure - containers: - - name: nvidia-version-check - image: nvidia/cuda:11.7.1-base-ubuntu20.04 - command: ["nvidia-smi"] - resources: - limits: - nvidia.com/gpu: "8" \ No newline at end of file diff --git a/manifests/sriov-cni-daemonset.yaml b/manifests/sriov-cni-daemonset.yaml deleted file mode 100644 index 69caf38..0000000 --- a/manifests/sriov-cni-daemonset.yaml +++ /dev/null @@ -1,53 +0,0 @@ ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: kube-sriov-cni-ds-amd64 - namespace: kube-system - labels: - tier: node - app: sriov-cni -spec: - selector: - matchLabels: - name: sriov-cni - template: - metadata: - labels: - name: sriov-cni - tier: node - app: sriov-cni - spec: - nodeSelector: - kubernetes.io/arch: amd64 - tolerations: - - key: node-role.kubernetes.io/master - operator: Exists - effect: NoSchedule - - key: nvidia.com/gpu - operator: Exists - containers: - - name: kube-sriov-cni - image: ghcr.io/k8snetworkplumbingwg/sriov-cni - imagePullPolicy: IfNotPresent - 
securityContext: - allowPrivilegeEscalation: true - privileged: true - readOnlyRootFilesystem: true - capabilities: - drop: - - ALL - resources: - requests: - cpu: "100m" - memory: "50Mi" - limits: - cpu: "100m" - memory: "50Mi" - volumeMounts: - - name: cnibin - mountPath: /host/opt/cni/bin - volumes: - - name: cnibin - hostPath: - path: /opt/cni/bin diff --git a/manifests/topology/BM.GPU.A100-v2.8.xml b/manifests/topology/BM.GPU.A100-v2.8.xml deleted file mode 100644 index e4829c0..0000000 --- a/manifests/topology/BM.GPU.A100-v2.8.xml +++ /dev/null @@ -1,198 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/manifests/topology/BM.GPU.H100.8.xml b/manifests/topology/BM.GPU.H100.8.xml deleted file mode 100644 index 802c60c..0000000 --- a/manifests/topology/BM.GPU.H100.8.xml +++ /dev/null @@ -1,166 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/manifests/topology/BM.GPU4.8.xml b/manifests/topology/BM.GPU4.8.xml deleted file mode 100644 index 89a1d7e..0000000 --- a/manifests/topology/BM.GPU4.8.xml +++ /dev/null @@ -1,198 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/manifests/vf-config.yaml b/manifests/vf-config.yaml deleted file mode 100644 index b61220d..0000000 --- a/manifests/vf-config.yaml +++ /dev/null @@ -1,63 +0,0 @@ ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: vf-config - namespace: kube-system -spec: - selector: - matchLabels: - app: vf-config - template: - metadata: - labels: - app: vf-config - spec: - priorityClassName: system-node-critical - hostNetwork: true - tolerations: [{ operator: "Exists" }] - terminationGracePeriodSeconds: 0 - affinity: - nodeAffinity: - requiredDuringSchedulingIgnoredDuringExecution: - nodeSelectorTerms: - - matchExpressions: - - key: node.kubernetes.io/instance-type - operator: In - values: - - BM.GPU.A100-v2.8 - - BM.GPU.B4.8 - - BM.GPU4.8 - - BM.GPU.H100.8 - volumes: [{ name: root, hostPath: { path: "/" } }] - containers: - - name: vf-config - image: oraclelinux:9 - imagePullPolicy: Always - securityContext: - privileged: true - capabilities: - add: [CAP_SYS_ADMIN] - volumeMounts: [{ name: root, mountPath: /host }] - resources: {} - command: - - /usr/bin/bash - - -c - - | - set -e -o pipefail; trap 'exit=1' SIGINT - chroot /host /usr/bin/bash -ex <&2 - crictl rmp -f "\$(crictl pods | grep sriov-device | awk '{print \$1}' | tail -1)" || true - EOF - while :; do { [[ $exit ]] && break; }; sleep 1; done # Sleep forever, exit gracefully \ No newline at end of file diff --git a/terraform/cloud-init/ol8.sh b/terraform/cloud-init/ol8.sh deleted file mode 100644 index 0bbe143..0000000 --- a/terraform/cloud-init/ol8.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash - -# Get the API server endpoint & the CA cert from IMDS -OKE_APISERVER_ENDPOINT=$(curl -sH "Authorization: Bearer Oracle" -L http://169.254.169.254/opc/v2/instance/ | jq -r '.metadata."apiserver_host"') -OKE_KUBELET_CA_CERT=$(curl -sH "Authorization: Bearer Oracle" -L 
http://169.254.169.254/opc/v2/instance/ | jq -r '.metadata."cluster_ca_cert"') - -# Adjust boot volume size -sudo dd iflag=direct if=/dev/oracleoci/oraclevda of=/dev/null count=1 -echo "1" | sudo tee /sys/class/block/`readlink /dev/oracleoci/oraclevda | cut -d'/' -f 2`/device/rescan -sudo /usr/libexec/oci-growfs -y - -bash /etc/oke/oke-install.sh \ - --apiserver-endpoint $OKE_APISERVER_ENDPOINT \ - --kubelet-ca-cert $OKE_KUBELET_CA_CERT \ No newline at end of file diff --git a/terraform/cloud-init/ubuntu.sh b/terraform/cloud-init/ubuntu.sh new file mode 100644 index 0000000..b5967ef --- /dev/null +++ b/terraform/cloud-init/ubuntu.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +# Add OKE repo & install the package +add-apt-repository -y 'deb [trusted=yes] https://objectstorage.us-phoenix-1.oraclecloud.com/p/ryJWdnkQSeI4ruDo9Jh77saOd5XTmORuzjv1k7GmxegExdR4atsUW2y4n7GWjkwq/n/hpc_limited_availability/b/oke_node_repo/o/ubuntu stable main' + +apt install -y oci-oke-node-all=1.27.2* + +oke bootstrap --manage-gpu-services \ No newline at end of file diff --git a/terraform/main.tf b/terraform/main.tf index e65e963..c4194c8 100644 --- a/terraform/main.tf +++ b/terraform/main.tf @@ -1,6 +1,6 @@ module "oke" { source = "oracle-terraform-modules/oke/oci" - version = "5.1.1" + version = "5.1.7" # Provider providers = { oci.home = oci.home } @@ -39,7 +39,7 @@ module "oke" { ocpus = 16, memory = 64, size = 3, - cloud_init = [{ content = "./cloud-init/ol8.sh" }], + cloud_init = [{ content = "./cloud-init/ubuntu.sh" }], } gpu = { @@ -53,7 +53,7 @@ module "oke" { image_type = "custom", image_id = var.gpu_image, node_labels = { "oci.oraclecloud.com/disable-gpu-device-plugin" : "true" }, - cloud_init = [{ content = "./cloud-init/ol8.sh" }], + cloud_init = [{ content = "./cloud-init/ubuntu.sh" }], agent_config = { are_all_plugins_disabled = false, is_management_disabled = false, diff --git a/terraform/variables.tf b/terraform/variables.tf index e31d81e..4ef4a9c 100644 --- a/terraform/variables.tf 
+++ b/terraform/variables.tf @@ -7,10 +7,10 @@ variable "ssh_public_key_path" { type = string } variable "ssh_private_key_path" { type = string } variable system_pool_image { default = "" } -variable a100_image { default = "" } -variable a100_shape { default = "" } +variable gpu_image { default = "" } +variable gpu_shape { default = "" } variable kubernetes_version { default = "v1.27.2" } variable cluster_type { default = "enhanced" } -variable cluster_name { default = "a100-cluster" } +variable cluster_name { default = "gpu-cluster" } variable cni_type {default = "flannel"} variable cluster_name { default = "oke-gpu-rdma-quickstart" } \ No newline at end of file