Draft (changes from 9 commits)
3 changes: 1 addition & 2 deletions examples/kubernetes/Chart.yaml
@@ -3,10 +3,9 @@ name: optimum-habana-example-chart
description: This Helm chart deploys example jobs using Optimum for Intel® Gaudi® Accelerators to a Kubernetes cluster.

# Compatible Kubernetes versions
-kubeVersion: 1.27-1.29
+kubeVersion: v1.28.7

# This is the chart version. This version number should be incremented each time you make changes
# to the chart and its templates, including the app version.
# Versions are expected to follow Semantic Versioning (https://semver.org/)
version: 0.1.0

1 change: 1 addition & 0 deletions examples/kubernetes/README.md
@@ -136,6 +136,7 @@ Validated use cases can be found in the `ci` directory:
| [`ci/multi-card-glue-values.yaml`](ci/multi-card-glue-values.yaml) | 2 | Uses 2 HPUs from a single node with the [`gaudi_spawn.py`](../gaudi_spawn.py) script to [fine tune BERT large](../text-classification/README.md#multi-card-training) (with whole word masking) on the text classification MRPC task using `run_glue.py`.
| [`ci/single-card-lora-clm-values.yaml`](ci/single-card-lora-clm-values.yaml) | 1 | Uses a single card to [fine tune Llama1-7B](../language-modeling/README.md#peft) with LoRA using the `run_lora_clm.py` script.
| [`ci/multi-card-lora-clm-values.yaml`](ci/multi-card-lora-clm-values.yaml) | 8 | Uses 8 HPUs from a single node with the [`gaudi_spawn.py`](../gaudi_spawn.py) script to [fine tune Llama1-7B](../language-modeling/README.md#peft) with LoRA using the `run_lora_clm.py` script.
| [`ci/multi-node-multi-card-lora-clm-values.yaml`](ci/multi-node-multi-card-lora-clm-values.yaml) | 2 | Uses 1 HPU each from two nodes with the [`gaudi_spawn.py`](../gaudi_spawn.py) script to [fine tune Llama1-7B](../language-modeling/README.md#peft) with LoRA using the `run_lora_clm.py` script.
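
For orientation, here is a minimal sketch of how the new values file might be passed to this chart with `helm install` (the release name, namespace, and image settings are placeholders, not part of this change; the deploy section below is the authoritative reference):

```bash
# Illustrative only: release name, namespace, and image settings are placeholders.
helm install optimum-habana-examples . \
  -f ci/multi-node-multi-card-lora-clm-values.yaml \
  --set image.repository=<gaudi docker image repository> \
  --set image.tag=<image tag> \
  -n <namespace>
```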

### Deploy job to the cluster

142 changes: 142 additions & 0 deletions examples/kubernetes/ci/multi-node-multi-card-lora-clm-values.yaml
@@ -0,0 +1,142 @@
# Default values for examples.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

image:
  # -- Determines when the kubelet will pull the image to the worker nodes. Choose from: `IfNotPresent`, `Always`, or `Never`. If updates to the image have been made, use `Always` to ensure the newest image is used.
  pullPolicy: Always
  cleanPodPolicy: Running
  # -- Repository and name of the docker image
  repository:
  # -- Tag of the docker image
  tag:

imagePullSecrets: []

# # -- Pod [annotations](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations/) to attach metadata to the job
podAnnotations: {}

# # -- Specify a pod security context to run as a non-root user
# podSecurityContext:
#   fsGroup: 1000

# securityContext:
#   # -- Run as privileged or unprivileged. Certain deployments may require running as privileged, check with your system admin.
#   privileged: false

# -- The default 64MB of shared memory for docker containers can be insufficient when using more than one HPU. Setting hostIPC: true allows reusing the host's shared memory space inside the container.
hostIPC: true

# -- Define a config map's data as container environment variables
envFrom: []

# -- Define environment variables to set in the container
env:
  - name: LOGLEVEL
    value: INFO

secret:
  # -- Hugging Face token encoded using base64.
  encodedToken:
  # -- If a token is provided, specify a mount path that will be used to set HF_TOKEN_PATH
  secretMountPath: /tmp/hf_token
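  # NOTE (editorial sketch, not part of this values file): `encodedToken` expects the base64
  # encoding of a Hugging Face access token. Assuming the token is exported in an HF_TOKEN
  # environment variable, it could be generated with a one-liner such as:
  #   echo -n "$HF_TOKEN" | base64 -w 0
  # and the single-line output pasted as the value of `encodedToken` above.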

storage:
  # -- Name of the storage class to use for the persistent volume claim. To list the available storage classes use: `kubectl get storageclass`.
  storageClassName: nfs-client
  # -- [Access modes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#access-modes) for the persistent volume.
  accessModes:
    - "ReadWriteMany"
  # -- Storage [resources](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#resources)
  resources:
    requests:
      storage: 30Gi
  # -- Location where the PVC will be mounted in the pods
  pvcMountPath: &pvcMountPath /tmp/pvc-mount
  # -- A data access pod will be deployed when set to true
  deployDataAccessPod: false

resources:
  limits:
    # -- Specify the number of Gaudi card(s)
    cpu: 16
    habana.ai/gaudi: 2

Review comment:
Has this been tested and validated to run on < 8 cards on multiple nodes?

@sramakintel (Author), Jan 24, 2025:
@ltran5991 it has been tested on 2 nodes with one card each

Review comment:
How bout 2 nodes with 2 cards each?

Contributor:
> How bout 2 nodes with 2 cards each?

@sramakintel, Could you test with 2 node/2 cards and confirm the code works. Thanks.

    # -- Specify [Memory limits](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory) requests for the job
    memory: 64Gi
    # -- Specify hugepages-2Mi requests for the job
    hugepages-2Mi: 4400Mi
  requests:
    # -- Specify the number of Gaudi card(s)
    cpu: 16
    habana.ai/gaudi: 2
    # -- Specify [Memory resource](https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/#meaning-of-memory) requests for the job
    memory: 64Gi
    # -- Specify hugepages-2Mi requests for the job
    hugepages-2Mi: 4400Mi


# -- Number of Gaudi nodes to be used
numNodes: 2
# -- Number of Gaudi cards to be used per one node
numCards: 1
# -- Number of slots per worker
slotsPerWorker: 1
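# NOTE (editorial sketch, not part of this values file): the MPIJob template computes the total
# number of worker processes as N_CARDS = NUM_NODES * CARDS_PER_NODE, i.e. numNodes * numCards.
# With the values above that is 2 * 1 = 2 processes (one card on each of two nodes); the
# 2-node/2-card case raised in the review thread would be numNodes: 2 and numCards: 2
# (with matching habana.ai/gaudi resource requests), giving 4 processes.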


# Define the command to run in the container
command:
# python command to supply mpi run commands:
- python
- /optimum-habana/examples/language-modeling/run_lora_clm.py
- --model_name_or_path
- huggyllama/llama-7b
- --dataset_name
- tatsu-lab/alpaca
- --bf16
- --output_dir
- *pvcMountPath
- --num_train_epochs
- "3"
- --per_device_train_batch_size
- "12"
- --evaluation_strategy
- "no"
- --save_strategy
- "no"
- --learning_rate
- "1e-4"
- --warmup_ratio
- "0.03"
- --lr_scheduler_type
- "constant"
- --max_grad_norm
- "0.3"
- --logging_steps
- "1"
- --do_train
- --do_eval
- --use_habana
- --use_lazy_mode
- --throughput_warmup_steps
- "3"
- --lora_rank
- "8"
- --lora_alpha=16
- --lora_dropout=0.05
- --lora_target_modules
- "q_proj"
- "v_proj"
- --dataset_concatenation
- --max_seq_length=512
- --low_cpu_mem_usage=True
- --validation_split_percentage=4
- --adam_epsilon=1e-08
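# NOTE (editorial sketch, not part of this values file): the MPIJob template appends this list to
# its final mpirun invocation via `{{ .Values.command | join " " }}`, so the launcher effectively runs
#   mpirun <mpi options> python /optimum-habana/examples/language-modeling/run_lora_clm.py --model_name_or_path huggyllama/llama-7b ... --adam_epsilon=1e-08
# with all entries joined by single spaces.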

# # -- Optionally specify a [node selector](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector) with labels the determine which node your worker pod will land on
nodeSelector: {}

# # -- Optionally specify [tolerations](https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration/) to allow the worker pod to land on a node with a taint.
tolerations: []

# # -- Optionally provide node [affinities](https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity) to constrain which node your worker pod will be scheduled on
affinity: {}
91 changes: 91 additions & 0 deletions examples/kubernetes/templates/mpijob-helm.yaml
@@ -0,0 +1,91 @@
{{- if and .Values.numNodes (gt (int .Values.numNodes) 1) }}
apiVersion: kubeflow.org/v2beta1
kind: MPIJob
metadata:
  name: {{ .Release.Name }}-mpijob
spec:
  slotsPerWorker: {{ .Values.slotsPerWorker }}
  runPolicy:
    cleanPodPolicy: {{ .Values.image.cleanPodPolicy }}
  mpiReplicaSpecs:
    Launcher:
      replicas: 1
      template:
        spec:
          hostIPC: {{ .Values.hostIPC }}
          containers:
            - name: {{ .Release.Name }}-mpijob-container
              image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
              imagePullPolicy: {{ .Values.image.pullPolicy }}
              command: ["/bin/bash", "-c"]
              args:
                - >-
                  /usr/bin/ssh-keygen -A;
                  /usr/sbin/sshd;
                  HOSTSFILE=$OMPI_MCA_orte_default_hostfile;
                  MASTER_ADDR="$(head -n 1 $HOSTSFILE | sed -n s/[[:space:]]slots.*//p)";
                  echo $MASTER_ADDR;
                  NUM_NODES=$(wc -l < $HOSTSFILE);
                  CARDS_PER_NODE={{ .Values.numCards }};
                  N_CARDS=$((NUM_NODES*CARDS_PER_NODE));

                  SETUP_CMD="git clone --single-branch --branch v1.15.0 https://github.com/huggingface/optimum-habana.git; \
                    pip install -r optimum-habana/examples/language-modeling/requirements.txt";

                  eval $SETUP_CMD;

                  mpirun --npernode 1 \
                    --tag-output \
                    --allow-run-as-root \
                    --prefix $MPI_ROOT \
                    -mca routed direct \
                    git clone --single-branch --branch v1.15.0 https://github.com/huggingface/optimum-habana.git;

                  mpirun --npernode 1 \
                    --tag-output \
                    --allow-run-as-root \
                    --prefix $MPI_ROOT \
                    -mca routed direct \
                    pip install -r optimum-habana/examples/language-modeling/requirements.txt;

                  MODEL_PATH=/optimum-habana/examples/language-modeling;
                  cd $MODEL_PATH;
                  mpirun -np $N_CARDS --npernode $CARDS_PER_NODE \
                    --allow-run-as-root \
                    --bind-to core \
                    --map-by ppr:$CARDS_PER_NODE:node:PE=6 \
                    -rank-by core --report-bindings \
                    --tag-output \
                    --merge-stderr-to-stdout --prefix $MPI_ROOT \
                    -x MASTER_ADDR=$MASTER_ADDR \
                    -mca btl_tcp_if_include eth0 \
                    -mca oob_tcp_if_include eth0 \
                    -mca plm_rsh_no_tree_spawn 1 \
                    {{ .Values.command | join " " }};
              resources:
                limits:
                  cpu: 16
                  memory: 64Gi
                  hugepages-2Mi: 4400Mi
                requests:
                  cpu: 16
                  memory: 64Gi
                  hugepages-2Mi: 4400Mi
    Worker:
      replicas: {{ .Values.numNodes }}
      template:
        spec:
          hostIPC: {{ .Values.hostIPC }}
          containers:
            - name: {{ .Release.Name }}-mpijob-container
              image: "{{ .Values.image.repository }}:{{ .Values.image.tag }}"
              imagePullPolicy: {{ .Values.image.pullPolicy }}
              command: ["/bin/bash", "-c"]
              args:
                - >-
                  /usr/bin/ssh-keygen -A;
                  /usr/sbin/sshd;
                  sleep 365d;
              resources:
                {{- toYaml .Values.resources | nindent 16 }}
{{- end }}
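
As a rough usage sketch (assuming the Kubeflow MPI Operator providing the `kubeflow.org/v2beta1` `MPIJob` CRD is installed, and that the chart was deployed into a placeholder namespace), the resulting job and its pods could be inspected with standard kubectl commands:

```bash
# Illustrative only: the namespace and pod name are placeholders.
kubectl get mpijobs -n <namespace>                    # requires the MPI Operator CRD to be installed
kubectl get pods -n <namespace>                       # one launcher pod plus one pod per worker replica
kubectl logs -f <launcher-pod-name> -n <namespace>    # training output from the mpirun launcher
```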