Add provision for kfto core test to utilise gpu
abhijeet-dhumal authored and openshift-merge-bot[bot] committed Sep 24, 2024
1 parent e25788e commit 5e94d5e
Showing 2 changed files with 26 additions and 16 deletions.
tests/kfto/core/config_qlora.json (4 additions, 7 deletions)

@@ -16,12 +16,9 @@
     "dataset_text_field": "output",
     "use_flash_attn": false,
     "peft_method": "lora",
-    "r": 8,
-    "lora_dropout": 0.05,
-    "lora_alpha": 16,
-    "target_modules": ["all-linear"],
-    "use_4bit": true,
-    "bnb_4bit_compute_dtype": "float16",
-    "bnb_4bit_quant_type": "nf4",
+    "quantized_lora_config": {
+        "auto_gptq": ["triton_v2"]
+    },
+    "torch_dtype": "float16",
     "fp16": true
 }
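
Note: the QLoRA config drops the inline LoRA hyperparameters (r, lora_alpha, lora_dropout, target_modules) and the bitsandbytes-style 4-bit flags (use_4bit, bnb_4bit_compute_dtype, bnb_4bit_quant_type), selecting the auto_gptq triton_v2 kernel via quantized_lora_config instead, while keeping float16 compute through torch_dtype and fp16. Below is a minimal sketch of how the reshaped file could be sanity-checked; it is not part of this commit, and the qloraConfig struct is an assumption mirroring only the fields shown above.

package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
)

// qloraConfig mirrors only the fields touched by this commit (assumed shape).
type qloraConfig struct {
	PeftMethod          string              `json:"peft_method"`
	QuantizedLoraConfig map[string][]string `json:"quantized_lora_config"`
	TorchDtype          string              `json:"torch_dtype"`
	FP16                bool                `json:"fp16"`
}

func main() {
	raw, err := os.ReadFile("tests/kfto/core/config_qlora.json")
	if err != nil {
		log.Fatal(err)
	}
	var cfg qloraConfig
	if err := json.Unmarshal(raw, &cfg); err != nil {
		log.Fatal(err)
	}
	// Expect the triton_v2 GPTQ kernel to be selected for the "lora" PEFT method.
	fmt.Printf("peft=%s auto_gptq=%v dtype=%s fp16=%v\n",
		cfg.PeftMethod, cfg.QuantizedLoraConfig["auto_gptq"], cfg.TorchDtype, cfg.FP16)
}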
tests/kfto/core/kfto_kueue_sft_test.go (22 additions, 9 deletions)

@@ -17,6 +17,7 @@ limitations under the License.
 package core

 import (
+	"fmt"
 	"testing"

 	. "github.com/onsi/gomega"
@@ -31,17 +32,17 @@ import (
 )

 func TestPytorchjobWithSFTtrainerFinetuning(t *testing.T) {
-	runPytorchjobWithSFTtrainer(t, "config.json")
+	runPytorchjobWithSFTtrainer(t, "config.json", 0)
 }

 func TestPytorchjobWithSFTtrainerLoRa(t *testing.T) {
-	runPytorchjobWithSFTtrainer(t, "config_lora.json")
+	runPytorchjobWithSFTtrainer(t, "config_lora.json", 0)
 }
 func TestPytorchjobWithSFTtrainerQLoRa(t *testing.T) {
-	runPytorchjobWithSFTtrainer(t, "config_qlora.json")
+	runPytorchjobWithSFTtrainer(t, "config_qlora.json", 1)
 }

-func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
+func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string, numGpus int) {
 	test := With(t)

 	// Create a namespace
@@ -61,7 +62,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
 			NamespaceSelector: &metav1.LabelSelector{},
 			ResourceGroups: []kueuev1beta1.ResourceGroup{
 				{
-					CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory")},
+					CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
 					Flavors: []kueuev1beta1.FlavorQuotas{
 						{
 							Name: kueuev1beta1.ResourceFlavorReference(resourceFlavor.Name),
@@ -74,6 +75,10 @@
 								{
 									Name:         corev1.ResourceMemory,
 									NominalQuota: resource.MustParse("12Gi"),
 								},
+								{
+									Name:         corev1.ResourceName("nvidia.com/gpu"),
+									NominalQuota: resource.MustParse(fmt.Sprint(numGpus)),
+								},
 							},
 						},
 					},
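
Note: fmt.Sprint(numGpus) renders the int as a decimal string for resource.MustParse, so the CPU-only tests (numGpus = 0) get a zero nvidia.com/gpu quota and the QLoRA test gets one. A minimal equivalent sketch that skips the string round-trip follows; the gpuQuantity helper is invented here for illustration and is not part of the commit.

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

// gpuQuantity builds the same decimal quantity directly from the count.
func gpuQuantity(numGpus int) resource.Quantity {
	return *resource.NewQuantity(int64(numGpus), resource.DecimalSI)
}

func main() {
	parsed := resource.MustParse(fmt.Sprint(1)) // the pattern used in the diff
	direct := gpuQuantity(1)
	fmt.Println(parsed.Cmp(direct) == 0) // true: both represent "1"
}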
@@ -85,7 +90,7 @@ func runPytorchjobWithSFTtrainer(t *testing.T, modelConfigFile string) {
 	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)

 	// Create training PyTorch job
-	tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
+	tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config, numGpus)

 	// Make sure the Kueue Workload is admitted
 	test.Eventually(KueueWorkloads(test, namespace.Name), TestTimeoutLong).
@@ -149,14 +154,14 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
 	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name, AsDefaultQueue)

 	// Create first training PyTorch job
-	tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
+	tuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config, 0)

 	// Make sure the PyTorch job is running
 	test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).
 		Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))

 	// Create second training PyTorch job
-	secondTuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config)
+	secondTuningJob := createPyTorchJob(test, namespace.Name, localQueue.Name, *config, 0)

 	// Make sure the second PyTorch job is suspended, waiting for first job to finish
 	test.Eventually(PytorchJob(test, namespace.Name, secondTuningJob.Name), TestTimeoutShort).
@@ -175,7 +180,7 @@ func TestPytorchjobUsingKueueQuota(t *testing.T) {
 	test.T().Logf("PytorchJob %s/%s ran successfully", secondTuningJob.Namespace, secondTuningJob.Name)
 }

-func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
+func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap, numGpus int) *kftov1.PyTorchJob {
 	tuningJob := &kftov1.PyTorchJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: corev1.SchemeGroupVersion.String(),
@@ -194,6 +199,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap, numGpus int) *kftov1.PyTorchJob {
 				RestartPolicy: "OnFailure",
 				Template: corev1.PodTemplateSpec{
 					Spec: corev1.PodSpec{
+						Tolerations: []corev1.Toleration{
+							{
+								Key:      "nvidia.com/gpu",
+								Operator: corev1.TolerationOpExists,
+							},
+						},
 						InitContainers: []corev1.Container{
 							{
 								Name: "copy-model",
@@ -238,10 +249,12 @@ func createPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap, numGpus int) *kftov1.PyTorchJob {
 									Requests: corev1.ResourceList{
 										corev1.ResourceCPU:    resource.MustParse("2"),
 										corev1.ResourceMemory: resource.MustParse("7Gi"),
+										"nvidia.com/gpu":      resource.MustParse(fmt.Sprint(numGpus)),
 									},
 									Limits: corev1.ResourceList{
 										corev1.ResourceCPU:    resource.MustParse("2"),
 										corev1.ResourceMemory: resource.MustParse("7Gi"),
+										"nvidia.com/gpu":      resource.MustParse(fmt.Sprint(numGpus)),
 									},
 								},
 								SecurityContext: &corev1.SecurityContext{
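
Note: the same fmt.Sprint(numGpus) pattern sets the per-container GPU request and limit, so the CPU-only jobs render "nvidia.com/gpu": "0" (effectively no GPU requirement) while the QLoRA job requests one device; keeping the request equal to the limit also matches the Kubernetes rule that extended resources cannot be overcommitted.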
