From 095c68ec2e6f5ce5cc947ca002d9f6fe5e3b99da Mon Sep 17 00:00:00 2001
From: Karel Suta
Date: Tue, 25 Jun 2024 10:29:02 +0200
Subject: [PATCH] Add multiGPU finetuning tests for granite, llama and mixtral
 models

---
 go.mod                                        |   6 +-
 go.sum                                        |   4 +-
 ... => config_granite_20b_code_instruct.json} |  13 +-
 tests/kfto/core/config_llama2_13b.json        |  19 ++
 .../config_meta_llama3_70b_instruct_lora.json |  20 ++
 .../core/config_meta_llama3_8b_instruct.json  |  19 ++
 .../config_mixtral_8x7b_instruct_v01.json     |  20 ++
 tests/kfto/core/environment.go                |  20 ++
 tests/kfto/core/kfto_kueue_sft_GPU_test.go    | 193 +++++++++---------
 9 files changed, 210 insertions(+), 104 deletions(-)
 rename tests/kfto/core/{config_GPU.json => config_granite_20b_code_instruct.json} (56%)
 create mode 100644 tests/kfto/core/config_llama2_13b.json
 create mode 100644 tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
 create mode 100644 tests/kfto/core/config_meta_llama3_8b_instruct.json
 create mode 100644 tests/kfto/core/config_mixtral_8x7b_instruct_v01.json

diff --git a/go.mod b/go.mod
index 502dcda4..46d37da6 100644
--- a/go.mod
+++ b/go.mod
@@ -7,9 +7,11 @@ toolchain go1.21.5
 require (
 	github.com/kubeflow/training-operator v1.7.0
 	github.com/onsi/gomega v1.31.1
-	github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7
+	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec
+	github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22
 	github.com/prometheus/client_golang v1.18.0
 	github.com/prometheus/common v0.45.0
+	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
 	k8s.io/api v0.29.2
 	k8s.io/apimachinery v0.29.2
 	sigs.k8s.io/kueue v0.6.2
@@ -51,13 +53,11 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
 	github.com/openshift-online/ocm-sdk-go v0.1.368 // indirect
-	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec // indirect
 	github.com/openshift/client-go v0.0.0-20230718165156-6014fb98e86a // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/project-codeflare/appwrapper v0.8.0 // indirect
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
-	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0 // indirect
 	github.com/sirupsen/logrus v1.9.3 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
diff --git a/go.sum b/go.sum
index efa30c73..5aa12996 100644
--- a/go.sum
+++ b/go.sum
@@ -363,8 +363,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
 github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7 h1:XTK5l2FRO3BbSk4Qn9xAwsRFTJ4IeGljymQWcfYLlMI=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22 h1:gjbp5kz/azRGmRBJBS6ZmoW2PHGsvYj2Mi0Dre/x5KI=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
 github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
 github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
 github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
diff --git a/tests/kfto/core/config_GPU.json b/tests/kfto/core/config_granite_20b_code_instruct.json
similarity index 56%
rename from tests/kfto/core/config_GPU.json
rename to tests/kfto/core/config_granite_20b_code_instruct.json
index 647fb166..23d65806 100644
--- a/tests/kfto/core/config_GPU.json
+++ b/tests/kfto/core/config_granite_20b_code_instruct.json
@@ -1,9 +1,9 @@
 {
-    "model_name_or_path": "/tmp/model/bloom-560m",
-    "training_data_path": "/tmp/dataset/alpaca_data.json",
-    "output_dir": "/tmp/out",
+    "model_name_or_path": "ibm-granite/granite-20b-code-instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
     "num_train_epochs": 1.0,
-    "per_device_train_batch_size": 4,
+    "per_device_train_batch_size": 1,
     "per_device_eval_batch_size": 4,
     "gradient_accumulation_steps": 4,
     "evaluation_strategy": "no",
@@ -11,12 +11,9 @@
     "learning_rate": 1e-5,
     "weight_decay": 0.0,
     "lr_scheduler_type": "cosine",
-    "logging_steps": 1.0,
-    "packing": false,
     "include_tokens_per_second": true,
     "response_template": "\n### Response:",
     "dataset_text_field": "output",
     "use_flash_attn": false,
-    "torch_dtype": "float32",
-    "tokenizer_name_or_path": "/tmp/model/bloom-560m"
+    "tokenizer_name_or_path": "ibm-granite/granite-20b-code-instruct"
 }
\ No newline at end of file
diff --git a/tests/kfto/core/config_llama2_13b.json b/tests/kfto/core/config_llama2_13b.json
new file mode 100644
index 00000000..0a480d67
--- /dev/null
+++ b/tests/kfto/core/config_llama2_13b.json
@@ -0,0 +1,19 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "meta-llama/Llama-2-13b-chat-hf"
+}
\ No newline at end of file
diff --git a/tests/kfto/core/config_meta_llama3_70b_instruct_lora.json b/tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
new file mode 100644
index 00000000..a553343c
--- /dev/null
+++ b/tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
@@ -0,0 +1,20 @@
+{
+    "model_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+    "peft_method": "lora"
+}
\ No newline at end of file
diff --git a/tests/kfto/core/config_meta_llama3_8b_instruct.json b/tests/kfto/core/config_meta_llama3_8b_instruct.json
new file mode 100644
index 00000000..e83dde83
--- /dev/null
+++ b/tests/kfto/core/config_meta_llama3_8b_instruct.json
@@ -0,0 +1,19 @@
+{
+    "model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct"
+}
\ No newline at end of file
diff --git a/tests/kfto/core/config_mixtral_8x7b_instruct_v01.json b/tests/kfto/core/config_mixtral_8x7b_instruct_v01.json
new file mode 100644
index 00000000..33e241e9
--- /dev/null
+++ b/tests/kfto/core/config_mixtral_8x7b_instruct_v01.json
@@ -0,0 +1,20 @@
+{
+    "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1"
+}
+
diff --git a/tests/kfto/core/environment.go b/tests/kfto/core/environment.go
index 7eb42b1b..8f894ee2 100644
--- a/tests/kfto/core/environment.go
+++ b/tests/kfto/core/environment.go
@@ -29,6 +29,10 @@ const (
 	bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
 	// The environment variable referring to image containing Stanford Alpaca dataset
 	alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
+	// The environment variable containing the HuggingFace token used to download models that require authentication
+	huggingfaceTokenEnvVar = "HF_TOKEN"
+	// The environment variable specifying the existing namespace to be used for multiGPU tests
+	multiGpuNamespaceEnvVar = "MULTIGPU_NAMESPACE"
 )
 
 func GetFmsHfTuningImage(t Test) string {
@@ -47,6 +51,22 @@ func GetAlpacaDatasetImage() string {
 	return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:c0492ff0005c13ac491e00d074902aa9dd21a49691945b122da23db3a3b3ac76")
 }
 
+func GetHuggingFaceToken(t Test) string {
+	token, ok := os.LookupEnv(huggingfaceTokenEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify the HuggingFace token used to download models.", huggingfaceTokenEnvVar)
+	}
+	return token
+}
+
+func GetMultiGpuNamespace(t Test) string {
+	namespace, ok := os.LookupEnv(multiGpuNamespaceEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify the namespace to be used for multiGPU tests.", multiGpuNamespaceEnvVar)
+	}
+	return namespace
+}
+
 func lookupEnvOrDefault(key, value string) string {
 	if v, ok := os.LookupEnv(key); ok {
 		return v
diff --git a/tests/kfto/core/kfto_kueue_sft_GPU_test.go b/tests/kfto/core/kfto_kueue_sft_GPU_test.go
index a9281f49..067a0159 100644
--- a/tests/kfto/core/kfto_kueue_sft_GPU_test.go
+++ b/tests/kfto/core/kfto_kueue_sft_GPU_test.go
@@ -17,12 +17,12 @@ limitations under the License.
 package core
 
 import (
+	"fmt"
 	"testing"
 	"time"
 
 	. "github.com/onsi/gomega"
 	. "github.com/project-codeflare/codeflare-common/support"
-	kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -33,92 +33,69 @@ import (
 	prometheusmodel "github.com/prometheus/common/model"
 )
 
-func TestMultiGpuPytorchjobWithSFTtrainer(t *testing.T) {
+var numberOfGpus = 8
+
+func TestMultiGpuPytorchjobGranite20bCodeInstruct(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_granite_20b_code_instruct.json")
+}
+
+func TestMultiGpuPytorchjobLlama213b(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_llama2_13b.json")
+}
+
+func TestMultiGpuPytorchjobMetaLlama38bInstruct(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_meta_llama3_8b_instruct.json")
+}
+
+func TestMultiGpuPytorchjobMixtral8x7bInstructv01(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01.json")
+}
+
+func TestMultiGpuPytorchjobMetaLlama370bInstructLoRa(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_lora.json")
+}
+
+func runMultiGpuPytorchjob(t *testing.T, modelConfigFile string) {
 	test := With(t)
 
-	// Create a namespace
-	namespace := test.NewTestNamespace()
+	namespace := GetMultiGpuNamespace(test)
 
 	// Create a ConfigMap with configuration
 	configData := map[string][]byte{
-		"config.json": ReadFile(test, "config_GPU.json"),
-	}
-	config := CreateConfigMap(test, namespace.Name, configData)
-
-	// Create Kueue resources utilizing GPU
-	rfSpec := kueuev1beta1.ResourceFlavorSpec{
-		NodeLabels: map[string]string{"nvidia.com/gpu.present": "true"},
-	}
-	resourceFlavor := CreateKueueResourceFlavor(test, rfSpec)
-	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
-	cqSpec := kueuev1beta1.ClusterQueueSpec{
-		NamespaceSelector: &metav1.LabelSelector{},
-		ResourceGroups: []kueuev1beta1.ResourceGroup{
-			{
-				CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
-				Flavors: []kueuev1beta1.FlavorQuotas{
-					{
-						Name: kueuev1beta1.ResourceFlavorReference(resourceFlavor.Name),
-						Resources: []kueuev1beta1.ResourceQuota{
-							{
-								Name:         corev1.ResourceCPU,
-								NominalQuota: resource.MustParse("2"),
-							},
-							{
-								Name:         corev1.ResourceMemory,
-								NominalQuota: resource.MustParse("5Gi"),
-							},
-							{
-								Name:         corev1.ResourceName("nvidia.com/gpu"),
-								NominalQuota: resource.MustParse("2"),
-							},
-						},
-					},
-				},
-			},
-		},
+		"config.json": ReadFile(test, modelConfigFile),
 	}
-	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
-	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
-	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
+	config := CreateConfigMap(test, namespace, configData)
+	defer test.Client().Core().CoreV1().ConfigMaps(namespace).Delete(test.Ctx(), config.Name, *metav1.NewDeleteOptions(0))
 
 	// Create training PyTorch job
-	tuningJob := createAlpacaPyTorchJob(test, namespace.Name, localQueue.Name, *config)
-
-	// Make sure the Kueue Workload is admitted
-	test.Eventually(KueueWorkloads(test, namespace.Name), TestTimeoutLong).
-		Should(
-			And(
-				HaveLen(1),
-				ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
-			),
-		)
+	tuningJob := createAlpacaPyTorchJob(test, namespace, *config)
+	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
 
 	// Make sure the PyTorch job is running
-	test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).
+	test.Eventually(PytorchJob(test, namespace, tuningJob.Name), TestTimeoutLong).
 		Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
 	if IsOpenShift(test) {
-		// Check that both GPUs were utilized recently
+		// Check that GPUs were utilized recently
 		// That itself doesn't guarantee that PyTorchJob generated the load in GPU, but is the best we can achieve for now
-		test.Eventually(openShiftPrometheusGpuUtil(test), TestTimeoutMedium).
+		test.Eventually(openShiftPrometheusGpuUtil(test, namespace), 30*time.Minute).
 			Should(
 				And(
-					HaveLen(2),
-					HaveEach(
-						// Check that both GPUs were utilized on more than 90%
-						HaveField("Value", BeNumerically(">", 90)),
+					HaveLen(numberOfGpus),
+					ContainElement(
+						// Check that at least some GPU was utilized at more than 50%
+						HaveField("Value", BeNumerically(">", 50)),
 					),
 				),
 			)
 	}
 
 	// Make sure the PyTorch job succeed
-	test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PytorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.Eventually(PytorchJob(test, namespace, tuningJob.Name), 30*time.Minute).Should(WithTransform(PytorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 }
 
-func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
+func createAlpacaPyTorchJob(test Test, namespace string, config corev1.ConfigMap) *kftov1.PyTorchJob {
 	tuningJob := &kftov1.PyTorchJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: corev1.SchemeGroupVersion.String(),
@@ -126,9 +103,6 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: "kfto-sft-",
-			Labels: map[string]string{
-				"kueue.x-k8s.io/queue-name": localQueueName,
-			},
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
@@ -144,31 +118,18 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 						},
 					},
 					InitContainers: []corev1.Container{
-						{
-							Name:            "copy-model",
-							Image:           GetBloomModelImage(),
-							ImagePullPolicy: corev1.PullIfNotPresent,
-							VolumeMounts: []corev1.VolumeMount{
-								{
-									Name:      "tmp-volume",
-									MountPath: "/tmp",
-								},
-							},
-							Command: []string{"/bin/sh", "-c"},
-							Args:    []string{"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model"},
-						},
 						{
 							Name:            "copy-dataset",
 							Image:           GetAlpacaDatasetImage(),
 							ImagePullPolicy: corev1.PullIfNotPresent,
 							VolumeMounts: []corev1.VolumeMount{
 								{
-									Name:      "tmp-volume",
-									MountPath: "/tmp",
+									Name:      "scratch-volume",
+									MountPath: "/mnt/scratch",
 								},
 							},
 							Command: []string{"/bin/sh", "-c"},
-							Args:    []string{"mkdir /tmp/dataset; cp /dataset/alpaca_data_tenth.json /tmp/dataset/alpaca_data.json"},
+							Args:    []string{"mkdir /mnt/scratch/dataset; cp /dataset/alpaca_data_hundredth.json /mnt/scratch/dataset/alpaca_data.json"},
 						},
 					},
 					Containers: []corev1.Container{
@@ -183,7 +144,15 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 								},
 								{
 									Name:  "HF_HOME",
-									Value: "/tmp/huggingface",
+									Value: "/mnt/scratch/huggingface-home",
+								},
+								{
+									Name:  "HF_TOKEN",
+									Value: GetHuggingFaceToken(test),
+								},
+								{
+									Name:  "TMPDIR",
+									Value: "/mnt/scratch",
 								},
 							},
 							VolumeMounts: []corev1.VolumeMount{
@@ -192,18 +161,22 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 								{
 									Name:      "config-volume",
 									MountPath: "/etc/config",
 								},
 								{
-									Name:      "tmp-volume",
-									MountPath: "/tmp",
+									Name:      "scratch-volume",
+									MountPath: "/mnt/scratch",
+								},
+								{
+									Name:      "output-volume",
+									MountPath: "/mnt/output",
 								},
 							},
 							Resources: corev1.ResourceRequirements{
 								Requests: corev1.ResourceList{
 									corev1.ResourceCPU:    resource.MustParse("2"),
-									corev1.ResourceMemory: resource.MustParse("5Gi"),
-									"nvidia.com/gpu":      resource.MustParse("2"),
+									corev1.ResourceMemory: resource.MustParse("10Gi"),
+									"nvidia.com/gpu":      resource.MustParse(fmt.Sprint(numberOfGpus)),
 								},
 								Limits: corev1.ResourceList{
-									"nvidia.com/gpu": resource.MustParse("2"),
+									"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)),
 								},
 							},
 							SecurityContext: &corev1.SecurityContext{
@@ -224,9 +197,39 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 						},
 					},
 					{
-						Name: "tmp-volume",
+						Name: "scratch-volume",
+						VolumeSource: corev1.VolumeSource{
+							Ephemeral: &corev1.EphemeralVolumeSource{
+								VolumeClaimTemplate: &corev1.PersistentVolumeClaimTemplate{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
+										Resources: corev1.VolumeResourceRequirements{
+											Requests: corev1.ResourceList{
+												corev1.ResourceStorage: resource.MustParse("500Gi"),
+											},
+										},
+										VolumeMode: Ptr(corev1.PersistentVolumeFilesystem),
+									},
+								},
+							},
+						},
+					},
+					{
+						Name: "output-volume",
 						VolumeSource: corev1.VolumeSource{
-							EmptyDir: &corev1.EmptyDirVolumeSource{},
+							Ephemeral: &corev1.EphemeralVolumeSource{
+								VolumeClaimTemplate: &corev1.PersistentVolumeClaimTemplate{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
+										Resources: corev1.VolumeResourceRequirements{
+											Requests: corev1.ResourceList{
+												corev1.ResourceStorage: resource.MustParse("500Gi"),
+											},
+										},
+										VolumeMode: Ptr(corev1.PersistentVolumeFilesystem),
+									},
+								},
+							},
 						},
 					},
 				},
@@ -244,12 +247,20 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 	return tuningJob
 }
 
-func openShiftPrometheusGpuUtil(test Test) func(g Gomega) prometheusmodel.Vector {
+func openShiftPrometheusGpuUtil(test Test, namespace string) func(g Gomega) prometheusmodel.Vector {
 	return func(g Gomega) prometheusmodel.Vector {
 		prometheusApiClient := GetOpenShiftPrometheusApiClient(test)
 		result, warnings, err := prometheusApiClient.Query(test.Ctx(), "DCGM_FI_DEV_GPU_UTIL", time.Now(), prometheusapiv1.WithTimeout(5*time.Second))
 		g.Expect(err).NotTo(HaveOccurred())
 		g.Expect(warnings).Should(HaveLen(0))
-		return result.(prometheusmodel.Vector)
+
+		var util prometheusmodel.Vector
+		for _, sample := range result.(prometheusmodel.Vector) {
+			if string(sample.Metric["exported_namespace"]) == namespace {
+				util = append(util, sample)
+			}
+		}
+
+		return util
 	}
 }