From 095c68ec2e6f5ce5cc947ca002d9f6fe5e3b99da Mon Sep 17 00:00:00 2001
From: Karel Suta
Date: Tue, 25 Jun 2024 10:29:02 +0200
Subject: [PATCH] Add multiGPU finetuning tests for granite, llama and mixtral
 models

---
 go.mod                                        |   6 +-
 go.sum                                        |   4 +-
 ... => config_granite_20b_code_instruct.json} |  13 +-
 tests/kfto/core/config_llama2_13b.json        |  19 ++
 .../config_meta_llama3_70b_instruct_lora.json |  20 ++
 .../core/config_meta_llama3_8b_instruct.json  |  19 ++
 .../config_mixtral_8x7b_instruct_v01.json     |  20 ++
 tests/kfto/core/environment.go                |  20 ++
 tests/kfto/core/kfto_kueue_sft_GPU_test.go    | 193 +++++++++---------
 9 files changed, 210 insertions(+), 104 deletions(-)
 rename tests/kfto/core/{config_GPU.json => config_granite_20b_code_instruct.json} (56%)
 create mode 100644 tests/kfto/core/config_llama2_13b.json
 create mode 100644 tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
 create mode 100644 tests/kfto/core/config_meta_llama3_8b_instruct.json
 create mode 100644 tests/kfto/core/config_mixtral_8x7b_instruct_v01.json

diff --git a/go.mod b/go.mod
index 502dcda4..46d37da6 100644
--- a/go.mod
+++ b/go.mod
@@ -7,9 +7,11 @@ toolchain go1.21.5
 require (
 	github.com/kubeflow/training-operator v1.7.0
 	github.com/onsi/gomega v1.31.1
-	github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7
+	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec
+	github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22
 	github.com/prometheus/client_golang v1.18.0
 	github.com/prometheus/common v0.45.0
+	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
 	k8s.io/api v0.29.2
 	k8s.io/apimachinery v0.29.2
 	sigs.k8s.io/kueue v0.6.2
@@ -51,13 +53,11 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
 	github.com/openshift-online/ocm-sdk-go v0.1.368 // indirect
-	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec // indirect
 	github.com/openshift/client-go v0.0.0-20230718165156-6014fb98e86a // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/project-codeflare/appwrapper v0.8.0 // indirect
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
-	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0 // indirect
 	github.com/sirupsen/logrus v1.9.3 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
diff --git a/go.sum b/go.sum
index efa30c73..5aa12996 100644
--- a/go.sum
+++ b/go.sum
@@ -363,8 +363,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
 github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7 h1:XTK5l2FRO3BbSk4Qn9xAwsRFTJ4IeGljymQWcfYLlMI=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22 h1:gjbp5kz/azRGmRBJBS6ZmoW2PHGsvYj2Mi0Dre/x5KI=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
 github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
 github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
 github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
diff --git a/tests/kfto/core/config_GPU.json b/tests/kfto/core/config_granite_20b_code_instruct.json
similarity index 56%
rename from tests/kfto/core/config_GPU.json
rename to tests/kfto/core/config_granite_20b_code_instruct.json
index 647fb166..23d65806 100644
--- a/tests/kfto/core/config_GPU.json
+++ b/tests/kfto/core/config_granite_20b_code_instruct.json
@@ -1,9 +1,9 @@
 {
-    "model_name_or_path": "/tmp/model/bloom-560m",
-    "training_data_path": "/tmp/dataset/alpaca_data.json",
-    "output_dir": "/tmp/out",
+    "model_name_or_path": "ibm-granite/granite-20b-code-instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
     "num_train_epochs": 1.0,
-    "per_device_train_batch_size": 4,
+    "per_device_train_batch_size": 1,
     "per_device_eval_batch_size": 4,
     "gradient_accumulation_steps": 4,
     "evaluation_strategy": "no",
@@ -11,12 +11,9 @@
     "learning_rate": 1e-5,
     "weight_decay": 0.0,
     "lr_scheduler_type": "cosine",
-    "logging_steps": 1.0,
-    "packing": false,
     "include_tokens_per_second": true,
     "response_template": "\n### Response:",
     "dataset_text_field": "output",
     "use_flash_attn": false,
-    "torch_dtype": "float32",
-    "tokenizer_name_or_path": "/tmp/model/bloom-560m"
+    "tokenizer_name_or_path": "ibm-granite/granite-20b-code-instruct"
 }
\ No newline at end of file
diff --git a/tests/kfto/core/config_llama2_13b.json b/tests/kfto/core/config_llama2_13b.json
new file mode 100644
index 00000000..0a480d67
--- /dev/null
+++ b/tests/kfto/core/config_llama2_13b.json
@@ -0,0 +1,19 @@
+{
+    "model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "meta-llama/Llama-2-13b-chat-hf"
+}
\ No newline at end of file
diff --git a/tests/kfto/core/config_meta_llama3_70b_instruct_lora.json b/tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
new file mode 100644
index 00000000..a553343c
--- /dev/null
+++ b/tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
@@ -0,0 +1,20 @@
+{
+    "model_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+    "peft_method": "lora"
+}
\ No newline at end of file
diff --git a/tests/kfto/core/config_meta_llama3_8b_instruct.json b/tests/kfto/core/config_meta_llama3_8b_instruct.json
new file mode 100644
index 00000000..e83dde83
--- /dev/null
+++ b/tests/kfto/core/config_meta_llama3_8b_instruct.json
@@ -0,0 +1,19 @@
+{
+    "model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 4,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct"
+}
\ No newline at end of file
diff --git a/tests/kfto/core/config_mixtral_8x7b_instruct_v01.json b/tests/kfto/core/config_mixtral_8x7b_instruct_v01.json
new file mode 100644
index 00000000..33e241e9
--- /dev/null
+++ b/tests/kfto/core/config_mixtral_8x7b_instruct_v01.json
@@ -0,0 +1,20 @@
+{
+    "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
+    "num_train_epochs": 1.0,
+    "per_device_train_batch_size": 1,
+    "per_device_eval_batch_size": 4,
+    "gradient_accumulation_steps": 1,
+    "evaluation_strategy": "no",
+    "save_strategy": "epoch",
+    "learning_rate": 1e-5,
+    "weight_decay": 0.0,
+    "lr_scheduler_type": "cosine",
+    "include_tokens_per_second": true,
+    "response_template": "\n### Response:",
+    "dataset_text_field": "output",
+    "use_flash_attn": false,
+    "tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1"
+}
+
diff --git a/tests/kfto/core/environment.go b/tests/kfto/core/environment.go
index 7eb42b1b..8f894ee2 100644
--- a/tests/kfto/core/environment.go
+++ b/tests/kfto/core/environment.go
@@ -29,6 +29,10 @@ const (
 	bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
 	// The environment variable referring to image containing Stanford Alpaca dataset
 	alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
+	// The environment variable containing the HuggingFace token used to download models that require authentication
+	huggingfaceTokenEnvVar = "HF_TOKEN"
+	// The environment variable specifying the existing namespace to be used for multiGPU tests
+	multiGpuNamespaceEnvVar = "MULTIGPU_NAMESPACE"
 )
 
 func GetFmsHfTuningImage(t Test) string {
@@ -47,6 +51,22 @@ func GetAlpacaDatasetImage() string {
 	return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:c0492ff0005c13ac491e00d074902aa9dd21a49691945b122da23db3a3b3ac76")
 }
 
+func GetHuggingFaceToken(t Test) string {
+	token, ok := os.LookupEnv(huggingfaceTokenEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify the HuggingFace token used to download models.", huggingfaceTokenEnvVar)
+	}
+	return token
+}
+
+func GetMultiGpuNamespace(t Test) string {
+	namespace, ok := os.LookupEnv(multiGpuNamespaceEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify the namespace to be used for multiGPU tests.", multiGpuNamespaceEnvVar)
+	}
+	return namespace
+}
+
 func lookupEnvOrDefault(key, value string) string {
 	if v, ok := os.LookupEnv(key); ok {
 		return v
diff --git a/tests/kfto/core/kfto_kueue_sft_GPU_test.go b/tests/kfto/core/kfto_kueue_sft_GPU_test.go
index a9281f49..067a0159 100644
--- a/tests/kfto/core/kfto_kueue_sft_GPU_test.go
+++ b/tests/kfto/core/kfto_kueue_sft_GPU_test.go
@@ -17,12 +17,12 @@ limitations under the License.
 package core
 
 import (
+	"fmt"
 	"testing"
 	"time"
 
 	. "github.com/onsi/gomega"
 	. "github.com/project-codeflare/codeflare-common/support"
-	kueuev1beta1 "sigs.k8s.io/kueue/apis/kueue/v1beta1"
 
 	corev1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/resource"
@@ -33,92 +33,69 @@ import (
 	prometheusmodel "github.com/prometheus/common/model"
 )
 
-func TestMultiGpuPytorchjobWithSFTtrainer(t *testing.T) {
+var numberOfGpus = 8
+
+func TestMultiGpuPytorchjobGranite20bCodeInstruct(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_granite_20b_code_instruct.json")
+}
+
+func TestMultiGpuPytorchjobLlama213b(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_llama2_13b.json")
+}
+
+func TestMultiGpuPytorchjobMetaLlama38bInstruct(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_meta_llama3_8b_instruct.json")
+}
+
+func TestMultiGpuPytorchjobMixtral8x7bInstructv01(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_mixtral_8x7b_instruct_v01.json")
+}
+
+func TestMultiGpuPytorchjobMetaLlama370bInstructLoRa(t *testing.T) {
+	runMultiGpuPytorchjob(t, "config_meta_llama3_70b_instruct_lora.json")
+}
+
+func runMultiGpuPytorchjob(t *testing.T, modelConfigFile string) {
 	test := With(t)
 
-	// Create a namespace
-	namespace := test.NewTestNamespace()
+	namespace := GetMultiGpuNamespace(test)
 
 	// Create a ConfigMap with configuration
 	configData := map[string][]byte{
-		"config.json": ReadFile(test, "config_GPU.json"),
-	}
-	config := CreateConfigMap(test, namespace.Name, configData)
-
-	// Create Kueue resources utilizing GPU
-	rfSpec := kueuev1beta1.ResourceFlavorSpec{
-		NodeLabels: map[string]string{"nvidia.com/gpu.present": "true"},
-	}
-	resourceFlavor := CreateKueueResourceFlavor(test, rfSpec)
-	defer test.Client().Kueue().KueueV1beta1().ResourceFlavors().Delete(test.Ctx(), resourceFlavor.Name, metav1.DeleteOptions{})
-	cqSpec := kueuev1beta1.ClusterQueueSpec{
-		NamespaceSelector: &metav1.LabelSelector{},
-		ResourceGroups: []kueuev1beta1.ResourceGroup{
-			{
-				CoveredResources: []corev1.ResourceName{corev1.ResourceName("cpu"), corev1.ResourceName("memory"), corev1.ResourceName("nvidia.com/gpu")},
-				Flavors: []kueuev1beta1.FlavorQuotas{
-					{
-						Name: kueuev1beta1.ResourceFlavorReference(resourceFlavor.Name),
-						Resources: []kueuev1beta1.ResourceQuota{
-							{
-								Name:         corev1.ResourceCPU,
-								NominalQuota: resource.MustParse("2"),
-							},
-							{
-								Name:         corev1.ResourceMemory,
-								NominalQuota: resource.MustParse("5Gi"),
-							},
-							{
-								Name:         corev1.ResourceName("nvidia.com/gpu"),
-								NominalQuota: resource.MustParse("2"),
-							},
-						},
-					},
-				},
-			},
-		},
+		"config.json": ReadFile(test, modelConfigFile),
 	}
-	clusterQueue := CreateKueueClusterQueue(test, cqSpec)
-	defer test.Client().Kueue().KueueV1beta1().ClusterQueues().Delete(test.Ctx(), clusterQueue.Name, metav1.DeleteOptions{})
-	localQueue := CreateKueueLocalQueue(test, namespace.Name, clusterQueue.Name)
+	config := CreateConfigMap(test, namespace, configData)
+	defer test.Client().Core().CoreV1().ConfigMaps(namespace).Delete(test.Ctx(), config.Name, *metav1.NewDeleteOptions(0))
 
 	// Create training PyTorch job
-	tuningJob := createAlpacaPyTorchJob(test, namespace.Name, localQueue.Name, *config)
-
-	// Make sure the Kueue Workload is admitted
-	test.Eventually(KueueWorkloads(test, namespace.Name), TestTimeoutLong).
-		Should(
-			And(
-				HaveLen(1),
-				ContainElement(WithTransform(KueueWorkloadAdmitted, BeTrueBecause("Workload failed to be admitted"))),
-			),
-		)
+	tuningJob := createAlpacaPyTorchJob(test, namespace, *config)
+	defer test.Client().Kubeflow().KubeflowV1().PyTorchJobs(namespace).Delete(test.Ctx(), tuningJob.Name, *metav1.NewDeleteOptions(0))
 
 	// Make sure the PyTorch job is running
-	test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).
+	test.Eventually(PytorchJob(test, namespace, tuningJob.Name), TestTimeoutLong).
 		Should(WithTransform(PytorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
 	if IsOpenShift(test) {
-		// Check that both GPUs were utilized recently
+		// Check that GPUs were utilized recently
 		// That itself doesn't guarantee that PyTorchJob generated the load in GPU, but is the best we can achieve for now
-		test.Eventually(openShiftPrometheusGpuUtil(test), TestTimeoutMedium).
+		test.Eventually(openShiftPrometheusGpuUtil(test, namespace), 30*time.Minute).
 			Should(
 				And(
-					HaveLen(2),
-					HaveEach(
-						// Check that both GPUs were utilized on more than 90%
-						HaveField("Value", BeNumerically(">", 90)),
+					HaveLen(numberOfGpus),
+					ContainElement(
+						// Check that at least some GPU was utilized at more than 50%
+						HaveField("Value", BeNumerically(">", 50)),
 					),
 				),
 			)
 	}
 
 	// Make sure the PyTorch job succeed
-	test.Eventually(PytorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutLong).Should(WithTransform(PytorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
+	test.Eventually(PytorchJob(test, namespace, tuningJob.Name), 30*time.Minute).Should(WithTransform(PytorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
 }
 
-func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config corev1.ConfigMap) *kftov1.PyTorchJob {
+func createAlpacaPyTorchJob(test Test, namespace string, config corev1.ConfigMap) *kftov1.PyTorchJob {
 	tuningJob := &kftov1.PyTorchJob{
 		TypeMeta: metav1.TypeMeta{
 			APIVersion: corev1.SchemeGroupVersion.String(),
@@ -126,9 +103,6 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 		},
 		ObjectMeta: metav1.ObjectMeta{
 			GenerateName: "kfto-sft-",
-			Labels: map[string]string{
-				"kueue.x-k8s.io/queue-name": localQueueName,
-			},
 		},
 		Spec: kftov1.PyTorchJobSpec{
 			PyTorchReplicaSpecs: map[kftov1.ReplicaType]*kftov1.ReplicaSpec{
@@ -144,31 +118,18 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 						},
 					},
 					InitContainers: []corev1.Container{
-						{
-							Name:            "copy-model",
-							Image:           GetBloomModelImage(),
-							ImagePullPolicy: corev1.PullIfNotPresent,
-							VolumeMounts: []corev1.VolumeMount{
-								{
-									Name:      "tmp-volume",
-									MountPath: "/tmp",
-								},
-							},
-							Command: []string{"/bin/sh", "-c"},
-							Args:    []string{"mkdir /tmp/model; cp -r /models/bloom-560m /tmp/model"},
-						},
 						{
 							Name:            "copy-dataset",
 							Image:           GetAlpacaDatasetImage(),
 							ImagePullPolicy: corev1.PullIfNotPresent,
 							VolumeMounts: []corev1.VolumeMount{
 								{
-									Name:      "tmp-volume",
-									MountPath: "/tmp",
+									Name:      "scratch-volume",
+									MountPath: "/mnt/scratch",
 								},
 							},
 							Command: []string{"/bin/sh", "-c"},
-							Args:    []string{"mkdir /tmp/dataset; cp /dataset/alpaca_data_tenth.json /tmp/dataset/alpaca_data.json"},
+							Args:    []string{"mkdir /mnt/scratch/dataset; cp /dataset/alpaca_data_hundredth.json /mnt/scratch/dataset/alpaca_data.json"},
 						},
 					},
 					Containers: []corev1.Container{
@@ -183,7 +144,15 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 								},
 								{
 									Name:  "HF_HOME",
-									Value: "/tmp/huggingface",
+									Value: "/mnt/scratch/huggingface-home",
+								},
+								{
+									Name:  "HF_TOKEN",
+									Value: GetHuggingFaceToken(test),
+								},
+								{
+									Name:  "TMPDIR",
+									Value: "/mnt/scratch",
 								},
 							},
 							VolumeMounts: []corev1.VolumeMount{
@@ -192,18 +161,22 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 								{
 									Name:      "config-volume",
 									MountPath: "/etc/config",
 								},
 								{
-									Name:      "tmp-volume",
-									MountPath: "/tmp",
+									Name:      "scratch-volume",
+									MountPath: "/mnt/scratch",
+								},
+								{
+									Name:      "output-volume",
+									MountPath: "/mnt/output",
 								},
 							},
 							Resources: corev1.ResourceRequirements{
 								Requests: corev1.ResourceList{
 									corev1.ResourceCPU:    resource.MustParse("2"),
-									corev1.ResourceMemory: resource.MustParse("5Gi"),
-									"nvidia.com/gpu":      resource.MustParse("2"),
+									corev1.ResourceMemory: resource.MustParse("10Gi"),
+									"nvidia.com/gpu":      resource.MustParse(fmt.Sprint(numberOfGpus)),
 								},
 								Limits: corev1.ResourceList{
-									"nvidia.com/gpu": resource.MustParse("2"),
+									"nvidia.com/gpu": resource.MustParse(fmt.Sprint(numberOfGpus)),
 								},
 							},
 							SecurityContext: &corev1.SecurityContext{
@@ -224,9 +197,39 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 						},
 					},
 					{
-						Name: "tmp-volume",
+						Name: "scratch-volume",
+						VolumeSource: corev1.VolumeSource{
+							Ephemeral: &corev1.EphemeralVolumeSource{
+								VolumeClaimTemplate: &corev1.PersistentVolumeClaimTemplate{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
+										Resources: corev1.VolumeResourceRequirements{
+											Requests: corev1.ResourceList{
+												corev1.ResourceStorage: resource.MustParse("500Gi"),
+											},
+										},
+										VolumeMode: Ptr(corev1.PersistentVolumeFilesystem),
+									},
+								},
+							},
+						},
+					},
+					{
+						Name: "output-volume",
 						VolumeSource: corev1.VolumeSource{
-							EmptyDir: &corev1.EmptyDirVolumeSource{},
+							Ephemeral: &corev1.EphemeralVolumeSource{
+								VolumeClaimTemplate: &corev1.PersistentVolumeClaimTemplate{
+									Spec: corev1.PersistentVolumeClaimSpec{
+										AccessModes: []corev1.PersistentVolumeAccessMode{corev1.ReadWriteOnce},
+										Resources: corev1.VolumeResourceRequirements{
+											Requests: corev1.ResourceList{
+												corev1.ResourceStorage: resource.MustParse("500Gi"),
+											},
+										},
+										VolumeMode: Ptr(corev1.PersistentVolumeFilesystem),
+									},
+								},
+							},
 						},
 					},
 				},
@@ -244,12 +247,20 @@ func createAlpacaPyTorchJob(test Test, namespace, localQueueName string, config
 	return tuningJob
 }
 
-func openShiftPrometheusGpuUtil(test Test) func(g Gomega) prometheusmodel.Vector {
+func openShiftPrometheusGpuUtil(test Test, namespace string) func(g Gomega) prometheusmodel.Vector {
 	return func(g Gomega) prometheusmodel.Vector {
 		prometheusApiClient := GetOpenShiftPrometheusApiClient(test)
 		result, warnings, err := prometheusApiClient.Query(test.Ctx(), "DCGM_FI_DEV_GPU_UTIL", time.Now(), prometheusapiv1.WithTimeout(5*time.Second))
 		g.Expect(err).NotTo(HaveOccurred())
 		g.Expect(warnings).Should(HaveLen(0))
-		return result.(prometheusmodel.Vector)
+
+		var util prometheusmodel.Vector
+		for _, sample := range result.(prometheusmodel.Vector) {
+			if string(sample.Metric["exported_namespace"]) == namespace {
+				util = append(util, sample)
+			}
+		}
+
+		return util
 	}
 }