
Commit

Add multiGPU finetuning tests for granite, llama and mixtral models
sutaakar authored and openshift-merge-bot[bot] committed Jul 9, 2024
1 parent d82f293 commit 095c68e
Showing 9 changed files with 210 additions and 104 deletions.
6 changes: 3 additions & 3 deletions go.mod
@@ -7,9 +7,11 @@ toolchain go1.21.5
 require (
 	github.com/kubeflow/training-operator v1.7.0
 	github.com/onsi/gomega v1.31.1
-	github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7
+	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec
+	github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22
 	github.com/prometheus/client_golang v1.18.0
 	github.com/prometheus/common v0.45.0
+	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
 	k8s.io/api v0.29.2
 	k8s.io/apimachinery v0.29.2
 	sigs.k8s.io/kueue v0.6.2
@@ -51,13 +53,11 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
 	github.com/openshift-online/ocm-sdk-go v0.1.368 // indirect
-	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec // indirect
 	github.com/openshift/client-go v0.0.0-20230718165156-6014fb98e86a // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/project-codeflare/appwrapper v0.8.0 // indirect
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
-	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0 // indirect
 	github.com/sirupsen/logrus v1.9.3 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
4 changes: 2 additions & 2 deletions go.sum
@@ -363,8 +363,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
 github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7 h1:XTK5l2FRO3BbSk4Qn9xAwsRFTJ4IeGljymQWcfYLlMI=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22 h1:gjbp5kz/azRGmRBJBS6ZmoW2PHGsvYj2Mi0Dre/x5KI=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
 github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
 github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
 github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
@@ -1,22 +1,19 @@
 {
-    "model_name_or_path": "/tmp/model/bloom-560m",
-    "training_data_path": "/tmp/dataset/alpaca_data.json",
-    "output_dir": "/tmp/out",
+    "model_name_or_path": "ibm-granite/granite-20b-code-instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
     "num_train_epochs": 1.0,
-    "per_device_train_batch_size": 4,
+    "per_device_train_batch_size": 1,
     "per_device_eval_batch_size": 4,
     "gradient_accumulation_steps": 4,
     "evaluation_strategy": "no",
     "save_strategy": "epoch",
     "learning_rate": 1e-5,
     "weight_decay": 0.0,
     "lr_scheduler_type": "cosine",
-    "logging_steps": 1.0,
-    "packing": false,
     "include_tokens_per_second": true,
     "response_template": "\n### Response:",
     "dataset_text_field": "output",
     "use_flash_attn": false,
-    "torch_dtype": "float32",
-    "tokenizer_name_or_path": "/tmp/model/bloom-560m"
+    "tokenizer_name_or_path": "ibm-granite/granite-20b-code-instruct"
 }
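
All of the tuning configs in this commit share the same shape, so a test could load and sanity-check one before submitting a job. A minimal sketch of that in Go (hypothetical, not part of this commit: the TuningConfig struct, the loadTuningConfig helper and the package name are assumptions, and only a subset of fields is mirrored):

package core

import (
	"encoding/json"
	"os"
)

// TuningConfig mirrors a subset of the fms-hf-tuning JSON configs above.
type TuningConfig struct {
	ModelNameOrPath           string  `json:"model_name_or_path"`
	TrainingDataPath          string  `json:"training_data_path"`
	OutputDir                 string  `json:"output_dir"`
	NumTrainEpochs            float64 `json:"num_train_epochs"`
	PerDeviceTrainBatchSize   int     `json:"per_device_train_batch_size"`
	GradientAccumulationSteps int     `json:"gradient_accumulation_steps"`
	PeftMethod                string  `json:"peft_method,omitempty"`
}

// loadTuningConfig reads a config_*.json file and unmarshals it into TuningConfig.
func loadTuningConfig(path string) (TuningConfig, error) {
	var cfg TuningConfig
	data, err := os.ReadFile(path)
	if err != nil {
		return cfg, err
	}
	return cfg, json.Unmarshal(data, &cfg)
}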
19 changes: 19 additions & 0 deletions tests/kfto/core/config_llama2_13b.json
@@ -0,0 +1,19 @@
{
    "model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "meta-llama/Llama-2-13b-chat-hf"
}
20 changes: 20 additions & 0 deletions tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
@@ -0,0 +1,20 @@
{
    "model_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
    "peft_method": "lora"
}
19 changes: 19 additions & 0 deletions tests/kfto/core/config_meta_llama3_8b_instruct.json
@@ -0,0 +1,19 @@
{
    "model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct"
}
20 changes: 20 additions & 0 deletions tests/kfto/core/config_mixtral_8x7b_instruct_v01.json
@@ -0,0 +1,20 @@
{
    "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1"
}

20 changes: 20 additions & 0 deletions tests/kfto/core/environment.go
@@ -29,6 +29,10 @@ const (
 	bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
 	// The environment variable referring to image containing Stanford Alpaca dataset
 	alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
+	// The environment variable for HuggingFace token to download models which require authentication
+	huggingfaceTokenEnvVar = "HF_TOKEN"
+	// The environment variable specifying existing namespace to be used for multiGPU tests
+	multiGpuNamespaceEnvVar = "MULTIGPU_NAMESPACE"
 )

@@ -47,6 +51,22 @@ func GetAlpacaDatasetImage() string {
 	return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:c0492ff0005c13ac491e00d074902aa9dd21a49691945b122da23db3a3b3ac76")
 }
 
+func GetHuggingFaceToken(t Test) string {
+	image, ok := os.LookupEnv(huggingfaceTokenEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify HuggingFace token to download models.", huggingfaceTokenEnvVar)
+	}
+	return image
+}
+
+func GetMultiGpuNamespace(t Test) string {
+	image, ok := os.LookupEnv(multiGpuNamespaceEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify namespace to be used for multiGPU tests.", multiGpuNamespaceEnvVar)
+	}
+	return image
+}
+
 func lookupEnvOrDefault(key, value string) string {
 	if v, ok := os.LookupEnv(key); ok {
 		return v
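
The two new helpers only read the environment; the tests still have to wire the token into the training job so that gated models (llama, mixtral) can be downloaded. A minimal sketch of how that could look, assuming the token is staged as a Kubernetes Secret in the multiGPU namespace (the secret name, key, package name, and surrounding test wiring are assumptions, not part of this commit; Test is the codeflare-common type already used in environment.go):

package core

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newHuggingFaceTokenSecret builds (but does not create) a Secret carrying the
// HuggingFace token from HF_TOKEN, targeted at the namespace from MULTIGPU_NAMESPACE.
// Hypothetical helper for illustration only.
func newHuggingFaceTokenSecret(t Test) *corev1.Secret {
	return &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "hf-token",              // assumed secret name
			Namespace: GetMultiGpuNamespace(t), // from MULTIGPU_NAMESPACE
		},
		StringData: map[string]string{
			"HF_TOKEN": GetHuggingFaceToken(t), // from HF_TOKEN
		},
	}
}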
