
Commit

Add multiGPU finetuning tests for granite, llama and mixtral models
sutaakar authored and openshift-merge-bot[bot] committed Jul 9, 2024
1 parent d82f293 commit 095c68e
Showing 9 changed files with 210 additions and 104 deletions.
6 changes: 3 additions & 3 deletions go.mod
@@ -7,9 +7,11 @@ toolchain go1.21.5
 require (
 	github.com/kubeflow/training-operator v1.7.0
 	github.com/onsi/gomega v1.31.1
-	github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7
+	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec
+	github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22
 	github.com/prometheus/client_golang v1.18.0
 	github.com/prometheus/common v0.45.0
+	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0
 	k8s.io/api v0.29.2
 	k8s.io/apimachinery v0.29.2
 	sigs.k8s.io/kueue v0.6.2
@@ -51,13 +53,11 @@ require (
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect
 	github.com/openshift-online/ocm-sdk-go v0.1.368 // indirect
-	github.com/openshift/api v0.0.0-20230718161610-2a3e8b481cec // indirect
 	github.com/openshift/client-go v0.0.0-20230718165156-6014fb98e86a // indirect
 	github.com/pkg/errors v0.9.1 // indirect
 	github.com/project-codeflare/appwrapper v0.8.0 // indirect
 	github.com/prometheus/client_model v0.5.0 // indirect
 	github.com/prometheus/procfs v0.12.0 // indirect
-	github.com/ray-project/kuberay/ray-operator v1.1.0-alpha.0 // indirect
 	github.com/sirupsen/logrus v1.9.3 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	golang.org/x/exp v0.0.0-20230905200255-921286631fa9 // indirect
4 changes: 2 additions & 2 deletions go.sum
@@ -363,8 +363,8 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
 github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/project-codeflare/appwrapper v0.8.0 h1:vWHNtXUtHutN2EzYb6rryLdESnb8iDXsCokXOuNYXvg=
 github.com/project-codeflare/appwrapper v0.8.0/go.mod h1:FMQ2lI3fz6LakUVXgN1FTdpsc3BBkNIZZgtMmM9J5UM=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7 h1:XTK5l2FRO3BbSk4Qn9xAwsRFTJ4IeGljymQWcfYLlMI=
-github.com/project-codeflare/codeflare-common v0.0.0-20240618073051-795d7ecc5ac7/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22 h1:gjbp5kz/azRGmRBJBS6ZmoW2PHGsvYj2Mi0Dre/x5KI=
+github.com/project-codeflare/codeflare-common v0.0.0-20240702071428-eae5837bea22/go.mod h1:unKTw+XoMANTES3WieG016im7rxZ7IR2/ph++L5Vp1Y=
 github.com/prometheus/client_golang v0.9.1/go.mod h1:7SWBe2y4D6OKWSNQJUaRYU/AaXPKyh/dDVn+NZz0KFw=
 github.com/prometheus/client_golang v1.0.0/go.mod h1:db9x61etRT2tGnBNRi70OPL5FsnadC4Ky3P0J6CfImo=
 github.com/prometheus/client_golang v1.7.1/go.mod h1:PY5Wy2awLA44sXw4AOSfFBetzPP4j5+D6mVACh+pe2M=
@@ -1,22 +1,19 @@
 {
-    "model_name_or_path": "/tmp/model/bloom-560m",
-    "training_data_path": "/tmp/dataset/alpaca_data.json",
-    "output_dir": "/tmp/out",
+    "model_name_or_path": "ibm-granite/granite-20b-code-instruct",
+    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
+    "output_dir": "/mnt/output/model",
     "num_train_epochs": 1.0,
-    "per_device_train_batch_size": 4,
+    "per_device_train_batch_size": 1,
     "per_device_eval_batch_size": 4,
     "gradient_accumulation_steps": 4,
     "evaluation_strategy": "no",
     "save_strategy": "epoch",
     "learning_rate": 1e-5,
     "weight_decay": 0.0,
     "lr_scheduler_type": "cosine",
-    "logging_steps": 1.0,
-    "packing": false,
     "include_tokens_per_second": true,
     "response_template": "\n### Response:",
     "dataset_text_field": "output",
     "use_flash_attn": false,
-    "torch_dtype": "float32",
-    "tokenizer_name_or_path": "/tmp/model/bloom-560m"
+    "tokenizer_name_or_path": "ibm-granite/granite-20b-code-instruct"
 }
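
All of the tuning configs in this commit share the same shape, so a test could load and sanity-check one before submitting a job. A minimal sketch of that in Go (hypothetical, not part of this commit: the TuningConfig struct, the loadTuningConfig helper and the package name are assumptions, and only a subset of fields is mirrored):

package core

import (
	"encoding/json"
	"os"
)

// TuningConfig mirrors a subset of the fms-hf-tuning JSON configs above.
type TuningConfig struct {
	ModelNameOrPath           string  `json:"model_name_or_path"`
	TrainingDataPath          string  `json:"training_data_path"`
	OutputDir                 string  `json:"output_dir"`
	NumTrainEpochs            float64 `json:"num_train_epochs"`
	PerDeviceTrainBatchSize   int     `json:"per_device_train_batch_size"`
	GradientAccumulationSteps int     `json:"gradient_accumulation_steps"`
	PeftMethod                string  `json:"peft_method,omitempty"`
}

// loadTuningConfig reads a config_*.json file and unmarshals it into TuningConfig.
func loadTuningConfig(path string) (TuningConfig, error) {
	var cfg TuningConfig
	data, err := os.ReadFile(path)
	if err != nil {
		return cfg, err
	}
	return cfg, json.Unmarshal(data, &cfg)
}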
19 changes: 19 additions & 0 deletions tests/kfto/core/config_llama2_13b.json
@@ -0,0 +1,19 @@
{
    "model_name_or_path": "meta-llama/Llama-2-13b-chat-hf",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "meta-llama/Llama-2-13b-chat-hf"
}
20 changes: 20 additions & 0 deletions tests/kfto/core/config_meta_llama3_70b_instruct_lora.json
@@ -0,0 +1,20 @@
{
    "model_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-70B-Instruct",
    "peft_method": "lora"
}
19 changes: 19 additions & 0 deletions tests/kfto/core/config_meta_llama3_8b_instruct.json
@@ -0,0 +1,19 @@
{
    "model_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 4,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "meta-llama/Meta-Llama-3-8B-Instruct"
}
20 changes: 20 additions & 0 deletions tests/kfto/core/config_mixtral_8x7b_instruct_v01.json
@@ -0,0 +1,20 @@
{
    "model_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
    "training_data_path": "/mnt/scratch/dataset/alpaca_data.json",
    "output_dir": "/mnt/output/model",
    "num_train_epochs": 1.0,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 4,
    "gradient_accumulation_steps": 1,
    "evaluation_strategy": "no",
    "save_strategy": "epoch",
    "learning_rate": 1e-5,
    "weight_decay": 0.0,
    "lr_scheduler_type": "cosine",
    "include_tokens_per_second": true,
    "response_template": "\n### Response:",
    "dataset_text_field": "output",
    "use_flash_attn": false,
    "tokenizer_name_or_path": "mistralai/Mixtral-8x7B-Instruct-v0.1"
}

20 changes: 20 additions & 0 deletions tests/kfto/core/environment.go
@@ -29,6 +29,10 @@ const (
 	bloomModelImageEnvVar = "BLOOM_MODEL_IMAGE"
 	// The environment variable referring to image containing Stanford Alpaca dataset
 	alpacaDatasetImageEnvVar = "ALPACA_DATASET_IMAGE"
+	// The environment variable for HuggingFace token to download models which require authentication
+	huggingfaceTokenEnvVar = "HF_TOKEN"
+	// The environment variable specifying existing namespace to be used for multiGPU tests
+	multiGpuNamespaceEnvVar = "MULTIGPU_NAMESPACE"
 )

@@ -47,6 +51,22 @@ func GetAlpacaDatasetImage() string {
 	return lookupEnvOrDefault(alpacaDatasetImageEnvVar, "quay.io/ksuta/alpaca-dataset@sha256:c0492ff0005c13ac491e00d074902aa9dd21a49691945b122da23db3a3b3ac76")
 }
 
+func GetHuggingFaceToken(t Test) string {
+	image, ok := os.LookupEnv(huggingfaceTokenEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify HuggingFace token to download models.", huggingfaceTokenEnvVar)
+	}
+	return image
+}
+
+func GetMultiGpuNamespace(t Test) string {
+	image, ok := os.LookupEnv(multiGpuNamespaceEnvVar)
+	if !ok {
+		t.T().Fatalf("Expected environment variable %s not found, please use this environment variable to specify namespace to be used for multiGPU tests.", multiGpuNamespaceEnvVar)
+	}
+	return image
+}
+
 func lookupEnvOrDefault(key, value string) string {
 	if v, ok := os.LookupEnv(key); ok {
 		return v
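
The two new helpers only read the environment; the tests still have to wire the token into the training job so that gated models (llama, mixtral) can be downloaded. A minimal sketch of how that could look, assuming the token is staged as a Kubernetes Secret in the multiGPU namespace (the secret name, key, package name, and surrounding test wiring are assumptions, not part of this commit; Test is the codeflare-common type already used in environment.go):

package core

import (
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// newHuggingFaceTokenSecret builds (but does not create) a Secret carrying the
// HuggingFace token from HF_TOKEN, targeted at the namespace from MULTIGPU_NAMESPACE.
// Hypothetical helper for illustration only.
func newHuggingFaceTokenSecret(t Test) *corev1.Secret {
	return &corev1.Secret{
		ObjectMeta: metav1.ObjectMeta{
			Name:      "hf-token",              // assumed secret name
			Namespace: GetMultiGpuNamespace(t), // from MULTIGPU_NAMESPACE
		},
		StringData: map[string]string{
			"HF_TOKEN": GetHuggingFaceToken(t), // from HF_TOKEN
		},
	}
}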
