Merge remote-tracking branch 'upstream/main'

red-hat-data-services · Jun 8, 2024 · c1a387b · c1a387b
2 parents d6a16e4 + 52855ef
commit c1a387b
Show file tree

Hide file tree

Showing 15 changed files with 672 additions and 587 deletions.
diff --git a/README.md b/README.md
@@ -86,6 +86,7 @@ Current supported and tested models are `Llama2` (7 and 13B configurations have
 
 ### Single GPU
 
+The example below runs fine tuning with the given datasets and model:
 1. Using pre-processed dataset for training. 
 
 ```bash
@@ -103,20 +104,10 @@ python tuning/sft_trainer.py  \
 --output_dir $OUTPUT_PATH  \
 --num_train_epochs 5  \
 --per_device_train_batch_size 4  \
---per_device_eval_batch_size 4  \
 --gradient_accumulation_steps 4  \
---eval_strategy "no"  \
---save_strategy "epoch"  \
 --learning_rate 1e-5  \
---weight_decay 0.  \
---warmup_ratio 0.03  \
---lr_scheduler_type "cosine"  \
---logging_steps 1  \
---include_tokens_per_second  \
---packing False  \
 --response_template "\n### Response:"  \
---dataset_text_field "output" 
-
+--dataset_text_field "output"
 ```
 
 2. Using formatter with JSON/JSONL files
@@ -136,17 +127,8 @@ python tuning/sft_trainer.py  \
 --output_dir $OUTPUT_PATH  \
 --num_train_epochs 5  \
 --per_device_train_batch_size 4  \
---per_device_eval_batch_size 4  \
 --gradient_accumulation_steps 4  \
---eval_strategy "no"  \
---save_strategy "epoch"  \
 --learning_rate 1e-5  \
---weight_decay 0.  \
---warmup_ratio 0.03  \
---lr_scheduler_type "cosine"  \
---logging_steps 1  \
---include_tokens_per_second  \
---packing False  \
 --response_template "\n## Label:"  \
 --data_formatter_template "### Input: {{input}} \n\n##Label: {{output}}"
 
@@ -162,6 +144,7 @@ The recommendation is to use [huggingface accelerate](https://huggingface.co/doc
 `accelerate launch` CLI to be run with specific command line arguments, see example below. Default arguments handled by passing in a 
 `--config_file` argument; see [reference docs](https://huggingface.co/docs/accelerate/en/package_reference/cli#accelerate-launch) and [fixtures/accelerate_fsdp_defaults.yaml](./fixtures/accelerate_fsdp_defaults.yaml) for sample defaults.
 
+The example below runs multi-GPU fine tuning on 8 GPUs with FSDP:
 ```bash
 # Please set the environment variables:
 # MASTER_PORT=1234 # The port at which the process with rank 0 listens to and should be set to an unused port
@@ -181,29 +164,20 @@ tuning/sft_trainer.py \
 --output_dir $OUTPUT_PATH \
 --num_train_epochs 5 \
 --per_device_train_batch_size 4 \
---per_device_eval_batch_size 4 \
 --gradient_accumulation_steps 4 \
---eval_strategy "no" \
---save_strategy "epoch" \
 --learning_rate 1e-5 \
---weight_decay 0. \
---warmup_ratio 0.03 \
---lr_scheduler_type "cosine" \
---logging_steps 1 \
---include_tokens_per_second \
---packing False \
 --response_template "\n### Response:" \
 --dataset_text_field "output"
 ```
 
-To summarize you can pick either python for singleGPU jobs or use accelerate launch for multiGPU jobs. The following tuning techniques can be applied:
+To summarize, you can use either `python` for single-GPU jobs or `accelerate launch` for multi-GPU jobs. The following tuning techniques can be applied:
 
-## Tuning Techniques : 
+## Tuning Techniques:
 
 ### LoRA Tuning Example
 
-Set peft_method = "lora". You can additionally pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
-```bash
+Set `peft_method` to `"lora"`. You can additionally pass any arguments from [LoraConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L21).
+```py
 # Args you can pass
 r: int =8 
 lora_alpha: int = 32
@@ -217,9 +191,8 @@ target_modules: List[str] = field(
             "modules except for the output layer."
         },
     )
-  bias = "none"
-  lora_dropout: float = 0.05
-
+bias = "none"
+lora_dropout: float = 0.05
 ```
 Example command to run:
 
@@ -230,26 +203,33 @@ python tuning/sft_trainer.py \
 --output_dir $OUTPUT_PATH \
 --num_train_epochs 40 \
 --per_device_train_batch_size 4 \
---per_device_eval_batch_size 4 \
---gradient_accumulation_steps 4 \
---save_strategy "epoch" \
---learning_rate 1e-4 \
---weight_decay 0. \
---warmup_ratio 0.03 \
---lr_scheduler_type "cosine" \
---logging_steps 1 \
---include_tokens_per_second \
---packing False \
+--learning_rate 1e-4 \
 --response_template "\n### Label:" \
 --dataset_text_field "output" \
---use_flash_attn False \
---tokenizer_name_or_path $MODEL_PATH \
---torch_dtype float32 \
 --peft_method "lora" \
---logging_strategy "epoch" \
 --r 8 \
 --lora_dropout 0.05 \
---lora_alpha 16
+--lora_alpha 16 \
+--target_modules c_attn c_proj
+```
+
+Equally you can pass in a JSON configuration for running tuning. See [build doc](./build/README.md) for more details. The above can also be passed in as JSON:
+```json
+{
+    "model_name_or_path": "$MODEL_PATH",
+    "training_data_path": "$TRAIN_DATA_PATH",
+    "output_dir": "$OUTPUT_PATH",
+    "num_train_epochs": 40.0,
+    "per_device_train_batch_size": 4,
+    "learning_rate": 1e-4,
+    "response_template": "\n### Label:",
+    "dataset_text_field": "output",
+    "peft_method": "lora",
+    "r": 8,
+    "lora_dropout": 0.05,
+    "lora_alpha": 16,
+    "target_modules": ["c_attn", "c_proj"]
+}
 ```
 
 Notice the `target_modules` that are set are the default values. `target_modules` are the names of the modules to apply the adapter to. If this is specified, only the modules with the specified names will be replaced. When passing a list of strings, either an exact match will be performed or it is checked if the name of the module ends with any of the passed strings. If this is specified as `all-linear`, then all linear/Conv1D modules are chosen, excluding the output layer. If this is not specified, modules will be chosen according to the model architecture. If the architecture is not known, an error will be raised — in this case, you should specify the target modules manually. See [HuggingFace docs](https://huggingface.co/docs/peft/en/package_reference/lora#peft.LoraConfig) for more details.
@@ -307,81 +287,89 @@ For example for LLaMA model the modules look like:
 
 You can specify attention or linear layers. With the CLI, you can specify layers with `--target_modules "q_proj" "v_proj" "k_proj" "o_proj"` or `--target_modules "all-linear"`.
 
-### Prompt Tuning :
+### Prompt Tuning:
 
-Specify peft_method to 'pt' . You can additionally pass any arguments from [PromptTuningConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L39). 
-```bash
-    # prompt_tuning_init can be either "TEXT" or "RANDOM"
-    prompt_tuning_init: str = "TEXT"
-    num_virtual_tokens: int = 8
-    # prompt_tuning_init_text only applicable if prompt_tuning_init= "TEXT"
-    prompt_tuning_init_text: str = "Classify if the tweet is a complaint or not:"
-    tokenizer_name_or_path: str = "llama-7b-hf"
+Set `peft_method` to `'pt'`. You can additionally pass any arguments from [PromptTuningConfig](https://github.com/foundation-model-stack/fms-hf-tuning/blob/main/tuning/config/peft_config.py#L63).
+```py
+# prompt_tuning_init can be either "TEXT" or "RANDOM"
+prompt_tuning_init: str = "TEXT"
+num_virtual_tokens: int = 8
+# prompt_tuning_init_text only applicable if prompt_tuning_init= "TEXT"
+prompt_tuning_init_text: str = "Classify if the tweet is a complaint or not:"
+tokenizer_name_or_path: str = "llama-7b-hf"
 ```
 
 Example command you can run:  
 
 ```bash
-
-accelerate launch \
---main_process_port $MASTER_PORT \
---config_file fixtures/accelerate_fsdp_defaults.yaml \
-tuning/sft_trainer.py  \
+python tuning/sft_trainer.py  \
 --model_name_or_path $MODEL_PATH  \
 --training_data_path $TRAIN_DATA_PATH  \
 --output_dir $OUTPUT_PATH  \
---peft_method pt \
---torch_dtype bfloat16 \
---tokenizer_name_or_path $MODEL_PATH  \
 --num_train_epochs 5  \
 --per_device_train_batch_size 1  \
---per_device_eval_batch_size 1  \
---gradient_accumulation_steps 1  \
---eval_strategy "no"  \
---save_strategy "epoch"  \
---learning_rate 1e-5  \
---weight_decay 0.  \
---warmup_ratio 0.03  \
---lr_scheduler_type "cosine"  \
---logging_steps 1  \
---include_tokens_per_second  \
---packing False  \
+--learning_rate 0.03  \
 --response_template "\n### Label:"  \
---dataset_text_field "output" 
+--dataset_text_field "output" \
+--peft_method pt \
+--tokenizer_name_or_path $MODEL_PATH \
+--prompt_tuning_init "RANDOM" \
+--prompt_tuning_init_text "From the following input, identify target sentiment of following types: neutral, negative, positive"
 ```
 
-### Fine Tuning :
+Equally you can pass in a JSON configuration for running tuning. See [build doc](./build/README.md) for more details. The above can also be passed in as JSON:
+```json
+{
+    "model_name_or_path": "$MODEL_PATH",
+    "training_data_path": "$TRAIN_DATA_PATH",
+    "output_dir": "$OUTPUT_PATH",
+    "num_train_epochs": 5.0,
+    "per_device_train_batch_size": 1,
+    "learning_rate": 0.03,
+    "response_template": "\n### Label:",
+    "dataset_text_field": "output",
+    "peft_method": "pt",
+    "tokenizer_name_or_path": "$MODEL_PATH",
+    "prompt_tuning_init": "RANDOM",
+    "prompt_tuning_init_text": "From the following input, identify target sentiment of following types: neutral, negative, positive"
+}
+```
 
-Set peft_method = 'None'
+### Fine Tuning:
 
-Full fine tuning needs more compute resources, so it is advised to use the MultiGPU method
-```bash
+Set `peft_method` to `'None'` or do not provide the `peft_method` flag.
 
+Full fine tuning needs more compute resources, so it is advised to use the multi-GPU method. Example command:
+
+```bash
 accelerate launch \
---main_process_port $MASTER_PORT \
+--num_processes=4 \
 --config_file fixtures/accelerate_fsdp_defaults.yaml \
 tuning/sft_trainer.py  \
 --model_name_or_path $MODEL_PATH  \
 --training_data_path $TRAIN_DATA_PATH  \
 --output_dir $OUTPUT_PATH  \
---peft_method "None" \
---torch_dtype bfloat16 \
---tokenizer_name_or_path $MODEL_PATH  \
 --num_train_epochs 5  \
---per_device_train_batch_size 1  \
---per_device_eval_batch_size 1  \
---gradient_accumulation_steps 1  \
---eval_strategy "no"  \
---save_strategy "epoch"  \
+--per_device_train_batch_size 4  \
 --learning_rate 1e-5  \
---weight_decay 0.  \
---warmup_ratio 0.03  \
---lr_scheduler_type "cosine"  \
---logging_steps 1  \
---include_tokens_per_second  \
---packing False  \
 --response_template "\n### Label:"  \
---dataset_text_field "output" 
+--dataset_text_field "output" \
+--peft_method "None"
+```
+
+Equally you can pass in a JSON configuration for running tuning. See [build doc](./build/README.md) for more details. The above can also be passed in as JSON:
+```json
+{
+    "model_name_or_path": "$MODEL_PATH",
+    "training_data_path": "$TRAIN_DATA_PATH",
+    "output_dir": "$OUTPUT_PATH",
+    "num_train_epochs": 5.0,
+    "per_device_train_batch_size": 4,
+    "learning_rate": 1e-5,
+    "response_template": "\n### Label:",
+    "dataset_text_field": "output",
+    "peft_method": "None"
+}
 ```
 
 ## Inference
@@ -458,3 +446,4 @@ The above runs several tasks with `hendrycksTest-*` being MMLU.
 
 [Prompt Tuning on Twitter Complaints](examples/prompt_tuning_twitter_complaints/README.md)
 
+A good simple example can be found [here](examples/kfto-kueue-sft-trainer.yaml) which launches a Kubernetes-native `PyTorchJob` using the [Kubeflow Training Operator](https://github.com/kubeflow/training-operator/) with [Kueue](https://github.com/kubernetes-sigs/kueue) for the queue management of tuning jobs.
diff --git a/build/Dockerfile b/build/Dockerfile
@@ -147,9 +147,9 @@ RUN mkdir /app && \
     chmod -R g+rwX /app /tmp
 
 # Copy scripts and default configs
-COPY build/launch_training.py build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
+COPY build/accelerate_launch.py fixtures/accelerate_fsdp_defaults.yaml /app/
 COPY build/utils.py /app/build/
-RUN chmod +x /app/launch_training.py /app/accelerate_launch.py
+RUN chmod +x /app/accelerate_launch.py
 
 ENV FSDP_DEFAULTS_FILE_PATH="/app/accelerate_fsdp_defaults.yaml"
 ENV SET_NUM_PROCESSES_TO_NUM_GPUS="True"