From 6a1333769990ec08f633975bfc356cca46e961dd Mon Sep 17 00:00:00 2001
From: Chirag Jain
Date: Wed, 12 Jun 2024 15:19:55 +0000
Subject: [PATCH] Bump axolotl, data validation warnings, bf16, flash-attn only on sm_80+, highlight error

---
 Dockerfile          |  6 ++--
 Dockerfile-notebook |  2 +-
 config-base.yaml    |  8 ++++--
 requirements.txt    |  9 +++---
 sample_run.sh       |  1 -
 train.py            | 68 ++++++++++++++++++++++-----------------------
 6 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index af73a32..92c5a06 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
-# https://hub.docker.com/layers/winglian/axolotl/main-20240603-py3.11-cu121-2.3.0/images/sha256-e4b898a0f700eb86f9e802bb85c1ec6c509b2dec65d941ad43405fe323865017?context=explore
-FROM --platform=linux/amd64 winglian/axolotl@sha256:a66d1469cdad472779f6419ea67d0fbb2cce984244aa86f40c99abaa4a21b3db
+# https://hub.docker.com/layers/winglian/axolotl/main-20240612-py3.11-cu121-2.3.0/images/sha256-798eed818fb11d24a640c0efbf27f65fbaebc1d9a5db210d585aa2a4328e93e1?context=explore
+FROM --platform=linux/amd64 winglian/axolotl@sha256:aac52c92ab245793932a635e6dedf14a3a9fb009e40cdf16c10b715f1466afa8
 USER root
 COPY requirements.txt /tmp/
 RUN pip install -U pip wheel setuptools && \
@@ -9,7 +9,7 @@ RUN mkdir -p /packages && \
     cd /packages && \
     git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
+    git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
 RUN cd /packages/axolotl/ && \
     MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
     pip install --no-cache-dir -U -r /tmp/requirements.txt && \
diff --git a/Dockerfile-notebook b/Dockerfile-notebook
index f95982f..31a3c41 100644
--- a/Dockerfile-notebook
+++ b/Dockerfile-notebook
@@ -21,7 +21,7 @@ USER jovyan
 RUN cd /packages && \
     git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
+    git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
 RUN cd /packages/axolotl/ && \
     MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
     pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
diff --git a/config-base.yaml b/config-base.yaml
index c2a06d9..7ed9691 100644
--- a/config-base.yaml
+++ b/config-base.yaml
@@ -22,8 +22,11 @@ datasets: auto # type: list
 test_datasets: auto # type: list
 bf16: auto # type: bool
 bfloat16: auto # type: bool
+flash_attention: auto # type: bool
+flash_attn_cross_entropy: auto # type: bool
 flash_attn_fuse_mlp: auto # type: bool
 flash_attn_fuse_qkv: auto # type: bool
+flash_attn_rms_norm: auto # type: bool
 float16: auto # type: bool
 fp16: auto # type: bool
 load_in_4bit: auto # type: bool
@@ -64,9 +67,6 @@ early_stopping_patience: 10
 eval_sample_packing: False
 eval_steps: 0.1
 eval_strategy: steps
-flash_attention: True
-flash_attn_cross_entropy: True
-flash_attn_rms_norm: True
 gradient_accumulation_steps: 4
 gradient_checkpointing: unsloth
 gradient_checkpointing_kwargs:
@@ -106,8 +106,10 @@ warmup_ratio: 0.1
 weight_decay: 0.01
 ## Added by TrueFoundry, not native to Axolotl
 cleanup_output_dir_on_start: False
+drop_long_sequences: False
 logging_dir: ./tensorboard_logs
 mlfoundry_log_checkpoints: True
+save_model_on_interrupt: False
 use_mflow: False
 use_wandb: False
 use_tensorboard: True
diff --git a/requirements.txt b/requirements.txt
index eea8adf..d76c79d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-torch==2.3.0+cu121
 cloud-files==4.15.2
-truefoundry[ml]==0.2.4
-snowflake-connector-python[pandas]==3.7.0
+deepspeed @ git+https://github.com/truefoundry/DeepSpeed@f706f516730bb4a175870b35372e00af8c27a258
 pyarrow==15.0.0
-deepspeed @ git+https://github.com/truefoundry/DeepSpeed@0866580c316963ddda30ffee44de2c3e21129556
+rich>=13.0.0,<14
+snowflake-connector-python[pandas]==3.7.0
+torch==2.3.0+cu121
+truefoundry[ml]==0.2.4
 unsloth @ git+https://github.com/unslothai/unsloth@27fa021a7bb959a53667dd4e7cdb9598c207aa0d
diff --git a/sample_run.sh b/sample_run.sh
index 6936f9a..572b7ff 100755
--- a/sample_run.sh
+++ b/sample_run.sh
@@ -32,7 +32,6 @@ accelerate launch \
 train.py \
 config-base.yaml \
 --deepspeed ./deepspeed_configs/3_ds_z2_config.json \
---flash_attention True \
 --base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
 --train_data_uri ./sample_data/chatalpaca-openai-100.jsonl \
 --val_data_uri None \
diff --git a/train.py b/train.py
index 9e90357..7c171f5 100644
--- a/train.py
+++ b/train.py
@@ -15,6 +15,7 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import barrier, is_main_process, zero_first
 from axolotl.utils.models import load_tokenizer
+from rich import console, panel
 from transformers import AutoConfig
 from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available
@@ -38,7 +39,6 @@
 # CURRENT LIMITATIONS
 # Axolotl sets report_to to None instead of "none"
 # There should be an option to add only missing special tokens
-# Cannot control truncation vs dropping when data exceeds sequence length
 # Have to hack axolotl module globals to hook our own code
 # micro batch size still needs to be decided by the user. 1 is okay because we are using sample packing now
@@ -56,8 +56,11 @@
     "phi3": "phi_3",
     "phi_3": "phi_3",
     "phi": "phi_3",
+    "mistral": "mistral",
+    "mixtral": "mistral",
     None: "chatml",
 }
+LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 0))
 
 
 def set_cfg_option_if_auto(cfg, key, value, force=False):
@@ -101,6 +104,8 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
     axolotl_config = os.path.join(cfg.output_dir, "axolotl_config.yaml")
 
     if is_main_process():
+        import torch
+
         set_cfg_option_if_auto(cfg, "tokenizer_config", cfg.base_model_config or cfg.base_model)
 
         os.makedirs(cfg.data_dir, exist_ok=True)
@@ -146,12 +151,20 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
         set_cfg_option_if_auto(cfg, "eval_steps", 0.1)
         set_cfg_option_if_auto(cfg, "save_steps", 0.1)
-        set_cfg_option_if_auto(cfg, "tf32", is_torch_tf32_available())
+
+        is_ampere_or_newer = torch.cuda.get_device_capability(device=LOCAL_RANK) >= (8, 0)
+        is_tf32_supported = is_ampere_or_newer and is_torch_tf32_available()
+        is_bf16_supported = is_ampere_or_newer and is_torch_bf16_gpu_available()
+        set_cfg_option_if_auto(cfg, "tf32", is_tf32_supported)
         # TODO: Axolotl doesn't seem to do anything differently even though it says setting bfloat16/float16 will disable AMP
-        set_cfg_option_if_auto(cfg, "bf16", is_torch_bf16_gpu_available())
-        set_cfg_option_if_auto(cfg, "bfloat16", is_torch_bf16_gpu_available())
-        set_cfg_option_if_auto(cfg, "fp16", not is_torch_bf16_gpu_available())
-        set_cfg_option_if_auto(cfg, "float16", not is_torch_bf16_gpu_available())
+        set_cfg_option_if_auto(cfg, "bf16", is_bf16_supported)
+        set_cfg_option_if_auto(cfg, "bfloat16", is_bf16_supported)
+        set_cfg_option_if_auto(cfg, "fp16", not is_bf16_supported)
+        set_cfg_option_if_auto(cfg, "float16", not is_bf16_supported)
+
+        set_cfg_option_if_auto(cfg, "flash_attention", is_ampere_or_newer)
+        set_cfg_option_if_auto(cfg, "flash_attn_cross_entropy", is_ampere_or_newer)
+        set_cfg_option_if_auto(cfg, "flash_attn_rms_norm", is_ampere_or_newer)
         set_cfg_option_if_auto(cfg, "load_in_4bit", cfg.adapter == "qlora")
         set_cfg_option_if_auto(cfg, "flash_attn_fuse_mlp", cfg.adapter not in {"qlora", "lora"})
@@ -219,10 +232,9 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
     return axolotl_config
 
 
-def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+def _train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
     maybe_set_custom_tempdir()
-    maybe_set_torch_max_memory(device=local_rank)
+    maybe_set_torch_max_memory(device=LOCAL_RANK)
     timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
     with zero_first(is_main_process()):
         axolotl_config = make_axolotl_config(
@@ -282,31 +294,17 @@ def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
         run.end()
 
 
-if __name__ == "__main__":
-    fire.Fire(train_with_truefoundry)
+def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
+    try:
+        _train_with_truefoundry(config_base=config_base, **kwargs)
+    except Exception as e:
+        c = console.Console()
+        error_message = (
+            f"Rank {LOCAL_RANK} failed with error: {str(e)}\nPlease see the following traceback for more details."
+        )
+        c.print(panel.Panel.fit(f"[red]{error_message}[/]", title="Error", border_style="bright_red"))
+        raise
 
 
-# def check_if_model_will_fit_only_with_gpus(
-#     model_id: str,
-#     revision: Optional[str],
-#     torch_dtype,
-# ):
-#     with init_empty_weights():
-#         config = AutoConfig.from_pretrained(
-#             model_id,
-#             revision=revision,
-#             trust_remote_code=True,
-#         )
-#         model = AutoModelForCausalLM.from_config(
-#             config=config,
-#             trust_remote_code=True,
-#             torch_dtype=torch_dtype,
-#             # low_cpu_mem_usage=True,
-#         )
-#         device_map = infer_auto_device_map(model, dtype=torch_dtype)
-#         logger.info(f"Inferred device_map for auto settings: {device_map}")
-#         if any(not isinstance(v, int) for v in device_map.values()):
-#             raise RuntimeError(
-#                 "For lora/qlora the model must entirely fit on gpus without any kind of offloading to prevent bugs with merging! "
-#                 "With the current configuration model is being offloaded to cpu/disk. This causes incorrect model saving. See https://github.com/huggingface/peft/issues/868"
-#             )
+if __name__ == "__main__":
+    fire.Fire(train_with_truefoundry)