From 6a1333769990ec08f633975bfc356cca46e961dd Mon Sep 17 00:00:00 2001
From: Chirag Jain
Date: Wed, 12 Jun 2024 15:19:55 +0000
Subject: [PATCH] Bump axolotl, data validation warnings, bf16, flash-attn only on sm_80+, highlight error

---
 Dockerfile          |  6 ++--
 Dockerfile-notebook |  2 +-
 config-base.yaml    |  8 ++++--
 requirements.txt    |  9 +++---
 sample_run.sh       |  1 -
 train.py            | 68 ++++++++++++++++++++++-----------------------
 6 files changed, 47 insertions(+), 47 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index af73a32..92c5a06 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
-# https://hub.docker.com/layers/winglian/axolotl/main-20240603-py3.11-cu121-2.3.0/images/sha256-e4b898a0f700eb86f9e802bb85c1ec6c509b2dec65d941ad43405fe323865017?context=explore
-FROM --platform=linux/amd64 winglian/axolotl@sha256:a66d1469cdad472779f6419ea67d0fbb2cce984244aa86f40c99abaa4a21b3db
+# https://hub.docker.com/layers/winglian/axolotl/main-20240612-py3.11-cu121-2.3.0/images/sha256-798eed818fb11d24a640c0efbf27f65fbaebc1d9a5db210d585aa2a4328e93e1?context=explore
+FROM --platform=linux/amd64 winglian/axolotl@sha256:aac52c92ab245793932a635e6dedf14a3a9fb009e40cdf16c10b715f1466afa8
 USER root
 COPY requirements.txt /tmp/
 RUN pip install -U pip wheel setuptools && \
@@ -9,7 +9,7 @@ RUN mkdir -p /packages && \
     cd /packages && \
     git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
+    git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
 RUN cd /packages/axolotl/ && \
     MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
     pip install --no-cache-dir -U -r /tmp/requirements.txt && \
diff --git a/Dockerfile-notebook b/Dockerfile-notebook
index f95982f..31a3c41 100644
--- a/Dockerfile-notebook
+++ b/Dockerfile-notebook
@@ -21,7 +21,7 @@ USER jovyan
 RUN cd /packages && \
     git clone https://github.com/truefoundry/axolotl && \
     cd axolotl/ && \
-    git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
+    git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
 RUN cd /packages/axolotl/ && \
     MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
     pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
diff --git a/config-base.yaml b/config-base.yaml
index c2a06d9..7ed9691 100644
--- a/config-base.yaml
+++ b/config-base.yaml
@@ -22,8 +22,11 @@ datasets: auto # type: list
 test_datasets: auto # type: list
 bf16: auto # type: bool
 bfloat16: auto # type: bool
+flash_attention: auto # type: bool
+flash_attn_cross_entropy: auto # type: bool
 flash_attn_fuse_mlp: auto # type: bool
 flash_attn_fuse_qkv: auto # type: bool
+flash_attn_rms_norm: auto # type: bool
 float16: auto # type: bool
 fp16: auto # type: bool
 load_in_4bit: auto # type: bool
@@ -64,9 +67,6 @@ early_stopping_patience: 10
 eval_sample_packing: False
 eval_steps: 0.1
 eval_strategy: steps
-flash_attention: True
-flash_attn_cross_entropy: True
-flash_attn_rms_norm: True
 gradient_accumulation_steps: 4
 gradient_checkpointing: unsloth
 gradient_checkpointing_kwargs:
@@ -106,8 +106,10 @@ warmup_ratio: 0.1
 weight_decay: 0.01
 ## Added by TrueFoundry, not native to Axolotl
 cleanup_output_dir_on_start: False
+drop_long_sequences: False
 logging_dir: ./tensorboard_logs
 mlfoundry_log_checkpoints: True
+save_model_on_interrupt: False
 use_mflow: False
 use_wandb: False
 use_tensorboard: True
diff --git a/requirements.txt b/requirements.txt
index eea8adf..d76c79d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,9 @@
 --extra-index-url https://download.pytorch.org/whl/cu121
-torch==2.3.0+cu121
 cloud-files==4.15.2
-truefoundry[ml]==0.2.4
-snowflake-connector-python[pandas]==3.7.0
+deepspeed @ git+https://github.com/truefoundry/DeepSpeed@f706f516730bb4a175870b35372e00af8c27a258
 pyarrow==15.0.0
-deepspeed @ git+https://github.com/truefoundry/DeepSpeed@0866580c316963ddda30ffee44de2c3e21129556
+rich>=13.0.0,<14
+snowflake-connector-python[pandas]==3.7.0
+torch==2.3.0+cu121
+truefoundry[ml]==0.2.4
 unsloth @ git+https://github.com/unslothai/unsloth@27fa021a7bb959a53667dd4e7cdb9598c207aa0d
diff --git a/sample_run.sh b/sample_run.sh
index 6936f9a..572b7ff 100755
--- a/sample_run.sh
+++ b/sample_run.sh
@@ -32,7 +32,6 @@ accelerate launch \
 train.py \
 config-base.yaml \
 --deepspeed ./deepspeed_configs/3_ds_z2_config.json \
---flash_attention True \
 --base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
 --train_data_uri ./sample_data/chatalpaca-openai-100.jsonl \
 --val_data_uri None \
diff --git a/train.py b/train.py
index 9e90357..7c171f5 100644
--- a/train.py
+++ b/train.py
@@ -15,6 +15,7 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import barrier, is_main_process, zero_first
 from axolotl.utils.models import load_tokenizer
+from rich import console, panel
 from transformers import AutoConfig
 from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available
@@ -38,7 +39,6 @@
 # CURRENT LIMITATIONS
 # Axolotl sets report_to to None instead of "none"
 # There should be an option to add only missing special tokens
-# Cannot control truncation vs dropping when data exceeds sequence length
 # Have to hack axolotl module globals to hook our own code
 # micro batch size still needs to be decided by the user. 1 is okay because we are using sample packing now
@@ -56,8 +56,11 @@
     "phi3": "phi_3",
     "phi_3": "phi_3",
     "phi": "phi_3",
+    "mistral": "mistral",
+    "mixtral": "mistral",
     None: "chatml",
 }
+LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 0))
 
 
 def set_cfg_option_if_auto(cfg, key, value, force=False):
@@ -101,6 +104,8 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
     axolotl_config = os.path.join(cfg.output_dir, "axolotl_config.yaml")
 
     if is_main_process():
+        import torch
+
         set_cfg_option_if_auto(cfg, "tokenizer_config", cfg.base_model_config or cfg.base_model)
 
         os.makedirs(cfg.data_dir, exist_ok=True)
@@ -146,12 +151,20 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
         set_cfg_option_if_auto(cfg, "eval_steps", 0.1)
         set_cfg_option_if_auto(cfg, "save_steps", 0.1)
-        set_cfg_option_if_auto(cfg, "tf32", is_torch_tf32_available())
+
+        is_ampere_or_newer = torch.cuda.get_device_capability(device=LOCAL_RANK) >= (8, 0)
+        is_tf32_supported = is_ampere_or_newer and is_torch_tf32_available()
+        is_bf16_supported = is_ampere_or_newer and is_torch_bf16_gpu_available()
+        set_cfg_option_if_auto(cfg, "tf32", is_tf32_supported)
         # TODO: Axolotl doesn't seem to do anything differently even though it says setting bfloat16/float16 will disable AMP
-        set_cfg_option_if_auto(cfg, "bf16", is_torch_bf16_gpu_available())
-        set_cfg_option_if_auto(cfg, "bfloat16", is_torch_bf16_gpu_available())
-        set_cfg_option_if_auto(cfg, "fp16", not is_torch_bf16_gpu_available())
-        set_cfg_option_if_auto(cfg, "float16", not is_torch_bf16_gpu_available())
+        set_cfg_option_if_auto(cfg, "bf16", is_bf16_supported)
+        set_cfg_option_if_auto(cfg, "bfloat16", is_bf16_supported)
+        set_cfg_option_if_auto(cfg, "fp16", not is_bf16_supported)
+        set_cfg_option_if_auto(cfg, "float16", not is_bf16_supported)
+
+        set_cfg_option_if_auto(cfg, "flash_attention", is_ampere_or_newer)
+        set_cfg_option_if_auto(cfg, "flash_attn_cross_entropy", is_ampere_or_newer)
+        set_cfg_option_if_auto(cfg, "flash_attn_rms_norm", is_ampere_or_newer)
         set_cfg_option_if_auto(cfg, "load_in_4bit", cfg.adapter == "qlora")
         set_cfg_option_if_auto(cfg, "flash_attn_fuse_mlp", cfg.adapter not in {"qlora", "lora"})
@@ -219,10 +232,9 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
     return axolotl_config
 
 
-def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+def _train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
     maybe_set_custom_tempdir()
-    maybe_set_torch_max_memory(device=local_rank)
+    maybe_set_torch_max_memory(device=LOCAL_RANK)
     timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
     with zero_first(is_main_process()):
         axolotl_config = make_axolotl_config(
@@ -282,31 +294,17 @@ def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
         run.end()
 
 
-if __name__ == "__main__":
-    fire.Fire(train_with_truefoundry)
+def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
+    try:
+        _train_with_truefoundry(config_base=config_base, **kwargs)
+    except Exception as e:
+        c = console.Console()
+        error_message = (
+            f"Rank {LOCAL_RANK} failed with error: {str(e)}\nPlease see the following traceback for more details."
+        )
+        c.print(panel.Panel.fit(f"[red]{error_message}[/]", title="Error", border_style="bright_red"))
+        raise
 
 
-# def check_if_model_will_fit_only_with_gpus(
-#     model_id: str,
-#     revision: Optional[str],
-#     torch_dtype,
-# ):
-#     with init_empty_weights():
-#         config = AutoConfig.from_pretrained(
-#             model_id,
-#             revision=revision,
-#             trust_remote_code=True,
-#         )
-#         model = AutoModelForCausalLM.from_config(
-#             config=config,
-#             trust_remote_code=True,
-#             torch_dtype=torch_dtype,
-#             # low_cpu_mem_usage=True,
-#         )
-#         device_map = infer_auto_device_map(model, dtype=torch_dtype)
-#         logger.info(f"Inferred device_map for auto settings: {device_map}")
-#         if any(not isinstance(v, int) for v in device_map.values()):
-#             raise RuntimeError(
-#                 "For lora/qlora the model must entirely fit on gpus without any kind of offloading to prevent bugs with merging! "
-#                 "With the current configuration model is being offloaded to cpu/disk. This causes incorrect model saving. See https://github.com/huggingface/peft/issues/868"
-#             )
+if __name__ == "__main__":
+    fire.Fire(train_with_truefoundry)