Commit

Bump axolotl, data validation warnings, bf16, flash-attn only on sm_80+, highlight error
chiragjn committed Jun 12, 2024
1 parent beffd7e commit 6a13337
Showing 6 changed files with 47 additions and 47 deletions.
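
The headline change ("flash-attn only on sm_80+") gates flash attention, bf16 and tf32 on Ampere-or-newer GPUs, i.e. CUDA compute capability (8, 0) or higher, as the train.py diff further down shows. A minimal standalone sketch of that detection, assuming a CUDA-enabled torch build with transformers installed:

```python
import os

import torch
from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available

# sm_80 corresponds to compute capability (8, 0) (Ampere); flash-attn kernels,
# bf16 and tf32 are only enabled when the local GPU meets this bar.
LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 0))
is_ampere_or_newer = torch.cuda.get_device_capability(device=LOCAL_RANK) >= (8, 0)
use_flash_attention = is_ampere_or_newer
use_bf16 = is_ampere_or_newer and is_torch_bf16_gpu_available()
use_tf32 = is_ampere_or_newer and is_torch_tf32_available()
print(f"flash-attn={use_flash_attention}, bf16={use_bf16}, tf32={use_tf32}")
```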
6 changes: 3 additions & 3 deletions Dockerfile
@@ -1,5 +1,5 @@
# https://hub.docker.com/layers/winglian/axolotl/main-20240603-py3.11-cu121-2.3.0/images/sha256-e4b898a0f700eb86f9e802bb85c1ec6c509b2dec65d941ad43405fe323865017?context=explore
FROM --platform=linux/amd64 winglian/axolotl@sha256:a66d1469cdad472779f6419ea67d0fbb2cce984244aa86f40c99abaa4a21b3db
# https://hub.docker.com/layers/winglian/axolotl/main-20240612-py3.11-cu121-2.3.0/images/sha256-798eed818fb11d24a640c0efbf27f65fbaebc1d9a5db210d585aa2a4328e93e1?context=explore
FROM --platform=linux/amd64 winglian/axolotl@sha256:aac52c92ab245793932a635e6dedf14a3a9fb009e40cdf16c10b715f1466afa8
USER root
COPY requirements.txt /tmp/
RUN pip install -U pip wheel setuptools && \
@@ -9,7 +9,7 @@ RUN mkdir -p /packages && \
cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
pip install --no-cache-dir -U -r /tmp/requirements.txt && \
2 changes: 1 addition & 1 deletion Dockerfile-notebook
@@ -21,7 +21,7 @@ USER jovyan
RUN cd /packages && \
git clone https://github.com/truefoundry/axolotl && \
cd axolotl/ && \
git checkout dffcb7adfb42dd3305fcabb0de106d5e2454315e
git checkout 0711bfeb6af7d359deb4ee2cae81ceb6890ebf80
RUN cd /packages/axolotl/ && \
MAX_JOBS=1 NVCC_APPEND_FLAGS="--threads 1" pip install -U --no-build-isolation -e .[flash-attn,mamba-ssm,fused-dense-lib] && \
pip install --no-cache-dir -U -r /tmp/llm-finetune/notebook-requirements.txt
8 changes: 5 additions & 3 deletions config-base.yaml
@@ -22,8 +22,11 @@ datasets: auto # type: list
test_datasets: auto # type: list
bf16: auto # type: bool
bfloat16: auto # type: bool
flash_attention: auto # type: bool
flash_attn_cross_entropy: auto # type: bool
flash_attn_fuse_mlp: auto # type: bool
flash_attn_fuse_qkv: auto # type: bool
flash_attn_rms_norm: auto # type: bool
float16: auto # type: bool
fp16: auto # type: bool
load_in_4bit: auto # type: bool
@@ -64,9 +67,6 @@ early_stopping_patience: 10
eval_sample_packing: False
eval_steps: 0.1
eval_strategy: steps
flash_attention: True
flash_attn_cross_entropy: True
flash_attn_rms_norm: True
gradient_accumulation_steps: 4
gradient_checkpointing: unsloth
gradient_checkpointing_kwargs:
@@ -106,8 +106,10 @@ warmup_ratio: 0.1
weight_decay: 0.01
## Added by TrueFoundry, not native to Axolotl
cleanup_output_dir_on_start: False
drop_long_sequences: False
logging_dir: ./tensorboard_logs
mlfoundry_log_checkpoints: True
save_model_on_interrupt: False
use_mflow: False
use_wandb: False
use_tensorboard: True
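
The flash_attention* switches move out of the hardcoded defaults and into the `auto` block above, so train.py decides them at runtime from the detected GPU. Only the signature of set_cfg_option_if_auto appears in this diff; the body below is an assumed sketch of how such an `auto` placeholder could be resolved:

```python
def set_cfg_option_if_auto(cfg, key, value, force=False):
    # Assumed behaviour: swap the "auto" placeholder (or force-override)
    # for a value computed at runtime, e.g. the sm_80+ checks in train.py.
    if cfg.get(key) == "auto" or force:
        cfg[key] = value

cfg = {"flash_attention": "auto"}
set_cfg_option_if_auto(cfg, "flash_attention", True)  # -> {"flash_attention": True}
```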
9 changes: 5 additions & 4 deletions requirements.txt
@@ -1,8 +1,9 @@
--extra-index-url https://download.pytorch.org/whl/cu121
torch==2.3.0+cu121
cloud-files==4.15.2
truefoundry[ml]==0.2.4
snowflake-connector-python[pandas]==3.7.0
deepspeed @ git+https://github.com/truefoundry/DeepSpeed@f706f516730bb4a175870b35372e00af8c27a258
pyarrow==15.0.0
deepspeed @ git+https://github.com/truefoundry/DeepSpeed@0866580c316963ddda30ffee44de2c3e21129556
rich>=13.0.0,<14
snowflake-connector-python[pandas]==3.7.0
torch==2.3.0+cu121
truefoundry[ml]==0.2.4
unsloth @ git+https://github.com/unslothai/unsloth@27fa021a7bb959a53667dd4e7cdb9598c207aa0d
1 change: 0 additions & 1 deletion sample_run.sh
@@ -32,7 +32,6 @@ accelerate launch \
train.py \
config-base.yaml \
--deepspeed ./deepspeed_configs/3_ds_z2_config.json \
--flash_attention True \
--base_model TinyLlama/TinyLlama-1.1B-Chat-v1.0 \
--train_data_uri ./sample_data/chatalpaca-openai-100.jsonl \
--val_data_uri None \
68 changes: 33 additions & 35 deletions train.py
@@ -15,6 +15,7 @@
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import barrier, is_main_process, zero_first
from axolotl.utils.models import load_tokenizer
from rich import console, panel
from transformers import AutoConfig
from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available

@@ -38,7 +39,6 @@
# CURRENT LIMITATIONS
# Axolotl sets report_to to None instead of "none"
# There should be an option to add only missing special tokens
# Cannot control truncation vs dropping when data exceeds sequence length
# Have to hack axolotl module globals to hook our own code
# micro batch size still needs to be decided by the user. 1 is okay because we are using sample packing now

@@ -56,8 +56,11 @@
"phi3": "phi_3",
"phi_3": "phi_3",
"phi": "phi_3",
"mistral": "mistral",
"mixtral": "mistral",
None: "chatml",
}
LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 0))


def set_cfg_option_if_auto(cfg, key, value, force=False):
@@ -101,6 +104,8 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
axolotl_config = os.path.join(cfg.output_dir, "axolotl_config.yaml")

if is_main_process():
import torch

set_cfg_option_if_auto(cfg, "tokenizer_config", cfg.base_model_config or cfg.base_model)

os.makedirs(cfg.data_dir, exist_ok=True)
@@ -146,12 +151,20 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):

set_cfg_option_if_auto(cfg, "eval_steps", 0.1)
set_cfg_option_if_auto(cfg, "save_steps", 0.1)
set_cfg_option_if_auto(cfg, "tf32", is_torch_tf32_available())

is_ampere_or_newer = torch.cuda.get_device_capability(device=LOCAL_RANK) >= (8, 0)
is_tf32_supported = is_ampere_or_newer and is_torch_tf32_available()
is_bf16_supported = is_ampere_or_newer and is_torch_bf16_gpu_available()
set_cfg_option_if_auto(cfg, "tf32", is_tf32_supported)
# TODO: Axolotl doesn't seem to do anything differently even though it says setting bfloat16/float16 will disable AMP
set_cfg_option_if_auto(cfg, "bf16", is_torch_bf16_gpu_available())
set_cfg_option_if_auto(cfg, "bfloat16", is_torch_bf16_gpu_available())
set_cfg_option_if_auto(cfg, "fp16", not is_torch_bf16_gpu_available())
set_cfg_option_if_auto(cfg, "float16", not is_torch_bf16_gpu_available())
set_cfg_option_if_auto(cfg, "bf16", is_bf16_supported)
set_cfg_option_if_auto(cfg, "bfloat16", is_bf16_supported)
set_cfg_option_if_auto(cfg, "fp16", not is_bf16_supported)
set_cfg_option_if_auto(cfg, "float16", not is_bf16_supported)

set_cfg_option_if_auto(cfg, "flash_attention", is_ampere_or_newer)
set_cfg_option_if_auto(cfg, "flash_attn_cross_entropy", is_ampere_or_newer)
set_cfg_option_if_auto(cfg, "flash_attn_rms_norm", is_ampere_or_newer)

set_cfg_option_if_auto(cfg, "load_in_4bit", cfg.adapter == "qlora")
set_cfg_option_if_auto(cfg, "flash_attn_fuse_mlp", cfg.adapter not in {"qlora", "lora"})
@@ -219,10 +232,9 @@ def make_axolotl_config(config_base, kwargs, timestamp=None):
return axolotl_config


def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
local_rank = int(os.environ.get("LOCAL_RANK", 0))
def _train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
maybe_set_custom_tempdir()
maybe_set_torch_max_memory(device=local_rank)
maybe_set_torch_max_memory(device=LOCAL_RANK)
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S")
with zero_first(is_main_process()):
axolotl_config = make_axolotl_config(
@@ -282,31 +294,17 @@ def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
run.end()


if __name__ == "__main__":
fire.Fire(train_with_truefoundry)
def train_with_truefoundry(config_base: Path = Path("examples/"), **kwargs):
try:
_train_with_truefoundry(config_base=config_base, **kwargs)
except Exception as e:
c = console.Console()
error_message = (
f"Rank {LOCAL_RANK} failed with error: {str(e)}\nPlease see the following traceback for more details."
)
c.print(panel.Panel.fit(f"[red]{error_message}[/]", title="Error", border_style="bright_red"))
raise


# def check_if_model_will_fit_only_with_gpus(
# model_id: str,
# revision: Optional[str],
# torch_dtype,
# ):
# with init_empty_weights():
# config = AutoConfig.from_pretrained(
# model_id,
# revision=revision,
# trust_remote_code=True,
# )
# model = AutoModelForCausalLM.from_config(
# config=config,
# trust_remote_code=True,
# torch_dtype=torch_dtype,
# # low_cpu_mem_usage=True,
# )
# device_map = infer_auto_device_map(model, dtype=torch_dtype)
# logger.info(f"Inferred device_map for auto settings: {device_map}")
# if any(not isinstance(v, int) for v in device_map.values()):
# raise RuntimeError(
# "For lora/qlora the model must entirely fit on gpus without any kind of offloading to prevent bugs with merging! "
# "With the current configuration model is being offloaded to cpu/disk. This causes incorrect model saving. See https://github.com/huggingface/peft/issues/868"
# )
if __name__ == "__main__":
fire.Fire(train_with_truefoundry)
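
The new train_with_truefoundry wrapper above is the "highlight error" part of the commit: any exception is echoed in a red rich panel, tagged with the failing rank, and then re-raised so the traceback still reaches the logs. A self-contained illustration of the same pattern (the error text here is invented for the example):

```python
from rich import console, panel

c = console.Console()
try:
    raise RuntimeError("CUDA out of memory")  # stand-in for a training failure
except Exception as e:
    error_message = f"Rank 0 failed with error: {e}\nPlease see the following traceback for more details."
    c.print(panel.Panel.fit(f"[red]{error_message}[/]", title="Error", border_style="bright_red"))
    raise  # re-raise so the failure still propagates
```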
