
Commit

Merge remote-tracking branch 'upstream/release' into release
dchourasia committed Oct 19, 2024
2 parents 2c73c6b + 8f16818 · commit e1a7f3d
Showing 15 changed files with 99 additions and 36 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/format.yml
@@ -16,9 +16,9 @@ name: Format

on:
push:
branches: [ "main" ]
branches: [ "main", "release" ]
pull_request:
branches: [ "main" ]
branches: [ "main", "release" ]

jobs:
lint:
4 changes: 2 additions & 2 deletions .github/workflows/image.yaml
@@ -1,9 +1,9 @@
name: Image
on:
push:
branches: [ "main" ]
branches: [ "main", "release" ]
pull_request:
branches: [ "main" ]
branches: [ "main", "release" ]

jobs:
build:
4 changes: 2 additions & 2 deletions .github/workflows/test.yaml
@@ -1,9 +1,9 @@
name: Test
on:
push:
branches: [ "main" ]
branches: [ "main", "release" ]
pull_request:
branches: [ "main" ]
branches: [ "main", "release" ]

jobs:
build:
2 changes: 1 addition & 1 deletion CODEOWNERS
@@ -8,4 +8,4 @@
# https://help.github.com/en/articles/about-code-owners
#

* @anhuong @Ssukriti @alex-jw-brooks
* @anhuong @Ssukriti @aluu317 @fabianlim @kmehant
6 changes: 6 additions & 0 deletions build/Dockerfile
@@ -169,6 +169,12 @@ RUN mkdir /app && \
chown -R $USER:0 /app /tmp && \
chmod -R g+rwX /app /tmp

# Set Triton environment variables for qLoRA
ENV TRITON_HOME="/tmp/triton_home"
ENV TRITON_DUMP_DIR="/tmp/triton_dump_dir"
ENV TRITON_CACHE_DIR="/tmp/triton_cache_dir"
ENV TRITON_OVERRIDE_DIR="/tmp/triton_override_dir"

# Need a better way to address these hacks
RUN if [[ "${ENABLE_AIM}" == "true" ]] ; then \
touch /.aim_profile && \
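
The four TRITON_* variables route Triton's kernel cache, dump, and override directories into /tmp, which the RUN block above makes group-writable. A minimal sketch for confirming them inside a running container (illustrative only, not part of this commit):

import os

# Print the Triton locations the Dockerfile sets; anything left unset would fall
# back to Triton's defaults under the user's home directory, which is typically
# not writable when the image runs as an arbitrary non-root user.
for var in ("TRITON_HOME", "TRITON_DUMP_DIR", "TRITON_CACHE_DIR", "TRITON_OVERRIDE_DIR"):
    print(f"{var}={os.environ.get(var, '<unset>')}")
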
3 changes: 2 additions & 1 deletion fixtures/accelerate_fsdp_defaults.yaml
@@ -14,9 +14,10 @@ fsdp_config:
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP

# this controls the FSDP pipelining
fsdp_backward_prefetch_policy: BACKWARD_PRE # set to BACKWARD_PRE for the most time-efficient pipeline
fsdp_backward_prefetch: BACKWARD_PRE # set to BACKWARD_PRE for the most time-efficient pipeline
# but requires the most memory. BACKWARD_POST is the less
# memory intensive option
fsdp_backward_prefetch_policy: BACKWARD_PRE # for backwards compatibility

# setting this to true will increase forward memory by prefetching the next FSDP all-gather, while performing
# the current forward pass.
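
For orientation, the renamed key corresponds to the backward_prefetch option on Accelerate's FSDP plugin, which wraps torch's BackwardPrefetch enum. A minimal sketch using only the framework names (not repo code, and assuming a recent accelerate/torch pairing):

from accelerate import FullyShardedDataParallelPlugin
from torch.distributed.fsdp import BackwardPrefetch

# BACKWARD_PRE prefetches the next all-gather while the current backward pass is
# still running: the most time-efficient pipeline, but also the most memory.
# BACKWARD_POST delays the prefetch and is the less memory-intensive option.
fsdp_plugin = FullyShardedDataParallelPlugin(
    backward_prefetch=BackwardPrefetch.BACKWARD_PRE,
)

Passing this plugin to Accelerator(fsdp_plugin=fsdp_plugin) is roughly equivalent to pointing accelerate launch at the YAML above.
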
8 changes: 4 additions & 4 deletions pyproject.toml
@@ -27,14 +27,14 @@ classifiers=[
]
dependencies = [
"numpy>=1.26.4,<2.0",
"accelerate>=0.20.3,<0.34",
"transformers>4.41,<4.45",
"torch>=2.2.0,<3.0",
"accelerate>=0.20.3,!=0.34,<1.1",
"transformers>4.41,<4.50",
"torch>=2.2.0,<2.5",
"sentencepiece>=0.1.99,<0.3",
"tokenizers>=0.13.3,<1.0",
"tqdm>=4.66.2,<5.0",
"trl>=0.9.3,<1.0",
"peft>=0.8.0,<0.13",
"peft>=0.8.0,<0.14",
"protobuf>=5.28.0,<6.0.0",
"datasets>=2.15.0,<3.0",
"simpleeval>=0.9.13,<1.0",
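
A quick way to sanity-check an existing environment against the updated pins; a hedged sketch that hard-codes only the four changed entries and assumes the packaging library is installed (the authoritative list stays in pyproject.toml):

from importlib.metadata import version

from packaging.specifiers import SpecifierSet

# Only the pins touched by this commit.
PINS = {
    "accelerate": ">=0.20.3,!=0.34,<1.1",
    "transformers": ">4.41,<4.50",
    "torch": ">=2.2.0,<2.5",
    "peft": ">=0.8.0,<0.14",
}

for name, spec in PINS.items():
    installed = version(name)
    status = "ok" if installed in SpecifierSet(spec) else f"violates {spec}"
    print(f"{name} {installed}: {status}")
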
4 changes: 2 additions & 2 deletions scripts/run_inference.py
@@ -34,7 +34,7 @@
import torch

# Local
from tuning.data import tokenizer_data_utils
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


### Utilities
@@ -219,7 +219,7 @@ def load(
# where the model's layers are modified, in our case the embedding layer
# is modified, so we resize the backbone model's embedding layer with our own
# utility before passing it along to load the PEFT model.
tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
{}, tokenizer=tokenizer, model=base_model
)
model = PeftModel.from_pretrained(
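
With the tuning.data package removed (its __init__.py is deleted below), the helper is now imported from tuning.utils and called directly. A self-contained sketch of the updated call path, using the tiny model name from the test fixtures purely for illustration:

from transformers import AutoModelForCausalLM, AutoTokenizer

from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

MODEL_NAME = "Maykeye/TinyLLama-v0"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
base_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

# Resize the backbone's embedding layer before any PEFT adapters are loaded on
# top of it, mirroring the comment in scripts/run_inference.py.
tokenizer_and_embedding_resize({}, tokenizer=tokenizer, model=base_model)
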
2 changes: 1 addition & 1 deletion tests/build/dummy_job_config.json
@@ -5,7 +5,7 @@
"dynamo_use_dynamic": true,
"num_machines": 1,
"main_process_port": 1234,
"fsdp_backward_prefetch_policy": "TRANSFORMER_BASED_WRAP",
"fsdp_backward_prefetch": "TRANSFORMER_BASED_WRAP",
"fsdp_sharding_strategy": 1,
"fsdp_state_dict_type": "FULL_STATE_DICT",
"fsdp_cpu_ram_efficient_loading": true,
2 changes: 1 addition & 1 deletion tests/build/test_utils.py
@@ -44,7 +44,7 @@ def test_process_accelerate_launch_args(job_config):
args = process_accelerate_launch_args(job_config)
# json config values used
assert args.use_fsdp is True
assert args.fsdp_backward_prefetch_policy == "TRANSFORMER_BASED_WRAP"
assert args.fsdp_backward_prefetch == "TRANSFORMER_BASED_WRAP"
assert args.env == ["env1", "env2"]
assert args.training_script == "tuning.sft_trainer"
assert args.config_file == "fixtures/accelerate_fsdp_defaults.yaml"
74 changes: 70 additions & 4 deletions tests/utils/test_embedding_resize.py
@@ -20,9 +20,10 @@
import torch

# Local
from tuning.data import tokenizer_data_utils
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize

MODEL_NAME = "Maykeye/TinyLLama-v0"
INPUT_TEXT = "### Text: @NortonSupport Thanks much.\n\n### Label:"


def _inference(
@@ -41,16 +41,16 @@ def _inference(


def test_output_unaltered_across_embedding_resizes():
input_text = "### Text: @NortonSupport Thanks much.\n\n### Label:"
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model_not_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model_resized = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model_resized, multiple_of=8
)

tokenizer_data_utils.tokenizer_and_embedding_resize(
tokenizer_and_embedding_resize(
special_tokens_dict={},
tokenizer=tokenizer,
model=model_not_resized,
@@ -74,3 +74,68 @@
)

assert output_from_model_not_resized == output_from_model_resized


def test_resize_with_special_tokens():
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

input_tokenizer_len = len(tokenizer.get_vocab())

special_tokens = {"sep_token": "<SEP>", "pad_token": "<PAD>"}
resize_result = tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens,
tokenizer=tokenizer,
model=model,
multiple_of=1,
)

assert "<SEP>" in tokenizer.get_vocab()
assert "<PAD>" in tokenizer.get_vocab()

output_tokenizer_len = len(tokenizer.get_vocab())

assert output_tokenizer_len == input_tokenizer_len + 2
assert resize_result["num_new_tokens"] == output_tokenizer_len - input_tokenizer_len

output = _inference(
tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
)
assert output is not None


def test_no_resize_when_no_special_tokens():
input_text = INPUT_TEXT
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

input_tokenizer_len = len(tokenizer.get_vocab())

resize_result = tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=1
)

output_tokenizer_len = len(tokenizer.get_vocab())

assert input_tokenizer_len == output_tokenizer_len
assert resize_result["num_new_tokens"] == 0

output = _inference(
tokenizer=tokenizer, model=model, input_text=input_text, max_new_tokens=20
)

assert output is not None


def test_resize_with_multiple_of():
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

resize_result = tokenizer_and_embedding_resize(
special_tokens_dict={}, tokenizer=tokenizer, model=model, multiple_of=8
)

assert model.get_input_embeddings().embedding_dim % 8 == 0
assert resize_result["new_embedding_size"] % 8 == 0
assert model.get_output_embeddings().out_features % 8 == 0
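
The multiple_of check in the last test presumably comes down to rounding the vocabulary up to the next multiple of 8 before the embeddings are resized. A hedged sketch of that arithmetic with an illustrative (not model-specific) vocabulary size:

import math

vocab_len = 32003      # hypothetical vocabulary size after adding special tokens
multiple_of = 8

# Round up to the next multiple, matching the % 8 == 0 assertions above.
padded_rows = math.ceil(vocab_len / multiple_of) * multiple_of
print(padded_rows)     # 32008
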
2 changes: 1 addition & 1 deletion tests/utils/test_tokenizer_data_utils.py
@@ -7,7 +7,7 @@

# Local
# First party
from tuning.data.tokenizer_data_utils import tokenizer_and_embedding_resize
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


def test_tokenizer_and_embedding_resize_return_values():
13 changes: 0 additions & 13 deletions tuning/data/__init__.py

This file was deleted.

7 changes: 5 additions & 2 deletions tuning/sft_trainer.py
@@ -53,7 +53,6 @@
FileLoggingTrackerConfig,
TrackerConfigFactory,
)
from tuning.data import tokenizer_data_utils
from tuning.trackers.tracker_factory import FILE_LOGGING_TRACKER, get_tracker
from tuning.trainercontroller import TrainerControllerCallback
from tuning.utils.config_utils import get_hf_peft_config, get_json_config
@@ -70,6 +69,7 @@
is_pretokenized_dataset,
validate_data_args,
)
from tuning.utils.tokenizer_data_utils import tokenizer_and_embedding_resize


def train(
@@ -294,7 +294,7 @@ def train(

# TODO: lower priority but understand if resizing impacts inference quality and why its needed.
# It makes sense if we manipulate tokenizer that we also save it and provide it to inference.
added_tokens_dict = tokenizer_data_utils.tokenizer_and_embedding_resize(
added_tokens_dict = tokenizer_and_embedding_resize(
special_tokens_dict=special_tokens_dict,
tokenizer=tokenizer,
model=model,
@@ -637,6 +637,9 @@ def main():
combined_tracker_configs.file_logger_config = file_logger_config
combined_tracker_configs.aim_config = aim_config

if training_args.output_dir:
os.makedirs(training_args.output_dir, exist_ok=True)
logger.info("using the output directory at %s", training_args.output_dir)
try:
trainer, additional_train_info = train(
model_args=model_args,
File renamed without changes.
