Fix Non-Contiguous Tensor Issue in Checkpoint Consolidation #708

Closed
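Note: the hunks that loaded below touch only example and tutorial scripts, collapsing multi-line calls onto single lines; the checkpoint-consolidation change named in the title is not visible in this portion of the diff. For orientation only, here is a minimal sketch of how a non-contiguous tensor error is commonly avoided when consolidating sharded checkpoints. This is an illustration, not code from this PR: the helper name consolidate_shards and the file name consolidated.pt are made up, and only standard PyTorch calls (Tensor.contiguous, torch.cat, torch.save) are used.

# Hypothetical sketch, not taken from this PR: shards produced by slicing or
# transposing are often non-contiguous views, and making them contiguous
# before concatenation/serialization avoids "non-contiguous tensor" errors.
import torch

def consolidate_shards(shards, dim=0):
    # Copy each shard into contiguous memory, then concatenate along `dim`.
    shards = [s.contiguous() for s in shards]
    return torch.cat(shards, dim=dim).contiguous()

# Usage example with deliberately non-contiguous views.
full = torch.arange(24.0).reshape(4, 6)
shards = [full[:, :3].t(), full[:, 3:].t()]  # .t() returns non-contiguous views
merged = consolidate_shards(shards)
torch.save({"weight": merged}, "consolidated.pt")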
docs/source/training_tutorials/finetune_llm.py (1 addition, 4 deletions)

@@ -57,10 +57,7 @@ def chunk(sample, chunk_length=chunk_length):
return result

# tokenize and chunk dataset
-lm_dataset = dataset.map(
-partial(chunk, chunk_length=chunk_length),
-batched=True,
-)
+lm_dataset = dataset.map(partial(chunk, chunk_length=chunk_length), batched=True,)
print(f"Total number of samples: {len(lm_dataset)}")
return lm_dataset

docs/source/training_tutorials/sft_lora_finetune_llm.py (1 addition, 5 deletions)

@@ -43,11 +43,7 @@ def training_function(script_args, training_args):
)

args = training_args.to_dict()
-sft_config = NeuronSFTConfig(
-max_seq_length=1024,
-packing=False,
-**args,
-)
+sft_config = NeuronSFTConfig(max_seq_length=1024, packing=False, **args,)

trainer = NeuronSFTTrainer(
args=sft_config,

examples/image-classification/run_image_classification.py (3 additions, 20 deletions)

@@ -266,10 +266,7 @@ def main():
if data_args.validation_dir is not None:
data_files["validation"] = os.path.join(data_args.validation_dir, "**")
dataset = load_dataset(
-"imagefolder",
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-task="image-classification",
+"imagefolder", data_files=data_files, cache_dir=model_args.cache_dir, task="image-classification",
)

# If we don't have a validation split, split off a percentage of train as validation.

@@ -340,22 +337,8 @@ def compute_metrics(p):
if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std")
else Lambda(lambda x: x)
)
-_train_transforms = Compose(
-[
-RandomResizedCrop(size),
-RandomHorizontalFlip(),
-ToTensor(),
-normalize,
-]
-)
-_val_transforms = Compose(
-[
-Resize(size),
-CenterCrop(size),
-ToTensor(),
-normalize,
-]
-)
+_train_transforms = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize,])
+_val_transforms = Compose([Resize(size), CenterCrop(size), ToTensor(), normalize,])

def train_transforms(example_batch):
    """Apply _train_transforms across a batch."""

examples/language-modeling/run_clm.py (4 additions, 16 deletions)

@@ -221,8 +221,7 @@ class DataTrainingArguments:
},
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
keep_linebreaks: bool = field(
default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}

@@ -363,11 +362,7 @@ def main():
extension = "text"
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
-**dataset_args,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token, **dataset_args,
)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():

@@ -503,11 +498,7 @@ def tokenize_function(examples):
desc="Running tokenizer on dataset",
)
else:
-tokenized_datasets = raw_datasets.map(
-tokenize_function,
-batched=True,
-remove_columns=column_names,
-)
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=column_names,)

if data_args.block_size is None:
block_size = tokenizer.model_max_length

@@ -558,10 +549,7 @@ def group_texts(examples):
desc=f"Grouping texts in chunks of {block_size}",
)
else:
-lm_datasets = tokenized_datasets.map(
-group_texts,
-batched=True,
-)
+lm_datasets = tokenized_datasets.map(group_texts, batched=True,)

if training_args.do_train:
if "train" not in tokenized_datasets:

examples/language-modeling/run_mlm.py (5 additions, 18 deletions)

@@ -187,8 +187,7 @@ class DataTrainingArguments:
},
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
mlm_probability: float = field(
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}

@@ -359,10 +358,7 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)

# If no validation data is there, validation_split_percentage will be used to divide the dataset.

@@ -511,9 +507,7 @@ def tokenize_function(examples):
)
else:
tokenized_datasets = raw_datasets.map(
-tokenize_function,
-batched=True,
-remove_columns=[text_column_name],
+tokenize_function, batched=True, remove_columns=[text_column_name],
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.

@@ -533,11 +527,7 @@ def tokenize_function(examples):
desc="Running tokenizer on every text in dataset",
)
else:
-tokenized_datasets = raw_datasets.map(
-tokenize_function,
-batched=True,
-remove_columns=column_names,
-)
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=column_names,)

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.

@@ -572,10 +562,7 @@ def group_texts(examples):
desc=f"Grouping texts in chunks of {max_seq_length}",
)
else:
-tokenized_datasets = tokenized_datasets.map(
-group_texts,
-batched=True,
-)
+tokenized_datasets = tokenized_datasets.map(group_texts, batched=True,)

if training_args.do_train:
if "train" not in tokenized_datasets:

examples/multiple-choice/run_swag.py (3 additions, 12 deletions)

@@ -123,8 +123,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: Optional[int] = field(
default=None,

@@ -316,19 +315,11 @@ def main():
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
else:
# Downloading and loading the swag dataset from the hub.
-raw_datasets = load_dataset(
-"swag",
-"regular",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
-)
+raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir, token=model_args.token,)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/question-answering/run_qa.py (2 additions, 7 deletions)

@@ -133,8 +133,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,

@@ -331,11 +330,7 @@ def main():
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-field="data",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/question-answering/run_seq2seq_qa.py (3 additions, 11 deletions)

@@ -146,8 +146,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,

@@ -376,11 +375,7 @@ def main():
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-field="data",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -485,10 +480,7 @@ def main():
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

def preprocess_squad_batch(
-examples,
-question_column: str,
-context_column: str,
-answer_column: str,
+examples, question_column: str, context_column: str, answer_column: str,
) -> Tuple[List[str], List[str]]:
questions = examples[question_column]
contexts = examples[context_column]

examples/summarization/run_summarization.py (3 additions, 8 deletions)

@@ -90,8 +90,7 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
-default=None,
-metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+default=None, metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

@@ -180,8 +179,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_source_length: Optional[int] = field(
default=1024,

@@ -427,10 +425,7 @@ def main():
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/text-classification/run_glue.py (4 additions, 14 deletions)

@@ -80,8 +80,7 @@ class DataTrainingArguments:
"""

task_name: Optional[str] = field(
-default=None,
-metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
+default=None, metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
)
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}

@@ -305,10 +304,7 @@ def main():
if data_args.task_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
-"glue",
-data_args.task_name,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"glue", data_args.task_name, cache_dir=model_args.cache_dir, token=model_args.token,
)
elif data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.

@@ -342,18 +338,12 @@ def main():
if data_args.train_file.endswith(".csv"):
# Loading a dataset from local csv files
raw_datasets = load_dataset(
-"csv",
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"csv", data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
else:
# Loading a dataset from local json files
raw_datasets = load_dataset(
-"json",
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"json", data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/text-classification/run_xnli.py (3 additions, 15 deletions)

@@ -255,11 +255,7 @@ def main():
if training_args.do_train:
if model_args.train_language is None:
train_dataset = load_dataset(
-"xnli",
-model_args.language,
-split="train",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"xnli", model_args.language, split="train", cache_dir=model_args.cache_dir, token=model_args.token,
)
else:
train_dataset = load_dataset(

@@ -273,21 +269,13 @@

if training_args.do_eval:
eval_dataset = load_dataset(
-"xnli",
-model_args.language,
-split="validation",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir, token=model_args.token,
)
label_list = eval_dataset.features["label"].names

if training_args.do_predict:
predict_dataset = load_dataset(
-"xnli",
-model_args.language,
-split="test",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"xnli", model_args.language, split="test", cache_dir=model_args.cache_dir, token=model_args.token,
)
label_list = predict_dataset.features["label"].names

examples/text-generation/generation.py (1 addition, 4 deletions)

@@ -31,10 +31,7 @@ def generate(model, tokenizer, prompts, length, temperature):
parent_parser.add_argument("model", type=str, help="The HF Hub model id or a local directory.")
export_parser = subparsers.add_parser("export", parents=[parent_parser], help="Convert model to Neuron.")
export_parser.add_argument(
-"--batch_size",
-type=int,
-default=1,
-help="The batch size.",
+"--batch_size", type=int, default=1, help="The batch size.",
)
export_parser.add_argument("--sequence_length", type=int, help="The maximum sequence length.")
export_parser.add_argument(