Fix Non-Contiguous Tensor Issue in Checkpoint Consolidation #708

Closed
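Note: the hunks that loaded below touch only example and tutorial scripts, collapsing multi-line calls onto single lines; the checkpoint-consolidation change named in the title is not visible in this portion of the diff. For orientation only, here is a minimal sketch of how a non-contiguous tensor error is commonly avoided when consolidating sharded checkpoints. This is an illustration, not code from this PR: the helper name consolidate_shards and the file name consolidated.pt are made up, and only standard PyTorch calls (Tensor.contiguous, torch.cat, torch.save) are used.

# Hypothetical sketch, not taken from this PR: shards produced by slicing or
# transposing are often non-contiguous views, and making them contiguous
# before concatenation/serialization avoids "non-contiguous tensor" errors.
import torch

def consolidate_shards(shards, dim=0):
    # Copy each shard into contiguous memory, then concatenate along `dim`.
    shards = [s.contiguous() for s in shards]
    return torch.cat(shards, dim=dim).contiguous()

# Usage example with deliberately non-contiguous views.
full = torch.arange(24.0).reshape(4, 6)
shards = [full[:, :3].t(), full[:, 3:].t()]  # .t() returns non-contiguous views
merged = consolidate_shards(shards)
torch.save({"weight": merged}, "consolidated.pt")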
docs/source/training_tutorials/finetune_llm.py (1 addition, 4 deletions)

@@ -57,10 +57,7 @@ def chunk(sample, chunk_length=chunk_length):
return result

# tokenize and chunk dataset
-lm_dataset = dataset.map(
-partial(chunk, chunk_length=chunk_length),
-batched=True,
-)
+lm_dataset = dataset.map(partial(chunk, chunk_length=chunk_length), batched=True,)
print(f"Total number of samples: {len(lm_dataset)}")
return lm_dataset

docs/source/training_tutorials/sft_lora_finetune_llm.py (1 addition, 5 deletions)

@@ -43,11 +43,7 @@ def training_function(script_args, training_args):
)

args = training_args.to_dict()
-sft_config = NeuronSFTConfig(
-max_seq_length=1024,
-packing=False,
-**args,
-)
+sft_config = NeuronSFTConfig(max_seq_length=1024, packing=False, **args,)

trainer = NeuronSFTTrainer(
args=sft_config,

examples/image-classification/run_image_classification.py (3 additions, 20 deletions)

@@ -266,10 +266,7 @@ def main():
if data_args.validation_dir is not None:
data_files["validation"] = os.path.join(data_args.validation_dir, "**")
dataset = load_dataset(
-"imagefolder",
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-task="image-classification",
+"imagefolder", data_files=data_files, cache_dir=model_args.cache_dir, task="image-classification",
)

# If we don't have a validation split, split off a percentage of train as validation.

@@ -340,22 +337,8 @@ def compute_metrics(p):
if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std")
else Lambda(lambda x: x)
)
-_train_transforms = Compose(
-[
-RandomResizedCrop(size),
-RandomHorizontalFlip(),
-ToTensor(),
-normalize,
-]
-)
-_val_transforms = Compose(
-[
-Resize(size),
-CenterCrop(size),
-ToTensor(),
-normalize,
-]
-)
+_train_transforms = Compose([RandomResizedCrop(size), RandomHorizontalFlip(), ToTensor(), normalize,])
+_val_transforms = Compose([Resize(size), CenterCrop(size), ToTensor(), normalize,])

def train_transforms(example_batch):
    """Apply _train_transforms across a batch."""

examples/language-modeling/run_clm.py (4 additions, 16 deletions)

@@ -221,8 +221,7 @@ class DataTrainingArguments:
},
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
keep_linebreaks: bool = field(
default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}

@@ -363,11 +362,7 @@ def main():
extension = "text"
dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
-**dataset_args,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token, **dataset_args,
)
# If no validation data is there, validation_split_percentage will be used to divide the dataset.
if "validation" not in raw_datasets.keys():

@@ -503,11 +498,7 @@ def tokenize_function(examples):
desc="Running tokenizer on dataset",
)
else:
-tokenized_datasets = raw_datasets.map(
-tokenize_function,
-batched=True,
-remove_columns=column_names,
-)
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=column_names,)

if data_args.block_size is None:
block_size = tokenizer.model_max_length

@@ -558,10 +549,7 @@ def group_texts(examples):
desc=f"Grouping texts in chunks of {block_size}",
)
else:
-lm_datasets = tokenized_datasets.map(
-group_texts,
-batched=True,
-)
+lm_datasets = tokenized_datasets.map(group_texts, batched=True,)

if training_args.do_train:
if "train" not in tokenized_datasets:

examples/language-modeling/run_mlm.py (5 additions, 18 deletions)

@@ -187,8 +187,7 @@ class DataTrainingArguments:
},
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
mlm_probability: float = field(
default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}

@@ -359,10 +358,7 @@ def main():
if extension == "txt":
extension = "text"
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)

# If no validation data is there, validation_split_percentage will be used to divide the dataset.

@@ -511,9 +507,7 @@ def tokenize_function(examples):
)
else:
tokenized_datasets = raw_datasets.map(
-tokenize_function,
-batched=True,
-remove_columns=[text_column_name],
+tokenize_function, batched=True, remove_columns=[text_column_name],
)
else:
# Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.

@@ -533,11 +527,7 @@ def tokenize_function(examples):
desc="Running tokenizer on every text in dataset",
)
else:
-tokenized_datasets = raw_datasets.map(
-tokenize_function,
-batched=True,
-remove_columns=column_names,
-)
+tokenized_datasets = raw_datasets.map(tokenize_function, batched=True, remove_columns=column_names,)

# Main data processing function that will concatenate all texts from our dataset and generate chunks of
# max_seq_length.

@@ -572,10 +562,7 @@ def group_texts(examples):
desc=f"Grouping texts in chunks of {max_seq_length}",
)
else:
-tokenized_datasets = tokenized_datasets.map(
-group_texts,
-batched=True,
-)
+tokenized_datasets = tokenized_datasets.map(group_texts, batched=True,)

if training_args.do_train:
if "train" not in tokenized_datasets:

examples/multiple-choice/run_swag.py (3 additions, 12 deletions)

@@ -123,8 +123,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: Optional[int] = field(
default=None,

@@ -316,19 +315,11 @@ def main():
data_files["validation"] = data_args.validation_file
extension = data_args.train_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
else:
# Downloading and loading the swag dataset from the hub.
-raw_datasets = load_dataset(
-"swag",
-"regular",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
-)
+raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir, token=model_args.token,)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/question-answering/run_qa.py (2 additions, 7 deletions)

@@ -133,8 +133,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,

@@ -331,11 +330,7 @@ def main():
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-field="data",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/question-answering/run_seq2seq_qa.py (3 additions, 11 deletions)

@@ -146,8 +146,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_seq_length: int = field(
default=384,

@@ -376,11 +375,7 @@ def main():
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-field="data",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -485,10 +480,7 @@ def main():
max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)

def preprocess_squad_batch(
-examples,
-question_column: str,
-context_column: str,
-answer_column: str,
+examples, question_column: str, context_column: str, answer_column: str,
) -> Tuple[List[str], List[str]]:
questions = examples[question_column]
contexts = examples[context_column]

examples/summarization/run_summarization.py (3 additions, 8 deletions)

@@ -90,8 +90,7 @@ class ModelArguments:
default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
)
cache_dir: Optional[str] = field(
-default=None,
-metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
+default=None, metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
)
use_fast_tokenizer: bool = field(
default=True,

@@ -180,8 +179,7 @@ class DataTrainingArguments:
default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
)
preprocessing_num_workers: Optional[int] = field(
-default=None,
-metadata={"help": "The number of processes to use for the preprocessing."},
+default=None, metadata={"help": "The number of processes to use for the preprocessing."},
)
max_source_length: Optional[int] = field(
default=1024,

@@ -427,10 +425,7 @@ def main():
data_files["test"] = data_args.test_file
extension = data_args.test_file.split(".")[-1]
raw_datasets = load_dataset(
-extension,
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+extension, data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/text-classification/run_glue.py (4 additions, 14 deletions)

@@ -80,8 +80,7 @@ class DataTrainingArguments:
"""

task_name: Optional[str] = field(
-default=None,
-metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
+default=None, metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
)
dataset_name: Optional[str] = field(
default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}

@@ -305,10 +304,7 @@ def main():
if data_args.task_name is not None:
# Downloading and loading a dataset from the hub.
raw_datasets = load_dataset(
-"glue",
-data_args.task_name,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"glue", data_args.task_name, cache_dir=model_args.cache_dir, token=model_args.token,
)
elif data_args.dataset_name is not None:
# Downloading and loading a dataset from the hub.

@@ -342,18 +338,12 @@ def main():
if data_args.train_file.endswith(".csv"):
# Loading a dataset from local csv files
raw_datasets = load_dataset(
-"csv",
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"csv", data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
else:
# Loading a dataset from local json files
raw_datasets = load_dataset(
-"json",
-data_files=data_files,
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"json", data_files=data_files, cache_dir=model_args.cache_dir, token=model_args.token,
)
# See more about loading any type of standard or custom dataset at
# https://huggingface.co/docs/datasets/loading_datasets.html.

examples/text-classification/run_xnli.py (3 additions, 15 deletions)

@@ -255,11 +255,7 @@ def main():
if training_args.do_train:
if model_args.train_language is None:
train_dataset = load_dataset(
-"xnli",
-model_args.language,
-split="train",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"xnli", model_args.language, split="train", cache_dir=model_args.cache_dir, token=model_args.token,
)
else:
train_dataset = load_dataset(

@@ -273,21 +269,13 @@

if training_args.do_eval:
eval_dataset = load_dataset(
-"xnli",
-model_args.language,
-split="validation",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"xnli", model_args.language, split="validation", cache_dir=model_args.cache_dir, token=model_args.token,
)
label_list = eval_dataset.features["label"].names

if training_args.do_predict:
predict_dataset = load_dataset(
-"xnli",
-model_args.language,
-split="test",
-cache_dir=model_args.cache_dir,
-token=model_args.token,
+"xnli", model_args.language, split="test", cache_dir=model_args.cache_dir, token=model_args.token,
)
label_list = predict_dataset.features["label"].names

examples/text-generation/generation.py (1 addition, 4 deletions)

@@ -31,10 +31,7 @@ def generate(model, tokenizer, prompts, length, temperature):
parent_parser.add_argument("model", type=str, help="The HF Hub model id or a local directory.")
export_parser = subparsers.add_parser("export", parents=[parent_parser], help="Convert model to Neuron.")
export_parser.add_argument(
-"--batch_size",
-type=int,
-default=1,
-help="The batch size.",
+"--batch_size", type=int, default=1, help="The batch size.",
)
export_parser.add_argument("--sequence_length", type=int, help="The maximum sequence length.")
export_parser.add_argument(