diff --git a/examples/audio/process_audio.py b/examples/audio/process_audio.py
index 8657d53ef957..d34461937284 100644
--- a/examples/audio/process_audio.py
+++ b/examples/audio/process_audio.py
@@ -176,6 +176,7 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
         raise RuntimeError('Model does not have a sampler')
 
     if cfg.audio_dir is not None:
+        input_dir = cfg.audio_dir
         filepaths = list(glob.glob(os.path.join(cfg.audio_dir, f"**/*.{cfg.audio_type}"), recursive=True))
     else:
         # get filenames from manifest
@@ -193,6 +194,15 @@ def main(cfg: ProcessConfig) -> ProcessConfig:
                     audio_file = manifest_dir / audio_file
                 filepaths.append(str(audio_file.absolute()))
 
+        # common path for all files
+        common_path = os.path.commonpath(filepaths)
+        if Path(common_path).is_relative_to(manifest_dir):
+            # if all paths are relative to the manifest, use manifest dir as input dir
+            input_dir = manifest_dir
+        else:
+            # use the parent of the common path as input dir
+            input_dir = Path(common_path).parent
+
     if cfg.max_utts is not None:
         # Limit the number of utterances to process
         filepaths = filepaths[: cfg.max_utts]
@@ -238,6 +248,7 @@ def autocast():
                 batch_size=cfg.batch_size,
                 num_workers=cfg.num_workers,
                 input_channel_selector=cfg.input_channel_selector,
+                input_dir=input_dir,
             )
 
     logging.info(f"Finished processing {len(filepaths)} files!")
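The `input_dir` selection above can be sanity-checked in isolation; a minimal sketch with made-up paths (`Path.is_relative_to` requires Python 3.9+):

```python
import os
from pathlib import Path

# Hypothetical layout: the audio files live outside the manifest's directory,
# so their common path is not relative to manifest_dir.
manifest_dir = Path("/data/manifests")
filepaths = [
    "/data/corpus/speaker1/utt1.wav",
    "/data/corpus/speaker2/utt2.wav",
]

common_path = os.path.commonpath(filepaths)  # '/data/corpus'
if Path(common_path).is_relative_to(manifest_dir):
    # all paths sit under the manifest directory
    input_dir = manifest_dir
else:
    # parent of the common path, so the shared top-level folder ('corpus')
    # is preserved when the output tree mirrors the input tree
    input_dir = Path(common_path).parent

print(input_dir)  # /data
```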
diff --git a/nemo/collections/audio/models/audio_to_audio.py b/nemo/collections/audio/models/audio_to_audio.py
index 60c16f756f58..57b4c9d48119 100644
--- a/nemo/collections/audio/models/audio_to_audio.py
+++ b/nemo/collections/audio/models/audio_to_audio.py
@@ -332,6 +332,7 @@ def process(
         batch_size: int = 1,
         num_workers: Optional[int] = None,
         input_channel_selector: Optional[ChannelSelectorType] = None,
+        input_dir: Optional[str] = None,
     ) -> List[str]:
         """
         Takes paths to audio files and returns a list of paths to processed
@@ -344,6 +345,7 @@ def process(
             num_workers: Number of workers for the dataloader
             input_channel_selector (int | Iterable[int] | str): select a single channel or a subset of channels from multi-channel audio. If set to `'average'`, it performs averaging across channels. Disabled if set to `None`. Defaults to `None`.
+            input_dir: Optional, directory that contains the input files. If provided, the output directory will mirror the input directory structure.
 
         Returns:
             Paths to processed audio signals.
 
@@ -413,9 +415,17 @@ def process(
 
                 for example_idx in range(processed_batch.size(0)):
                     # This assumes the data loader is not shuffling files
-                    file_name = os.path.basename(paths2audio_files[file_idx])
+                    if input_dir is not None:
+                        # Make sure the output has the same directory structure as the input
+                        filepath_relative = os.path.relpath(paths2audio_files[file_idx], start=input_dir)
+                    else:
+                        # Input dir is not provided, save files in the output directory
+                        filepath_relative = os.path.basename(paths2audio_files[file_idx])
                     # Prepare output file
-                    output_file = os.path.join(output_dir, f'processed_{file_name}')
+                    output_file = os.path.join(output_dir, filepath_relative)
+                    # Create output dir if necessary
+                    if not os.path.isdir(os.path.dirname(output_file)):
+                        os.makedirs(os.path.dirname(output_file))
                     # Crop the output signal to the actual length
                     output_signal = processed_batch[example_idx, :, : input_length[example_idx]].cpu().numpy()
                     # Write audio
diff --git a/nemo/collections/llm/gpt/data/packed_sequence.py b/nemo/collections/llm/gpt/data/packed_sequence.py
index 345489ea0b63..1f43bce99e62 100644
--- a/nemo/collections/llm/gpt/data/packed_sequence.py
+++ b/nemo/collections/llm/gpt/data/packed_sequence.py
@@ -78,7 +78,7 @@ def prepare_packed_sequence_data(
     sequences, histogram = create_hist(dataset, max_seq_length)
 
     assignments = create_packing_strategy(histogram, packed_sequence_size, packing_algorithm)
-    output_data = fill_packing_strategy(assignments, sequences, packed_sequence_size)
+    output_data = fill_packing_strategy(assignments, sequences, packed_sequence_size, tokenizer.eos_id)
 
     # save output data
     np.save(output_path, output_data)
diff --git a/nemo/deploy/nlp/__init__.py b/nemo/deploy/nlp/__init__.py
index 5ebbe6816664..633544e300ed 100644
--- a/nemo/deploy/nlp/__init__.py
+++ b/nemo/deploy/nlp/__init__.py
@@ -12,15 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-
-use_query_llm = True
-try:
-    from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch
-except Exception:
-    use_query_llm = False
-
-use_megatron_llm = True
-try:
-    from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable
-except Exception:
-    use_megatron_llm = False
+from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable
+from nemo.deploy.nlp.query_llm import NemoQueryLLM, NemoQueryLLMPyTorch
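The path handling added to `process()` reduces to `relpath` + `join`; a standalone sketch with hypothetical paths (the diff guards `makedirs` with an `isdir` check, `exist_ok=True` is the equivalent shorthand here):

```python
import os

# With input_dir set, outputs mirror the input tree; without it, every file
# lands flat in output_dir under its basename.
input_dir = "/data/corpus"
output_dir = "/results"
path = "/data/corpus/speaker1/utt1.wav"

filepath_relative = os.path.relpath(path, start=input_dir)  # 'speaker1/utt1.wav'
output_file = os.path.join(output_dir, filepath_relative)   # '/results/speaker1/utt1.wav'

# Create the mirrored subdirectory before writing the processed audio
os.makedirs(os.path.dirname(output_file), exist_ok=True)
```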
diff --git a/nemo/utils/sequence_packing_utils.py b/nemo/utils/sequence_packing_utils.py
index 2ca03ce44b67..647a0401bd5b 100644
--- a/nemo/utils/sequence_packing_utils.py
+++ b/nemo/utils/sequence_packing_utils.py
@@ -176,7 +176,7 @@ def create_packing_strategy(
 
 
 def fill_packing_strategy(
-    assignments: List[List[int]], sequences: Dict[int, List[Dict]], pack_size: int
+    assignments: List[List[int]], sequences: Dict[int, List[Dict]], pack_size: int, pad_id: int
 ) -> List[Dict]:
     """
     Fills the packing strategy with actual sequence data based on assignments and sequence information.
@@ -192,6 +192,7 @@ def fill_packing_strategy(
         sequences: A dictionary where keys are sequence lengths and values are lists of corresponding sequences
           from the dataset (output of 'create_hist').
         pack_size: The maximum capacity of each bin.
+        pad_id: The tokenizer's padding token.
 
     Returns:
         output_data: A list of dictionaries, where each dictionary represents a packed sequence with its input IDs,
@@ -205,7 +206,13 @@ def fill_packing_strategy(
             input_ids = np.array([x['input_ids'] for x in per_seq_data])[perm].tolist()
             try:
                 loss_mask = np.array(
-                    [[idx >= x['answer_start_idx'] for idx in range(len(x['input_ids']))] for x in per_seq_data]
+                    [
+                        [
+                            idx >= x['answer_start_idx'] and x['input_ids'][idx] != pad_id
+                            for idx in range(len(x['input_ids']))
+                        ]
+                        for x in per_seq_data
+                    ]
                 )[perm].tolist()
             except KeyError:
                 loss_mask = None
diff --git a/scripts/dataset_processing/add_noise.py b/scripts/dataset_processing/add_noise.py
index 2aefc8b03974..e273d0de0af2 100644
--- a/scripts/dataset_processing/add_noise.py
+++ b/scripts/dataset_processing/add_noise.py
@@ -131,12 +131,15 @@ def add_noise(infile, snrs, noise_manifest, out_dir, num_workers=1):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument(
-        "--input_manifest", type=str, required=True, help="clean test set",
+        "--input_manifest",
+        type=str,
+        required=True,
+        help="clean test set",
     )
     parser.add_argument("--noise_manifest", type=str, required=True, help="path to noise manifest file")
     parser.add_argument("--out_dir", type=str, required=True, help="destination directory for audio and manifests")
     parser.add_argument("--snrs", type=int, nargs="+", default=[0, 10, 20, 30])
-    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--seed", type=int, default=None)
     parser.add_argument("--num_workers", default=1, type=int)
     parser.add_argument("--sample_rate", default=16000, type=int)
     parser.add_argument(
diff --git a/scripts/deploy/nlp/query_inframework.py b/scripts/deploy/nlp/query_inframework.py
index e77ab72a1f04..a62e09fa071d 100644
--- a/scripts/deploy/nlp/query_inframework.py
+++ b/scripts/deploy/nlp/query_inframework.py
@@ -15,7 +15,7 @@
 import argparse
 import sys
 
-from nemo.deploy.nlp.query_llm import NemoQueryLLMPyTorch
+from nemo.deploy.nlp import NemoQueryLLMPyTorch
 
 
 def get_args(argv):
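The effect of threading `pad_id` into the loss mask is easiest to see on a toy sequence (values are made up; in this PR the EOS id doubles as the pad id):

```python
# Toy packed-SFT sequence: answer_start_idx marks where loss computation
# begins, pad_id stands in for the tokenizer's EOS-as-padding token.
pad_id = 0
seq = {'input_ids': [11, 12, 13, 14, pad_id, pad_id], 'answer_start_idx': 2}

# Old behaviour: every position from answer_start_idx onward is trained on,
# including the padding appended during pre-padding.
old_mask = [idx >= seq['answer_start_idx'] for idx in range(len(seq['input_ids']))]

# New behaviour: padding positions are excluded from the loss.
new_mask = [
    idx >= seq['answer_start_idx'] and seq['input_ids'][idx] != pad_id
    for idx in range(len(seq['input_ids']))
]

print(old_mask)  # [False, False, True, True, True, True]
print(new_mask)  # [False, False, True, True, False, False]
```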
diff --git a/scripts/llm/llama3_pretraining.py b/scripts/llm/pretraining.py
similarity index 80%
rename from scripts/llm/llama3_pretraining.py
rename to scripts/llm/pretraining.py
index a5ca3a496a88..1b4a33832a56 100644
--- a/scripts/llm/llama3_pretraining.py
+++ b/scripts/llm/pretraining.py
@@ -13,8 +13,8 @@
 # limitations under the License.
 
 # NOTE: This script is only an example of using NeMo with NeMo-Run's APIs and is subject to change without notice.
-# This script is used for pretraining a Llama3 model, specifically for the 8b or 70b model variants, on local and slurm executors.
-# It uses NeMo 2.0 recipes (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/llm/recipes/llama3_8b.py#L74) and NeMo-Run (https://github.com/NVIDIA/NeMo-Run) to configure and execute the runs.
+# This script is used for pretraining on local and slurm executors.
+# It uses NeMo 2.0 recipes (https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/llm/recipes/) and NeMo-Run (https://github.com/NVIDIA/NeMo-Run) to configure and execute the runs.
 
 import argparse
 from functools import partial
@@ -26,12 +26,12 @@
 
 
 def get_parser():
-    parser = argparse.ArgumentParser(description="Llama3 Pretraining")
+    parser = argparse.ArgumentParser(description="NeMo2.0 Pretraining")
     parser.add_argument(
-        "--size",
+        "--recipe",
         type=str,
-        default="8b",
-        help="Choose llama3 model size 70b/8b",
+        default="llama3_8b",
+        help="Choose NeMo 2.0 recipe. Recipes are named in the format of <model_name>_<model_size>(_<long_sequence_length> or other special settings)",
     )
     parser.add_argument(
         "--tag",
@@ -66,7 +66,7 @@ def slurm_executor(
     time: str = "01:00:00",
     custom_mounts: Optional[list[str]] = None,
     custom_env_vars: Optional[dict[str, str]] = None,
-    container_image: str = "nvcr.io/nvidia/nemo:dev",
+    container_image: str = "nvcr.io/nvidia/nemo:24.09",
     retries: int = 0,
 ) -> run.SlurmExecutor:
     if not (user and host and remote_job_dir and account and partition and nodes and devices):
@@ -102,7 +102,7 @@ def slurm_executor(
         mem="0",
         exclusive=True,
         gres="gpu:8",
-        packager=run.GitArchivePackager(subpath="examples/llm/run"),
+        packager=run.GitArchivePackager(),
     )
 
     executor.container_image = container_image
@@ -134,28 +134,14 @@ def main():
     if args.tag and not args.tag.startswith("-"):
         args.tag = "-" + args.tag
 
-    MODEL_SIZE_MAPPING: dict[str, dict[str, Any]] = {
-        "8b": {
-            "exp_name": "llama3-8b",
-            "nemo": {
-                "pretrain": partial(llm.llama3_8b.pretrain_recipe, num_nodes=1, num_gpus_per_node=8),
-            },
-        },
-        "70b": {
-            "exp_name": "llama3-70b",
-            "nemo": {
-                "pretrain": partial(llm.llama3_70b.pretrain_recipe, num_nodes=128, num_gpus_per_node=8),
-            },
-        },
-    }
-
-    exp_name = MODEL_SIZE_MAPPING[args.size]["exp_name"]
+    exp_name = args.recipe
 
     # Uses configs from NeMo directly
-    pretrain = MODEL_SIZE_MAPPING[args.size]["nemo"]["pretrain"](
-        name=exp_name,
-        ckpt_dir="/nemo_run/checkpoints",
-    )
+    assert hasattr(
+        llm, args.recipe
+    ), f"Recipe named {args.recipe} not found. General format is <model_name>_<model_size>(_<long_sequence_length> or other special settings)"
+    pretrain_recipe = getattr(llm, args.recipe).pretrain_recipe
+    pretrain = partial(pretrain_recipe)(name=exp_name, dir="/nemo_run/checkpoints")
 
     # Overwrite the dataloader in the recipe to use your custom dataloader.
     # dataloader = set_your_custom_dataloader
diff --git a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
index 19a3e6a78228..9b0a2cb60131 100644
--- a/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
+++ b/scripts/nlp_language_modeling/prepare_packed_ft_dataset.py
@@ -131,6 +131,7 @@ def tokenize_dataset(cfg: 'DictConfig'):
 
     max_seq_length = dataset.max_seq_length
     pad_id = dataset.tokenizer.eos_id
+    tokenizer = dataset.tokenizer
     pad_seq_length_to_mult = dataset.pad_seq_length_to_mult
     dataset = np.array([dataset[i] for i in range(len(dataset))])
     if cp_size > 1:
@@ -162,7 +163,7 @@ def pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id):
     for data in dataset:
         max_length_to_pad = min(max_seq_length, ceil_to_nearest(len(data['input_ids']), pad_seq_length_to_mult))
         pre_pad_dataset(data, max_seq_length, max_length_to_pad, pad_id)
-    return dataset
+    return dataset, tokenizer
 
 
 @dataclass
@@ -187,11 +188,11 @@ def from_config(self, cfg: 'DictConfig'):
 
 def main(cfg: 'DictConfig') -> None:
     args = PackingArgs().from_config(cfg)
-    dataset = tokenize_dataset(cfg)
+    dataset, tokenizer = tokenize_dataset(cfg)
     sequences, histogram = create_hist(dataset, cfg.model.data.train_ds.max_seq_length)
     for pack_size in args.pack_sizes:
         assignments = create_packing_strategy(histogram, pack_size, args.packing_algorithm)
-        output_data = fill_packing_strategy(assignments, sequences, pack_size)
+        output_data = fill_packing_strategy(assignments, sequences, pack_size, tokenizer.eos_id)
 
         # save output data
         os.makedirs(args.output_dir, exist_ok=True)
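The `--recipe` lookup in `pretraining.py` above boils down to a `getattr` on `nemo.collections.llm`; a minimal sketch (the recipe name is only an example, and the `name`/`dir` kwargs are the ones used in the diff):

```python
from nemo.collections import llm

# Any recipe module exposed on nemo.collections.llm with a pretrain_recipe
# factory works here, e.g. llama3_8b, llama3_70b, ...
recipe_name = "llama3_70b"
assert hasattr(llm, recipe_name), f"Recipe named {recipe_name} not found."

pretrain_recipe = getattr(llm, recipe_name).pretrain_recipe
pretrain = pretrain_recipe(name=recipe_name, dir="/nemo_run/checkpoints")
```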
diff --git a/tests/deploy/nemo_deploy.py b/tests/deploy/nemo_deploy.py
index 23db7c4f01f3..45f2bae3425e 100644
--- a/tests/deploy/nemo_deploy.py
+++ b/tests/deploy/nemo_deploy.py
@@ -21,7 +21,7 @@
 
 import torch
 
-from nemo.deploy.nlp.megatronllm_deployable import MegatronLLMDeployable
+from nemo.deploy.nlp import MegatronLLMDeployable
 from tests.infer_data_path import get_infer_test_data
 
 run_export_tests = True
diff --git a/tests/export/nemo_export.py b/tests/export/nemo_export.py
index df6a68828d41..cb2b3619e4d3 100644
--- a/tests/export/nemo_export.py
+++ b/tests/export/nemo_export.py
@@ -43,7 +43,8 @@
     from nemo.deploy.nlp import MegatronLLMDeployable, NemoQueryLLMPyTorch
 except Exception as e:
     LOGGER.warning(
-        f"Cannot import MegatronLLMDeployable, in-framework inference will not be available. {type(e).__name__}: {e}"
+        "Cannot import MegatronLLMDeployable or NemoQueryLLMPyTorch,"
+        f" in-framework inference will not be available. {type(e).__name__}: {e}"
     )
     in_framework_supported = False
 
@@ -104,12 +105,7 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path):
             all_expected_outputs.append(expected_output)
 
     if model is not None:
-        in_framework_model = False
-        if in_framework_supported:
-            if isinstance(model, MegatronLLMDeployable):
-                in_framework_model = True
-
-        if in_framework_model:
+        if in_framework_supported and isinstance(model, MegatronLLMDeployable):
             model_output = model.generate(
                 inputs=[prompt],
                 length_params={"min_length": 1, "max_length": 1},
@@ -153,7 +149,7 @@ def get_accuracy_with_lambada(model, nq, task_ids, lora_uids, test_data_path):
                     correct_answers_relaxed += 1
 
         if nq is not None:
-            if isinstance(nq, NemoQueryLLMPyTorch):
+            if in_framework_supported and isinstance(nq, NemoQueryLLMPyTorch):
                 deployed_output = nq.query_llm(
                     prompts=[prompt],
                     max_length=1,
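The added `in_framework_supported and ...` guards matter because a failed import leaves the deployable classes undefined; the short-circuit keeps `isinstance` from raising `NameError`. A minimal standalone version of the pattern the test relies on:

```python
import logging

LOGGER = logging.getLogger(__name__)

in_framework_supported = True
try:
    from nemo.deploy.nlp import MegatronLLMDeployable, NemoQueryLLMPyTorch
except Exception as e:
    LOGGER.warning(
        "Cannot import MegatronLLMDeployable or NemoQueryLLMPyTorch,"
        f" in-framework inference will not be available. {type(e).__name__}: {e}"
    )
    in_framework_supported = False


def is_in_framework(model) -> bool:
    # If the import failed, MegatronLLMDeployable is an undefined name;
    # the flag check short-circuits before isinstance() can evaluate it.
    return in_framework_supported and isinstance(model, MegatronLLMDeployable)
```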