Merge branch 'main' into update_tn_langs

NVIDIA · Nov 26, 2024 · b132e06 · b132e06
2 parents e4cb7af + 5d97b70
commit b132e06
Show file tree

Hide file tree

Showing 24 changed files with 310 additions and 60 deletions.
diff --git a/.github/workflows/_test_template.yml b/.github/workflows/_test_template.yml
@@ -60,7 +60,16 @@ jobs:
               ARG=("--runtime=nvidia --gpus all")
             fi
 
-            docker run --rm -d --name nemo_container_${{ github.run_id }} ${ARG[@]} --shm-size=64g --env TRANSFORMERS_OFFLINE=0 --env HYDRA_FULL_ERROR=1 --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
+            docker run \
+              --rm \
+              -d \
+              --name nemo_container_${{ github.run_id }} ${ARG[@]} \
+              --shm-size=64g \
+              --env TRANSFORMERS_OFFLINE=0 \
+              --env HYDRA_FULL_ERROR=1 \
+              --env HF_HOME=/home/TestData/HF_HOME \
+              --volume /mnt/datadrive/TestData:/home/TestData nemoci.azurecr.io/nemo_container:${{ github.run_id }} \
+              bash -c "sleep $(( ${{ inputs.TIMEOUT }} * 60 + 60 ))"
 
         - id: main
           name: Run main script
@@ -95,4 +104,4 @@ jobs:
           if: always()
           run: |
             docker container stop nemo_container_${{ github.run_id }} || true
-            docker container rm nemo_container_${{ github.run_id }} || true
+            docker container rm nemo_container_${{ github.run_id }} || true
diff --git a/examples/llm/peft/hf.py b/examples/llm/peft/hf.py
@@ -76,11 +76,11 @@ def formatting_prompts_func(examples):
         # See: https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81
         grad_clip = None
     use_dist_samp = False
-    tokenizer = llm.HfAutoModelForCausalLM.configure_tokenizer(args.model)
+    tokenizer = llm.HFAutoModelForCausalLM.configure_tokenizer(args.model)
 
     llm.api.finetune(
-        model=llm.HfAutoModelForCausalLM(args.model),
-        data=llm.HfDatasetDataModule(
+        model=llm.HFAutoModelForCausalLM(args.model),
+        data=llm.HFDatasetDataModule(
             mk_hf_dataset(tokenizer.tokenizer), pad_token_id=tokenizer.tokenizer.eos_token_id
         ),
         trainer=nl.Trainer(

diff --git a/examples/llm/sft/hf.py b/examples/llm/sft/hf.py
@@ -84,7 +84,7 @@ def squad(tokenizer) -> pl.LightningDataModule:
 
     from nemo.lightning.pytorch.accelerate.transformer_engine import te_accelerate
 
-    model = llm.HfAutoModelForCausalLM(model_name=args.model, model_accelerator=model_accelerator)
+    model = llm.HFAutoModelForCausalLM(model_name=args.model, model_accelerator=model_accelerator)
     tokenizer = model.tokenizer
 
     llm.api.finetune(

diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py
@@ -22,7 +22,7 @@
     AlpacaDataModule,
     DollyDataModule,
     FineTuningDataModule,
-    HfDatasetDataModule,
+    HFDatasetDataModule,
     MockDataModule,
     PreTrainingDataModule,
     SquadDataModule,
@@ -64,7 +64,7 @@
     GPTConfig126M,
     GPTConfig175B,
     GPTModel,
-    HfAutoModelForCausalLM,
+    HFAutoModelForCausalLM,
     Llama2Config7B,
     Llama2Config13B,
     Llama2Config70B,
@@ -218,7 +218,7 @@
     "dolly",
     "peft",
     "hf_dataset",
-    "HfAutoModelForCausalLM",
+    "HFAutoModelForCausalLM",
 ]
 
 

diff --git a/nemo/collections/llm/gpt/data/__init__.py b/nemo/collections/llm/gpt/data/__init__.py
@@ -15,7 +15,7 @@
 from nemo.collections.llm.gpt.data.alpaca import AlpacaDataModule
 from nemo.collections.llm.gpt.data.dolly import DollyDataModule
 from nemo.collections.llm.gpt.data.fine_tuning import FineTuningDataModule
-from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
+from nemo.collections.llm.gpt.data.hf_dataset import HFDatasetDataModule
 from nemo.collections.llm.gpt.data.mock import MockDataModule
 from nemo.collections.llm.gpt.data.pre_training import PreTrainingDataModule, build_pretraining_datamodule
 from nemo.collections.llm.gpt.data.squad import SquadDataModule
@@ -28,5 +28,5 @@
     "MockDataModule",
     "PreTrainingDataModule",
     "build_pretraining_datamodule",
-    "HfDatasetDataModule",
+    "HFDatasetDataModule",
 ]
diff --git a/nemo/collections/llm/gpt/data/api.py b/nemo/collections/llm/gpt/data/api.py
@@ -16,7 +16,7 @@
 import nemo_run as run
 
 from nemo.collections.llm.gpt.data.dolly import DollyDataModule
-from nemo.collections.llm.gpt.data.hf_dataset import HfDatasetDataModule
+from nemo.collections.llm.gpt.data.hf_dataset import HFDatasetDataModule
 from nemo.collections.llm.gpt.data.mock import MockDataModule
 from nemo.collections.llm.gpt.data.squad import SquadDataModule
 
@@ -42,7 +42,7 @@ def dolly() -> pl.LightningDataModule:
 @run.cli.factory
 @run.autoconvert
 def hf_dataset(dataset: str) -> pl.LightningDataModule:
-    return HfDatasetDataModule(dataset=dataset, global_batch_size=16, micro_batch_size=2)
+    return HFDatasetDataModule(dataset=dataset, global_batch_size=16, micro_batch_size=2)
 
 
 __all__ = ["mock", "squad", "dolly", "hf_dataset"]
diff --git a/nemo/collections/llm/gpt/data/hf_dataset.py b/nemo/collections/llm/gpt/data/hf_dataset.py
@@ -18,7 +18,7 @@
 from nemo.lightning.pytorch.plugins import MegatronDataSampler
 
 
-class HfDatasetDataModule(pl.LightningDataModule):
+class HFDatasetDataModule(pl.LightningDataModule):
     def __init__(
         self,
         dataset,
@@ -88,7 +88,7 @@ def train_dataloader(self, collate_fn=None):
         from nemo.lightning.data import add_megatron_sampler
 
         if collate_fn is None:
-            collate_fn = lambda x: HfDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id)
+            collate_fn = lambda x: HFDatasetDataModule.collate_fn(x, pad_token_id=self.pad_token_id)
 
         return DataLoader(
             self.dataset,

diff --git a/nemo/collections/llm/gpt/model/__init__.py b/nemo/collections/llm/gpt/model/__init__.py
@@ -45,7 +45,7 @@
     Gemma2Config27B,
     Gemma2Model,
 )
-from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM
+from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HFAutoModelForCausalLM
 from nemo.collections.llm.gpt.model.llama import (
     CodeLlamaConfig7B,
     CodeLlamaConfig13B,
@@ -191,5 +191,5 @@
     "transformer_engine_layer_spec",
     "transformer_engine_full_layer_spec",
     "local_layer_spec",
-    "HfAutoModelForCausalLM",
+    "HFAutoModelForCausalLM",
 ]
diff --git a/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/gpt/model/hf_auto_model_for_causal_lm.py
@@ -31,7 +31,7 @@ def masked_cross_entropy(logits, targets, mask=None):
         return F.cross_entropy(logits, targets)
 
 
-class HfAutoModelForCausalLM(pl.LightningModule, io.IOMixin, fn.FNMixin):
+class HFAutoModelForCausalLM(pl.LightningModule, io.IOMixin, fn.FNMixin):
     def __init__(
         self,
         model_name='gpt2',
@@ -57,7 +57,7 @@ def __init__(
     @property
     def tokenizer(self):
         if self._tokenizer is None:
-            self._tokenizer = HfAutoModelForCausalLM.configure_tokenizer(self.model_name, self.trust_remote_code)
+            self._tokenizer = HFAutoModelForCausalLM.configure_tokenizer(self.model_name, self.trust_remote_code)
         return self._tokenizer
 
     @tokenizer.setter

diff --git a/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py b/nemo/collections/llm/recipes/hf_auto_model_for_causal_lm.py
@@ -23,7 +23,7 @@
 from nemo import lightning as nl
 from nemo.collections.llm.api import finetune, pretrain
 from nemo.collections.llm.gpt.data.mock import MockDataModule
-from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HfAutoModelForCausalLM
+from nemo.collections.llm.gpt.model.hf_auto_model_for_causal_lm import HFAutoModelForCausalLM
 from nemo.collections.llm.peft.lora import LoRA
 from nemo.collections.llm.recipes.log.default import default_log, default_resume, tensorboard_logger
 from nemo.collections.llm.recipes.optim.adam import pytorch_adam_with_cosine_annealing
@@ -35,23 +35,23 @@
 @run.cli.factory(name=NAME)
 def model(model_name, load_pretrained_weights) -> run.Config[pl.LightningModule]:
     """
-    Factory function to create HfAutoModelForCausalLM model configurations.
+    Factory function to create HFAutoModelForCausalLM model configurations.
 
     Args:
         model_name (str): Model id on HF.
 
     Returns:
-        run.Config[pl.LightningModule]: Configuration for the HfAutoModelForCausalLM.
+        run.Config[pl.LightningModule]: Configuration for the HFAutoModelForCausalLM.
 
     Examples:
         CLI usage:
-            $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
+            $ nemo llm pretrain --factory 'HFAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
 
         Python API usage:
             >>> model_config = model(model_name="mistralai/Mistral-Nemo-Instruct-2407")
             >>> print(model_config)
     """
-    return run.Config(HfAutoModelForCausalLM, model_name=model_name, load_pretrained_weights=load_pretrained_weights)
+    return run.Config(HFAutoModelForCausalLM, model_name=model_name, load_pretrained_weights=load_pretrained_weights)
 
 
 def trainer(
@@ -69,7 +69,7 @@ def trainer(
     gradient_clip_val: float = 1.0,
 ) -> run.Config[nl.Trainer]:
     """
-    Configure the NeMo Lightning Trainer for HfAutoModelForCausalLM.
+    Configure the NeMo Lightning Trainer for HFAutoModelForCausalLM.
 
     This function sets up the distributed training strategy and other training parameters.
 
@@ -91,7 +91,7 @@ def trainer(
 
     Examples:
         CLI usage:
-            $ nemo llm pretrain trainer=HfAutoModelForCausalLM ...
+            $ nemo llm pretrain trainer=HFAutoModelForCausalLM ...
 
         Python API usage:
             >>> trainer_config = trainer(num_nodes=2, num_gpus_per_node=8)
@@ -131,7 +131,7 @@ def pretrain_recipe(
     model_name: str = '',
 ) -> run.Partial:
     """
-    Create a pre-training recipe for a HfAutoModelForCausalLM model.
+    Create a pre-training recipe for a HFAutoModelForCausalLM model.
 
     This function sets up a complete configuration for pre-training, including
     model, trainer, data, logging, optimization, and resumption settings.
@@ -148,7 +148,7 @@ def pretrain_recipe(
 
     Examples:
         CLI usage:
-            $ nemo llm pretrain --factory 'HfAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
+            $ nemo llm pretrain --factory 'HFAutoModelForCausalLM(model_name="mistralai/Mistral-Nemo-Instruct-2407")'
 
         Python API usage:
             >>> recipe = pretrain_recipe(name="auto_pretrain", num_nodes=2, model_name="mistralai/Mistral-Nemo-Instruct-2407")
@@ -179,7 +179,7 @@ def finetune_recipe(
     model_name: str = '',
 ) -> run.Partial:
     """
-    Create a fine-tuning recipe for a HfAutoModelForCausalLM model.
+    Create a fine-tuning recipe for a HFAutoModelForCausalLM model.
 
     This function sets up a complete configuration for fine-tuning, including
     model, trainer, data, logging, optimization, and resumption settings.

diff --git a/nemo/export/tensorrt_llm.py b/nemo/export/tensorrt_llm.py
@@ -480,10 +480,13 @@ def export(
 
             tokenizer_path = os.path.join(nemo_export_dir, "tokenizer.model")
             tokenizer_path_nemo2 = os.path.join(nemo_export_dir, "nemo_context")
+            vocab_path = os.path.join(nemo_export_dir, "vocab.json")
             if os.path.exists(tokenizer_path):
                 shutil.copy(tokenizer_path, self.model_dir)
             elif os.path.exists(tokenizer_path_nemo2):
                 shutil.copytree(tokenizer_path_nemo2, Path(self.model_dir) / "nemo_context")
+            elif os.path.exists(vocab_path):
+                shutil.copy(vocab_path, os.path.join(self.model_dir, "vocab.json"))
             else:
                 self.tokenizer.save_pretrained(os.path.join(self.model_dir, 'huggingface_tokenizer'))
 

diff --git a/nemo/export/tiktoken_tokenizer.py b/nemo/export/tiktoken_tokenizer.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import base64
+import json
+from pathlib import Path
+from typing import Dict, Optional
+
+import numpy as np
+import tiktoken
+import torch
+
+# Regex split pattern; TiktokenTokenizer passes it to ``tiktoken.Encoding`` as ``pat_str``.
+PATTERN_TIKTOKEN = "[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]*[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]+|[^\\r\\n\\p{L}\\p{N}]?[\\p{Lu}\\p{Lt}\\p{Lm}\\p{Lo}\\p{M}]+[\\p{Ll}\\p{Lm}\\p{Lo}\\p{M}]*|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n/]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+# Total vocabulary budget; TiktokenTokenizer subtracts its special-token count from this value.
+DEFAULT_TIKTOKEN_MAX_VOCAB = 2**17  # 131072
+# TiktokenTokenizer uses the list position of "<s>" / "</s>" as the BOS / EOS token ID.
+SPECIAL_TOKENS = ["<unk>", "<s>", "</s>"]
+# NOTE(review): not referenced in the visible part of this file; presumably the name
+# template for the remaining reserved special IDs. Confirm against callers.
+SPECIAL_TOKEN_TEMPLATE = "<SPECIAL_{id}>"
+
+
+def reload_mergeable_ranks(
+    path: str,
+    max_vocab: Optional[int] = None,
+) -> Dict[bytes, int]:
+    """
+    Load a JSON vocabulary file and return it as a Tiktoken ``mergeable_ranks`` mapping.
+
+    The file must hold a JSON list of objects with exactly the keys ``rank``,
+    ``token_bytes`` (base64-encoded) and ``token_str``, stored in rank order.
+
+    Args:
+        path: Path of the vocabulary file; must end with ``.json``.
+        max_vocab: When given, only the first ``max_vocab`` entries are used.
+
+    Returns:
+        Mapping from decoded token bytes to rank.
+
+    Raises:
+        AssertionError: If the path suffix, the JSON layout or the rank
+            sequence is not as expected. These checks are ``assert``
+            statements, so they are skipped when Python runs with ``-O``.
+    """
+    assert path.endswith(".json")
+
+    # Read the raw vocabulary entries.
+    with open(path, "r", encoding='utf-8') as vocab_file:
+        entries = json.load(vocab_file)
+    assert isinstance(entries, list)
+    print(f"Vocab size: {len(entries)}")
+
+    if max_vocab is not None:
+        entries = entries[:max_vocab]
+        print(f"Cutting vocab to first {len(entries)} tokens.")
+
+    # Convert every entry into a bytes -> rank pair, validating as we go.
+    expected_keys = {"rank", "token_bytes", "token_str"}
+    mergeable_ranks: Dict[bytes, int] = {}
+    for index, entry in enumerate(entries):
+        assert entry.keys() == expected_keys
+        rank = entry["rank"]
+        assert rank == index
+        token = base64.b64decode(entry["token_bytes"])
+        if index < 256:
+            # Entries below rank 256 must be the single byte equal to their rank.
+            assert token == bytes([index])
+        mergeable_ranks[token] = rank
+
+    # Every token must be unique and the ranks must form 0..N-1.
+    num_ranks = len(mergeable_ranks)
+    assert num_ranks == len(entries)
+    assert set(mergeable_ranks.values()) == set(range(num_ranks))
+
+    return mergeable_ranks
+
+
+class TiktokenTokenizer:
+    """Tiktoken-backed tokenizer that reserves the lowest token IDs for special tokens.
+
+    IDs produced by the underlying ``tiktoken.Encoding`` are shifted up by
+    ``num_special_tokens`` when encoding and shifted back down when decoding.
+    IDs below that offset are treated as special tokens and dropped by ``decode``.
+    """
+
+    def __init__(self, vocab_file: str):
+        """Build the encoding from a vocabulary JSON file.
+
+        Args:
+            vocab_file: Path to a ``.json`` vocabulary that
+                ``reload_mergeable_ranks`` can read. The name of its parent
+                directory becomes the encoding name.
+        """
+        self.num_special_tokens = 1000
+        # The special-token range is carved out of the total vocabulary budget.
+        inner_vocab_size = DEFAULT_TIKTOKEN_MAX_VOCAB - self.num_special_tokens
+
+        token2id = reload_mergeable_ranks(vocab_file, max_vocab=inner_vocab_size)
+        self.tokenizer = tiktoken.Encoding(
+            name=Path(vocab_file).parent.name,
+            pat_str=PATTERN_TIKTOKEN,
+            mergeable_ranks=token2id,
+            special_tokens={},  # special tokens are handled manually
+        )
+
+        # BOS / EOS IDs are the positions inside SPECIAL_TOKENS; padding reuses EOS (see pad_id).
+        self._bos_id = SPECIAL_TOKENS.index("<s>")
+        self._eos_id = SPECIAL_TOKENS.index("</s>")
+
+    def encode(self, text):
+        """Tokenize ``text`` and return its IDs shifted past the special-token range."""
+        offset = self.num_special_tokens
+        return [token + offset for token in self.tokenizer.encode(text)]
+
+    def decode(self, tokens):
+        """Turn token IDs back into text, ignoring special tokens.
+
+        Args:
+            tokens: Iterable of token IDs as produced by ``encode``.
+
+        Returns:
+            The decoded string, or ``""`` when no regular token remains.
+        """
+        offset = self.num_special_tokens
+        # Built once instead of once per token.
+        skipped = {self._bos_id, self._eos_id}
+        adjusted_tokens = [t - offset for t in tokens if t not in skipped and t >= offset]
+
+        # Nothing to hand to tiktoken if every token was filtered out.
+        if not adjusted_tokens:
+            return ""
+        return self.tokenizer.decode(adjusted_tokens)
+
+    def batch_decode(self, ids):
+        """Decode a sequence of IDs given as a list, NumPy array or torch tensor.
+
+        For nested (batched) input only the first sequence is decoded.
+
+        Args:
+            ids: Flat or nested token IDs.
+
+        Returns:
+            The decoded string; ``""`` for an empty input.
+        """
+        if isinstance(ids, np.ndarray) or torch.is_tensor(ids):
+            ids = ids.tolist()
+
+        # Guard the peek at ids[0] so an empty batch decodes to "" instead of raising IndexError.
+        if ids and isinstance(ids[0], list):
+            ids = ids[0]
+
+        return self.decode(ids)
+
+    @property
+    def pad_id(self):
+        """Padding token ID; there is no dedicated pad token, so EOS is reused."""
+        return self._eos_id
+
+    @property
+    def bos_token_id(self):
+        """ID of the ``<s>`` token."""
+        return self._bos_id
+
+    @property
+    def eos_token_id(self):
+        """ID of the ``</s>`` token."""
+        return self._eos_id