 
 import torch
 import torch_xla.debug.profiler as xp
+from omegaconf import DictConfig
 from torch import nn
 from torch.nn import CrossEntropyLoss
 from transformers.activations import ACT2FN
-from transformers.modeling_utils import PreTrainedModel
-from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import CausalLMOutputWithPast
 from transformers.utils import logging
 
@@ -52,12 +51,7 @@ def forward(self, hidden_states):
 
 class LlamaRotaryEmbedding(nn.Module):
     def __init__(
-        self,
-        dim,
-        max_position_embeddings=2048,
-        base=10000,
-        device=None,
-        scaling_factor=1.0,
+        self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0
     ):
         super().__init__()
         self.scaling_factor = scaling_factor
@@ -161,7 +155,7 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class LlamaAttention(nn.Module):
     """Multi-headed attention from 'Attention Is All You Need' paper"""
 
-    def __init__(self, config: LlamaConfig, layer_idx: int | None = None):
+    def __init__(self, config: DictConfig, layer_idx: int | None = None):
         super().__init__()
         self.config = config
         self.layer_idx = layer_idx
@@ -290,7 +284,7 @@ def forward(
 
 
 class LlamaDecoderLayer(nn.Module):
-    def __init__(self, config: LlamaConfig, layer_idx: int):
+    def __init__(self, config: DictConfig, layer_idx: int):
         super().__init__()
         self.hidden_size = config.hidden_size
 
@@ -338,35 +332,19 @@ def forward(
         return hidden_states
 
 
-class LlamaPreTrainedModel(PreTrainedModel):
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, nn.Linear):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-class LlamaModel(LlamaPreTrainedModel):
+class LlamaModel(nn.Module):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`LlamaDecoderLayer`]
 
     Args:
-        config: LlamaConfig
+        config: DictConfig
     """
 
-    def __init__(self, config: LlamaConfig):
-        super().__init__(config)
-        self.padding_idx = config.pad_token_id
+    def __init__(self, config: DictConfig):
+        super().__init__()
         self.vocab_size = config.vocab_size
 
-        self.embed_tokens = nn.Embedding(
-            config.vocab_size, config.hidden_size, self.padding_idx
-        )
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size)
         self.layers = nn.ModuleList(
             [
                 LlamaDecoderLayer(config, layer_idx)
@@ -375,9 +353,6 @@ def __init__(self, config: LlamaConfig):
         )
         self.norm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-        # Initialize weights and apply final processing
-        self.post_init()
-
     @xp.trace_me("LlamaModel")
     def forward(
         self,
@@ -393,11 +368,7 @@ def forward(
         # Create a causal mask without calling the current method
         seq_length = inputs_embeds.size(1)
         causal_mask = torch.triu(
-            torch.full(
-                (seq_length, seq_length),
-                float("-inf"),
-                device=inputs_embeds.device,
-            ),
+            torch.full((seq_length, seq_length), float("-inf"), device=inputs_embeds.device),
             diagonal=1,
         )
         causal_mask = causal_mask.unsqueeze(0).unsqueeze(0)  # Add batch and head dimension
@@ -411,24 +382,34 @@ def forward(
         # decoder layers
         for decoder_layer in self.layers:
             hidden_states = decoder_layer(
-                hidden_states,
-                attention_mask=causal_mask,
-                position_ids=position_ids,
+                hidden_states, attention_mask=causal_mask, position_ids=position_ids
             )
 
         hidden_states = self.norm(hidden_states)
         return hidden_states
 
 
-class LlamaForCausalLM(LlamaPreTrainedModel):
+class LlamaForCausalLM(nn.Module):
     def __init__(self, config):
-        super().__init__(config)
+        super().__init__()
+        self.config = config
         self.model = LlamaModel(config)
         self.vocab_size = config.vocab_size
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         # Initialize weights and apply final processing
-        self.post_init()
+        self.apply(self._init_weights)
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
 
     @xp.trace_me("LlamaForCausalLM")
     def forward(
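Usage note (not part of the diff): a minimal sketch of how the refactored model might now be instantiated, since it takes a plain OmegaConf DictConfig instead of a transformers LlamaConfig and initializes its own weights via self.apply(self._init_weights). The field names mirror the attributes referenced in this excerpt (vocab_size, hidden_size, num_hidden_layers, rms_norm_eps, initializer_range); the values are illustrative assumptions, and the attention/MLP layers elided from the diff will read further fields from the same config. The forward signature is cut off above, so no forward call is shown.

from omegaconf import OmegaConf

# Illustrative values only; a real run would load these from the training config/YAML.
config = OmegaConf.create(
    {
        "vocab_size": 32000,
        "hidden_size": 4096,
        "num_hidden_layers": 32,
        "rms_norm_eps": 1e-5,
        "initializer_range": 0.02,
        # ...plus the attention/MLP fields used by the layers not shown in this excerpt
    }
)

model = LlamaForCausalLM(config)  # weights initialized in __init__ via self.apply(self._init_weights)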