Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 0 additions & 79 deletions modeling/basics/test_attention.py

This file was deleted.

141 changes: 141 additions & 0 deletions modeling/llm_post_training/tools/example_vocab_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Example script demonstrating vocabulary dump functionality.

This script shows how to use the vocab_inspect.py tool to dump
a model's vocabulary to a JSON file.
"""

import subprocess
import sys
import os


def dump_vocabulary_example():
    """Run vocab_inspect.py for several example models and dump their vocabularies.

    For each model in the example list, invokes the companion
    ``vocab_inspect.py`` script as a subprocess with ``--dump-vocab`` and
    reports success or failure. Model downloads / network access may be
    required by the underlying tool.
    """

    # Example model paths (adjust based on your setup)
    model_paths = [
        "Qwen/Qwen2-0.5B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "distilbert-base-uncased",
    ]

    print("🔍 Vocabulary Dump Examples")
    print("=" * 50)

    # Resolve vocab_inspect.py next to this script so the example works
    # regardless of the caller's current working directory.
    script_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "vocab_inspect.py"
    )

    for model_path in model_paths:
        output_file = f"vocab_{model_path.replace('/', '_')}.json"

        print(f"\n📝 Dumping vocabulary for: {model_path}")
        print(f"📁 Output file: {output_file}")

        # Build command. Use the current interpreter (sys.executable) rather
        # than whatever "python" happens to be on PATH — that name may be
        # missing or point at a different Python, especially on Windows.
        cmd = [
            sys.executable,
            script_path,
            "--model-path",
            model_path,
            "--dump-vocab",
            output_file,
            "--no-embeddings",  # Don't load full model for faster processing
        ]

        try:
            # Run the command and capture its output for reporting
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                print("✅ Successfully dumped vocabulary")
                print(f"📊 Output: {result.stdout}")
            else:
                print(f"❌ Error: {result.stderr}")

        except Exception as e:
            print(f"❌ Exception: {e}")

        print("-" * 30)


def show_vocab_structure():
    """Print a reference description of the dumped-vocabulary JSON layout."""

    # Illustrative sample of a dumped file; kept purely as in-code
    # documentation of the schema (it is not printed or returned).
    example_structure = {
        "model_path": "Qwen/Qwen2-0.5B-Instruct",
        "vocab_size": 152064,
        "special_tokens": {
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "bos_token": "<|begin_of_text|>",
            "eos_token": "<|end_of_text|>",
            "sep_token": "<|eot_id|>",
        },
        "vocabulary": [
            {
                "token_id": 0,
                "token": "<|begin_of_text|>",
                "length": 15,
                "is_special": True,
                "is_punctuation": False,
                "is_digit": False,
                "is_alpha": False,
                "is_whitespace": False,
                "is_subword": False,
            },
            {
                "token_id": 1,
                "token": "<|end_of_text|>",
                "length": 13,
                "is_special": True,
                "is_punctuation": False,
                "is_digit": False,
                "is_alpha": False,
                "is_whitespace": False,
                "is_subword": False,
            },
            # ... more tokens
        ],
    }

    # Emit the human-readable summary of the schema, one line per field.
    description_lines = [
        "\n📋 Vocabulary JSON Structure:",
        "=" * 50,
        "The dumped vocabulary JSON contains:",
        "• model_path: Original model path",
        "• vocab_size: Total number of tokens",
        "• special_tokens: Special token mappings",
        "• vocabulary: Array of token objects with:",
        "  - token_id: Numeric ID of the token",
        "  - token: The actual token string",
        "  - length: Character length of token",
        "  - is_special: Whether it's a special token",
        "  - is_punctuation: Whether it's punctuation",
        "  - is_digit: Whether it's a digit",
        "  - is_alpha: Whether it's alphabetic",
        "  - is_whitespace: Whether it's whitespace",
        "  - is_subword: Whether it's a subword token",
    ]
    for line in description_lines:
        print(line)


if __name__ == "__main__":
print("🚀 Vocabulary Inspector - Dump Examples")
print("=" * 60)

# Show structure first
show_vocab_structure()

# Ask user if they want to run examples
response = input("\n❓ Run vocabulary dump examples? (y/n): ").lower().strip()

if response in ["y", "yes"]:
dump_vocabulary_example()
else:
print("👋 Skipping examples. Use the following command to dump vocabularies:")
print("\nExample commands:")
print(
"python vocab_inspect.py --model-path Qwen/Qwen2-0.5B-Instruct --dump-vocab qwen_0.5b_vocab.json"
)
print(
"python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B-Instruct --dump-vocab llama_vocab.json"
)
print(
"python vocab_inspect.py --model-path distilbert-base-uncased --dump-vocab distilbert_vocab.json"
)
76 changes: 72 additions & 4 deletions modeling/llm_post_training/tools/vocab_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
Usage:
python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B \\
--query "dog" --top-k 5
python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B --tokenize "Hello, world!"
python vocab_inspect.py --model-path distilbert-base-uncased --tokenize "The quick brown fox"
python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B \\
--tokenize "Hello, world!"
python vocab_inspect.py --model-path distilbert-base-uncased \\
--tokenize "The quick brown fox"
python vocab_inspect.py --model-path Qwen/Qwen2-0.5B-Instruct \\
--dump-vocab qwen_0.5b_vocab.json
"""

import argparse
Expand Down Expand Up @@ -319,6 +323,55 @@ def tokenize_query(self, query: str) -> List[Dict]:

return token_info_list

def dump_vocabulary(self, output_file: str) -> None:
"""
Dump the complete vocabulary to a JSON file.

Args:
output_file: Path to output JSON file
"""
if self.tokenizer is None:
raise ValueError("Tokenizer not loaded. Call load_model() first.")

print(f"📝 Dumping vocabulary to: {output_file}")
print(f"📊 Total tokens: {len(self.vocab)}")

# Get all tokens with their IDs
vocab_items = sorted(self.vocab.items(), key=lambda x: x[1]) # Sort by ID

import json

vocab_data = {
"model_path": self.model_path,
"vocab_size": len(self.vocab),
"special_tokens": {
"unk_token": self.tokenizer.unk_token,
"pad_token": self.tokenizer.pad_token,
"bos_token": self.tokenizer.bos_token,
"eos_token": self.tokenizer.eos_token,
"sep_token": self.tokenizer.sep_token,
},
"vocabulary": [
{
"token_id": token_id,
"token": token,
"length": len(token),
"is_special": (token.startswith("<") and token.endswith(">")),
"is_punctuation": token in ".,!?;:()[]{}",
"is_digit": token.isdigit(),
"is_alpha": token.isalpha(),
"is_whitespace": token.isspace(),
"is_subword": (token.startswith("##") or token.startswith("▁")),
}
for token, token_id in vocab_items
],
}

with open(output_file, "w", encoding="utf-8") as f:
json.dump(vocab_data, f, indent=2, ensure_ascii=False)

print(f"✅ Vocabulary dumped successfully to {output_file}")


def main():
"""Main function for command-line interface."""
Expand Down Expand Up @@ -351,6 +404,9 @@ def main():
parser.add_argument(
"--no-embeddings", action="store_true", help="Don't load model for embeddings"
)
parser.add_argument(
"--dump-vocab", help="Dump vocabulary to JSON file (specify output file path)"
)

args = parser.parse_args()

Expand All @@ -359,7 +415,16 @@ def main():
tool.load_model(load_embeddings=not args.no_embeddings)

# Handle different operations
if args.tokenize:
if args.dump_vocab:
print(f"\nDumping vocabulary to: {args.dump_vocab}")
print("=" * 60)

try:
tool.dump_vocabulary(args.dump_vocab)
except Exception as e:
print(f"Error dumping vocabulary: {e}")

elif args.tokenize:
print(f"\nTokenizing: '{args.tokenize}'")
print("=" * 60)

Expand Down Expand Up @@ -452,7 +517,10 @@ def main():
print(f"({len(tokens)} tokens)")

else:
print("No operation specified. Use --tokenize, --query, or " "--list-tokens")
print(
"No operation specified. Use --tokenize, --query, "
"--list-tokens, or --dump-vocab"
)


if __name__ == "__main__":
Expand Down