Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 0 additions & 79 deletions modeling/basics/test_attention.py

This file was deleted.

141 changes: 141 additions & 0 deletions modeling/llm_post_training/tools/example_vocab_dump.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
Example script demonstrating vocabulary dump functionality.

This script shows how to use the vocab_inspect.py tool to dump
a model's vocabulary to a JSON file.
"""

import subprocess
import sys
import os


def dump_vocabulary_example():
    """Run vocab_inspect.py for several example models and dump their vocabularies.

    For each model in the example list, invokes the companion
    ``vocab_inspect.py`` script as a subprocess with ``--dump-vocab`` and
    reports success or failure. Model downloads / network access may be
    required by the underlying tool.
    """

    # Example model paths (adjust based on your setup)
    model_paths = [
        "Qwen/Qwen2-0.5B-Instruct",
        "meta-llama/Llama-3.2-3B-Instruct",
        "distilbert-base-uncased",
    ]

    print("🔍 Vocabulary Dump Examples")
    print("=" * 50)

    # Resolve vocab_inspect.py next to this script so the example works
    # regardless of the caller's current working directory.
    script_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "vocab_inspect.py"
    )

    for model_path in model_paths:
        output_file = f"vocab_{model_path.replace('/', '_')}.json"

        print(f"\n📝 Dumping vocabulary for: {model_path}")
        print(f"📁 Output file: {output_file}")

        # Build command. Use the current interpreter (sys.executable) rather
        # than whatever "python" happens to be on PATH — that name may be
        # missing or point at a different Python, especially on Windows.
        cmd = [
            sys.executable,
            script_path,
            "--model-path",
            model_path,
            "--dump-vocab",
            output_file,
            "--no-embeddings",  # Don't load full model for faster processing
        ]

        try:
            # Run the command and capture its output for reporting
            result = subprocess.run(cmd, capture_output=True, text=True)

            if result.returncode == 0:
                print("✅ Successfully dumped vocabulary")
                print(f"📊 Output: {result.stdout}")
            else:
                print(f"❌ Error: {result.stderr}")

        except Exception as e:
            print(f"❌ Exception: {e}")

        print("-" * 30)


def show_vocab_structure():
    """Print a reference description of the dumped-vocabulary JSON layout."""

    # Illustrative sample of a dumped file; kept purely as in-code
    # documentation of the schema (it is not printed or returned).
    example_structure = {
        "model_path": "Qwen/Qwen2-0.5B-Instruct",
        "vocab_size": 152064,
        "special_tokens": {
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "bos_token": "<|begin_of_text|>",
            "eos_token": "<|end_of_text|>",
            "sep_token": "<|eot_id|>",
        },
        "vocabulary": [
            {
                "token_id": 0,
                "token": "<|begin_of_text|>",
                "length": 15,
                "is_special": True,
                "is_punctuation": False,
                "is_digit": False,
                "is_alpha": False,
                "is_whitespace": False,
                "is_subword": False,
            },
            {
                "token_id": 1,
                "token": "<|end_of_text|>",
                "length": 13,
                "is_special": True,
                "is_punctuation": False,
                "is_digit": False,
                "is_alpha": False,
                "is_whitespace": False,
                "is_subword": False,
            },
            # ... more tokens
        ],
    }

    # Emit the human-readable summary of the schema, one line per field.
    description_lines = [
        "\n📋 Vocabulary JSON Structure:",
        "=" * 50,
        "The dumped vocabulary JSON contains:",
        "• model_path: Original model path",
        "• vocab_size: Total number of tokens",
        "• special_tokens: Special token mappings",
        "• vocabulary: Array of token objects with:",
        "  - token_id: Numeric ID of the token",
        "  - token: The actual token string",
        "  - length: Character length of token",
        "  - is_special: Whether it's a special token",
        "  - is_punctuation: Whether it's punctuation",
        "  - is_digit: Whether it's a digit",
        "  - is_alpha: Whether it's alphabetic",
        "  - is_whitespace: Whether it's whitespace",
        "  - is_subword: Whether it's a subword token",
    ]
    for line in description_lines:
        print(line)


if __name__ == "__main__":
print("🚀 Vocabulary Inspector - Dump Examples")
print("=" * 60)

# Show structure first
show_vocab_structure()

# Ask user if they want to run examples
response = input("\n❓ Run vocabulary dump examples? (y/n): ").lower().strip()

if response in ["y", "yes"]:
dump_vocabulary_example()
else:
print("👋 Skipping examples. Use the following command to dump vocabularies:")
print("\nExample commands:")
print(
"python vocab_inspect.py --model-path Qwen/Qwen2-0.5B-Instruct --dump-vocab qwen_0.5b_vocab.json"
)
print(
"python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B-Instruct --dump-vocab llama_vocab.json"
)
print(
"python vocab_inspect.py --model-path distilbert-base-uncased --dump-vocab distilbert_vocab.json"
)
76 changes: 72 additions & 4 deletions modeling/llm_post_training/tools/vocab_inspect.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,12 @@
Usage:
python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B \\
--query "dog" --top-k 5
python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B --tokenize "Hello, world!"
python vocab_inspect.py --model-path distilbert-base-uncased --tokenize "The quick brown fox"
python vocab_inspect.py --model-path meta-llama/Llama-3.2-3B \\
--tokenize "Hello, world!"
python vocab_inspect.py --model-path distilbert-base-uncased \\
--tokenize "The quick brown fox"
python vocab_inspect.py --model-path Qwen/Qwen2-0.5B-Instruct \\
--dump-vocab qwen_0.5b_vocab.json
"""

import argparse
Expand Down Expand Up @@ -319,6 +323,55 @@ def tokenize_query(self, query: str) -> List[Dict]:

return token_info_list

def dump_vocabulary(self, output_file: str) -> None:
"""
Dump the complete vocabulary to a JSON file.

Args:
output_file: Path to output JSON file
"""
if self.tokenizer is None:
raise ValueError("Tokenizer not loaded. Call load_model() first.")

print(f"📝 Dumping vocabulary to: {output_file}")
print(f"📊 Total tokens: {len(self.vocab)}")

# Get all tokens with their IDs
vocab_items = sorted(self.vocab.items(), key=lambda x: x[1]) # Sort by ID

import json

vocab_data = {
"model_path": self.model_path,
"vocab_size": len(self.vocab),
"special_tokens": {
"unk_token": self.tokenizer.unk_token,
"pad_token": self.tokenizer.pad_token,
"bos_token": self.tokenizer.bos_token,
"eos_token": self.tokenizer.eos_token,
"sep_token": self.tokenizer.sep_token,
},
"vocabulary": [
{
"token_id": token_id,
"token": token,
"length": len(token),
"is_special": (token.startswith("<") and token.endswith(">")),
"is_punctuation": token in ".,!?;:()[]{}",
"is_digit": token.isdigit(),
"is_alpha": token.isalpha(),
"is_whitespace": token.isspace(),
"is_subword": (token.startswith("##") or token.startswith("▁")),
}
for token, token_id in vocab_items
],
}

with open(output_file, "w", encoding="utf-8") as f:
json.dump(vocab_data, f, indent=2, ensure_ascii=False)

print(f"✅ Vocabulary dumped successfully to {output_file}")


def main():
"""Main function for command-line interface."""
Expand Down Expand Up @@ -351,6 +404,9 @@ def main():
parser.add_argument(
"--no-embeddings", action="store_true", help="Don't load model for embeddings"
)
parser.add_argument(
"--dump-vocab", help="Dump vocabulary to JSON file (specify output file path)"
)

args = parser.parse_args()

Expand All @@ -359,7 +415,16 @@ def main():
tool.load_model(load_embeddings=not args.no_embeddings)

# Handle different operations
if args.tokenize:
if args.dump_vocab:
print(f"\nDumping vocabulary to: {args.dump_vocab}")
print("=" * 60)

try:
tool.dump_vocabulary(args.dump_vocab)
except Exception as e:
print(f"Error dumping vocabulary: {e}")

elif args.tokenize:
print(f"\nTokenizing: '{args.tokenize}'")
print("=" * 60)

Expand Down Expand Up @@ -452,7 +517,10 @@ def main():
print(f"({len(tokens)} tokens)")

else:
print("No operation specified. Use --tokenize, --query, or " "--list-tokens")
print(
"No operation specified. Use --tokenize, --query, "
"--list-tokens, or --dump-vocab"
)


if __name__ == "__main__":
Expand Down