Skip to content

vocab.bin missing #6

@amb007

Description

@amb007

It seems this file is also missing. An LLM generated the script below after some analysis of the expected binary format, and it worked for me.

#!/usr/bin/env python3
"""Export vocab.bin for the C decoder (token_id -> string mapping).

Usage: python export_vocab.py [tokenizer.json] [output.bin]

Binary format (must match load_vocab() in infer.m):
  uint32_t num_entries
  uint32_t max_id
  for each entry (sorted by token_id):
    uint16_t byte_len
    char[byte_len]  (UTF-8 bytes)
"""
 
import json
import struct
import sys
import os
 
 
def find_tokenizer():
    """Return the first existing tokenizer.json found, or None.

    Search order: an explicit CLI path (argv[1], if given), a few
    conventional relative locations, then every tokenizer.json under
    the local HuggingFace hub cache.
    """
    # An explicit path on the command line takes priority over defaults.
    paths = list(sys.argv[1:2])
    paths += [
        "tokenizer.json",
        "metal_infer/tokenizer.json",
        "../tokenizer.json",
    ]
    hub = os.path.expanduser("~/.cache/huggingface/hub")
    if os.path.isdir(hub):
        paths += [
            os.path.join(root, "tokenizer.json")
            for root, _dirs, files in os.walk(hub)
            if "tokenizer.json" in files
        ]
    # First candidate that is an actual file wins.
    return next((p for p in paths if os.path.isfile(p)), None)
 
 
def main():
    """Locate tokenizer.json and export vocab.bin for the C decoder.

    Binary layout (must match load_vocab() in infer.m): two little-endian
    uint32 values (num_entries, max_id) followed by one (uint16 byte_len,
    UTF-8 bytes) record per token, sorted by token id.

    Exits with status 1 if no tokenizer can be found or its vocab is empty.
    """
    tok_path = find_tokenizer()
    # The last CLI argument is the output path unless it looks like the
    # tokenizer itself (e.g. the user only passed tokenizer.json).
    out_path = (
        sys.argv[-1]
        if len(sys.argv) > 1 and not sys.argv[-1].endswith(".json")
        else "vocab.bin"
    )

    if not tok_path:
        print("ERROR: tokenizer.json not found. Pass as first argument:")
        print("  python export_vocab.py /path/to/tokenizer.json [output.bin]")
        sys.exit(1)

    print(f"Using tokenizer: {tok_path}")
    # Load the tokenizer once (the original parsed the file twice).
    with open(tok_path, "r", encoding="utf-8") as f:
        t = json.load(f)

    model = t["model"]
    vocab = model["vocab"]  # str -> int
    # NOTE(review): entries in added_tokens (outside model["vocab"]) are not
    # exported — confirm the decoder does not need them.

    if not vocab:
        # Guard: an empty vocab would crash below on sorted_vocab[-1].
        print("ERROR: tokenizer vocab is empty")
        sys.exit(1)

    # Sort by token id so the decoder can index records sequentially.
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
    num_entries = len(sorted_vocab)
    max_id = sorted_vocab[-1][1]

    with open(out_path, "wb") as f:
        f.write(struct.pack("<I", num_entries))
        f.write(struct.pack("<I", max_id))
        for token_str, token_id in sorted_vocab:
            b = token_str.encode("utf-8")
            f.write(struct.pack("<H", len(b)))
            f.write(b)

    print(f"Exported to {out_path}:")
    print(f"  Entries: {num_entries}")


# Entry-point guard was missing: without it the script defines main() and
# exits without doing anything when run directly.
if __name__ == "__main__":
    main()

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions