NOTE: this file was missing from the repository. It was reconstructed by an LLM after analyzing the C decoder's load_vocab() format, and the reconstruction worked in my testing.
#!/usr/bin/env python3
"""Export vocab.bin for the C decoder (token_id -> string mapping).
Usage: python export_vocab.py [tokenizer.json] [output.bin]
Binary format (must match load_vocab() in infer.m):
uint32_t num_entries
uint32_t max_id
for each entry (sorted by token_id):
uint16_t byte_len
char[byte_len] (UTF-8 bytes)
"""
import json
import struct
import sys
import os
def find_tokenizer():
    """Locate a tokenizer.json file.

    Search order: an explicit path given as the first CLI argument, a few
    conventional relative locations, then every tokenizer.json found while
    walking the local HuggingFace hub cache. The first candidate that is an
    existing regular file is returned; None if nothing matches.
    """
    # Optional explicit path from the command line (empty slice if absent).
    paths = list(sys.argv[1:2])
    paths += [
        "tokenizer.json",
        "metal_infer/tokenizer.json",
        "../tokenizer.json",
    ]
    # Scan the HuggingFace cache for downloaded tokenizers.
    cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
    if os.path.isdir(cache_dir):
        paths += [
            os.path.join(root, "tokenizer.json")
            for root, _dirs, names in os.walk(cache_dir)
            if "tokenizer.json" in names
        ]
    return next((p for p in paths if os.path.isfile(p)), None)
def main():
    """Export the tokenizer vocabulary as vocab.bin.

    Reads a HuggingFace-style tokenizer.json, sorts model.vocab by token id,
    and writes the binary table described in the module docstring.
    Exits with status 1 when no tokenizer.json can be found.
    """
    tok_path = find_tokenizer()
    # The last CLI argument is the output path unless it looks like the
    # tokenizer itself (i.e. ends in .json); default to vocab.bin.
    out_path = (
        sys.argv[-1]
        if len(sys.argv) > 1 and not sys.argv[-1].endswith(".json")
        else "vocab.bin"
    )
    if not tok_path:
        print("ERROR: tokenizer.json not found. Pass as first argument:")
        print(" python export_vocab.py /path/to/tokenizer.json [output.bin]")
        sys.exit(1)
    print(f"Using tokenizer: {tok_path}")
    # Fix: the original opened and parsed the tokenizer twice; load it once.
    with open(tok_path, "r", encoding="utf-8") as f:
        t = json.load(f)
    model = t["model"]
    vocab = model["vocab"]  # str -> int
    # NOTE(review): entries from "added_tokens" are not exported; confirm the
    # C decoder never emits ids outside model.vocab.
    # Sort by token_id — load_vocab() in infer.m expects ascending-id order.
    sorted_vocab = sorted(vocab.items(), key=lambda x: x[1])
    num_entries = len(sorted_vocab)
    # Fix: guard against an empty vocab (the original indexed [-1] blindly).
    max_id = sorted_vocab[-1][1] if sorted_vocab else 0
    with open(out_path, "wb") as f:
        f.write(struct.pack("<I", num_entries))
        f.write(struct.pack("<I", max_id))
        for token_str, token_id in sorted_vocab:
            b = token_str.encode("utf-8")
            f.write(struct.pack("<H", len(b)))
            f.write(b)
    print(f"Exported to {out_path}:")
    print(f" Entries: {num_entries}")


# Fix: the original file defined main() but never invoked it, so running the
# script did nothing. Standard entry-point guard added.
if __name__ == "__main__":
    main()
NOTE: see the header — this is an LLM-reconstructed replacement for a missing file; verify the output against load_vocab() in infer.m before relying on it.