Commit
src: add initial tokenization exploration
This is code from Andrej Karpathy's video about tokenization, which I'll
use as I follow along with the video.
danbev committed Aug 23, 2024
1 parent dac4fd7 commit 63186a1
Showing 1 changed file with 35 additions and 0 deletions.
fundamentals/tokenization/src/bpe.py
@@ -0,0 +1,35 @@
def pairs_count(ids, d=None):
    """Count occurrences of each consecutive pair of tokens in ids."""
    d = {} if d is None else d
    for pair in zip(ids, ids[1:]):
        print(f'checking pair: {pair}')
        d[pair] = d.get(pair, 0) + 1
    return d

inputs = [1, 2, 3, 1, 2]
counts = pairs_count(inputs)
print(counts)
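
# Not part of the original commit: a small sketch of the next step in BPE,
# picking the most frequent pair from the counts above. With these inputs,
# the pair (1, 2) occurs twice and should be selected.
top_pair = max(counts, key=counts.get)
print(f'most frequent pair: {top_pair}')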

def merge(ids, pair, idx):
    """
    In the list of integers (ids), replace all consecutive occurrences
    of pair with the new integer token idx.
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    newids = []
    i = 0
    while i < len(ids):
        # if not at the very last position AND the pair matches, replace it
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

ids = merge(inputs, (1, 2), 4)
print(ids)
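
# Not part of the original commit: a sketch of how pairs_count and merge
# could be combined into a BPE training loop over raw bytes, following the
# general approach in the video. The helper name train_bpe and its
# parameters are my own assumptions, not code from the commit.
def train_bpe(text, num_merges):
    ids = list(text.encode('utf-8'))   # start from raw byte tokens (0-255)
    merges = {}                        # (int, int) pair -> new token id
    for i in range(num_merges):
        counts = pairs_count(ids)      # note: prints every pair it checks
        if not counts:
            break
        pair = max(counts, key=counts.get)   # most frequent pair
        idx = 256 + i                        # new ids start after the byte range
        ids = merge(ids, pair, idx)
        merges[pair] = idx
    return ids, merges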

# base vocabulary: map each of the 256 possible byte values to its bytes object
vocab = {idx: bytes([idx]) for idx in range(256)}
print(vocab[255])
print(bytes([255]))
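
# Not part of the original commit: a sketch of how the byte vocab above could
# be extended with merged tokens and used to decode ids back into text. The
# 'merges' mapping is assumed to come from a training loop like the sketch
# above; build_vocab and decode are my own helper names.
def build_vocab(merges):
    vocab = {idx: bytes([idx]) for idx in range(256)}
    # dicts keep insertion order (Python 3.7+), so earlier merges are resolved
    # before later ones that build on them
    for (p0, p1), idx in merges.items():
        vocab[idx] = vocab[p0] + vocab[p1]
    return vocab

def decode(ids, vocab):
    return b''.join(vocab[i] for i in ids).decode('utf-8', errors='replace')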
