Commit
src: add initial tokenization exploration
This is code from Andrej Karpathy's video about tokenization, which I'll
use as I follow along with the video.
danbev committed Aug 23, 2024
1 parent dac4fd7 commit 63186a1
Showing 1 changed file with 35 additions and 0 deletions.
fundamentals/tokenization/src/bpe.py
@@ -0,0 +1,35 @@
def pairs_count(ids, d=None):
    """Count occurrences of each consecutive pair of tokens in ids."""
    d = {} if d is None else d
    for pair in zip(ids, ids[1:]):
        print(f'checking pair: {pair}')
        d[pair] = d.get(pair, 0) + 1
    return d

inputs = [1, 2, 3, 1, 2]
counts = pairs_count(inputs)
print(counts)
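
# Not part of the original commit: a small sketch of the next step in BPE,
# picking the most frequent pair from the counts above. With these inputs,
# the pair (1, 2) occurs twice and should be selected.
top_pair = max(counts, key=counts.get)
print(f'most frequent pair: {top_pair}')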

def merge(ids, pair, idx):
    """
    In the list of integers (ids), replace all consecutive occurrences
    of pair with the new integer token idx.
    Example: ids=[1, 2, 3, 1, 2], pair=(1, 2), idx=4 -> [4, 3, 4]
    """
    newids = []
    i = 0
    while i < len(ids):
        # if not at the very last position AND the pair matches, replace it
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i+1] == pair[1]:
            newids.append(idx)
            i += 2
        else:
            newids.append(ids[i])
            i += 1
    return newids

ids = merge(inputs, (1, 2), 4)
print(ids)
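
# Not part of the original commit: a sketch of how pairs_count and merge
# could be combined into a BPE training loop over raw bytes, following the
# general approach in the video. The helper name train_bpe and its
# parameters are my own assumptions, not code from the commit.
def train_bpe(text, num_merges):
    ids = list(text.encode('utf-8'))   # start from raw byte tokens (0-255)
    merges = {}                        # (int, int) pair -> new token id
    for i in range(num_merges):
        counts = pairs_count(ids)      # note: prints every pair it checks
        if not counts:
            break
        pair = max(counts, key=counts.get)   # most frequent pair
        idx = 256 + i                        # new ids start after the byte range
        ids = merge(ids, pair, idx)
        merges[pair] = idx
    return ids, merges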

# base vocabulary: map each of the 256 possible byte values to its bytes object
vocab = {idx: bytes([idx]) for idx in range(256)}
print(vocab[255])
print(bytes([255]))
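
# Not part of the original commit: a sketch of how the byte vocab above could
# be extended with merged tokens and used to decode ids back into text. The
# 'merges' mapping is assumed to come from a training loop like the sketch
# above; build_vocab and decode are my own helper names.
def build_vocab(merges):
    vocab = {idx: bytes([idx]) for idx in range(256)}
    # dicts keep insertion order (Python 3.7+), so earlier merges are resolved
    # before later ones that build on them
    for (p0, p1), idx in merges.items():
        vocab[idx] = vocab[p0] + vocab[p1]
    return vocab

def decode(ids, vocab):
    return b''.join(vocab[i] for i in ids).decode('utf-8', errors='replace')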
