Skip to content

Commit

Permalink
add small amount of code to write out the gpt4 vocab in the same form…
Browse files Browse the repository at this point in the history
…at as the base class, taking into account the byte shuffle
  • Loading branch information
karpathy committed Feb 19, 2024
1 parent 872c086 commit 1cc9d06
Showing 1 changed file with 23 additions and 0 deletions.
23 changes: 23 additions & 0 deletions minbpe/gpt4.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,26 @@ def save(self, file_prefix):

def load(self, model_file):
    """Refuse to load: this tokenizer cannot be restored from a model file.

    Args:
        model_file: accepted for signature compatibility; never read.

    Raises:
        NotImplementedError: always.
    """
    message = "GPT4Tokenizer cannot be loaded."
    raise NotImplementedError(message)

def save_vocab(self, vocab_file):
    """Dump the GPT-4 tokens to ``vocab_file`` as human-readable text.

    Intended for visualization only; the output mirrors the format the base
    class uses for its vocab file. Quick way to run it:

        python -c "from minbpe import GPT4Tokenizer; GPT4Tokenizer().save_vocab('gpt4.vocab')"

    Each token produces one line:
      * a token created by a merge: ``[child0][child1] -> [token] idx``
      * any other token:            ``[token] idx``

    Args:
        vocab_file: path of the file to write; it is opened in "w" mode
            with UTF-8 encoding.
    """
    from .base import render_token

    # The first 256 ids are single bytes, but in shuffled order: translate
    # each id back through inverse_byte_shuffle to get the byte it stands for.
    token_bytes = {}
    for byte_id in range(256):
        token_bytes[byte_id] = bytes([self.inverse_byte_shuffle[byte_id]])

    # A merged token is the concatenation of its two children. This relies on
    # self.merges listing children before any merge that consumes them.
    for pair, merged_id in self.merges.items():
        left_id, right_id = pair
        token_bytes[merged_id] = token_bytes[left_id] + token_bytes[right_id]

    # Reverse lookup: merged token id -> the (child0, child1) pair behind it.
    children_of = {merged_id: pair for pair, merged_id in self.merges.items()}

    with open(vocab_file, "w", encoding="utf-8") as out:
        for token_id, raw in token_bytes.items():
            rendered = render_token(raw)
            pair = children_of.get(token_id)
            if pair is None:
                # Not the result of a merge: print the token on its own.
                out.write(f"[{rendered}] {token_id}\n")
                continue
            # Merged token: show both children alongside the result.
            left_id, right_id = pair
            rendered_left = render_token(token_bytes[left_id])
            rendered_right = render_token(token_bytes[right_id])
            out.write(f"[{rendered_left}][{rendered_right}] -> [{rendered}] {token_id}\n")

0 comments on commit 1cc9d06

Please sign in to comment.