"""
Train our Tokenizers on some data, just to see them in action.
The whole thing runs in ~25 seconds on my laptop.
"""
# feel free to use either
from bpe_regex import Tokenizer as RegexTokenizer
from bpe_basic import Tokenizer as BasicTokenizer
# open some text and train a vocab of 512 tokens
text = open("taylorswift.txt", "r", encoding="utf-8").read()
for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):
    # construct the Tokenizer object and kick off verbose training
    tokenizer = TokenizerClass()
    tokenizer.train(text, 512, verbose=True)
    # pretty print the final vocab into a file
    vocab_file = f"{name}.vocab"
    with open(vocab_file, "w", encoding="utf-8") as f:
        for idx, token in tokenizer.vocab.items():
            if idx < 256:
                # the first 256 tokens are just bytes, render them in <0xHH> format
                token_string = f"<0x{idx:02x}>"
            else:
                # otherwise let's attempt to render the token as a string
                token_string = token.decode('utf-8', errors='replace')
            f.write(f"{token_string} {idx}\n")
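    # quick sanity check after training: encode/decode should round-trip a
    # sample losslessly. (a minimal sketch; assumes the Tokenizer exposes
    # encode() and decode(), as minbpe's Tokenizers do)
    sample = text[:200]
    assert tokenizer.decode(tokenizer.encode(sample)) == sample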