Skip to content

Commit

Permalink
let's not clutter the main directory
Browse files Browse the repository at this point in the history
  • Loading branch information
karpathy committed Feb 18, 2024
1 parent ac3ca85 commit a04b85a
Showing 1 changed file with 7 additions and 2 deletions.
9 changes: 7 additions & 2 deletions train.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,20 @@
The whole thing runs in ~25 seconds on my laptop.
"""

import os
from minbpe import BasicTokenizer, RegexTokenizer

# open some text and train a vocab of 512 tokens
# The whole corpus is read into memory as a single UTF-8 decoded string.
# NOTE(review): the file object returned by open() is never explicitly closed
# here; a `with` block would make the close deterministic.
text = open("tests/taylorswift.txt", "r", encoding="utf-8").read()

# create a directory for models so we don't pollute the current directory
# exist_ok=True keeps re-runs from failing when "models" already exists.
os.makedirs("models", exist_ok=True)

# Train each tokenizer class on the same text; `name` is the short label
# paired with the class and is used to build the output file prefix.
# NOTE(review): the loop body below appears at column 0 because this listing
# lost its indentation; in the real script these statements are presumably
# nested inside the `for`.
for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):

# construct the Tokenizer object and kick off verbose training
tokenizer = TokenizerClass()
# 512 is passed as the second positional argument (the vocab size mentioned above).
tokenizer.train(text, 512, verbose=True)
# NOTE(review): this listing is a diff shown without +/- markers. The next
# comment and save(name) call look like the pre-commit (removed) side, which
# saved into the current directory -- confirm against the repository.
# writes two files in current directory: name.model, and name.vocab
tokenizer.save(name)
# NOTE(review): the lines below look like the post-commit (added) side, which
# saves under the "models" directory instead.
# writes two files in the models directory: name.model, and name.vocab
prefix = os.path.join("models", name)
tokenizer.save(prefix)

0 comments on commit a04b85a

Please sign in to comment.