From a04b85accaf34190fdc5d2f966923fde3edd2fa0 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 17 Feb 2024 17:14:42 -0800
Subject: [PATCH] let's not clutter the main directory

---
 train.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/train.py b/train.py
index d00005f8..53d9fcb2 100644
--- a/train.py
+++ b/train.py
@@ -3,15 +3,20 @@
 The whole thing runs in ~25 seconds on my laptop.
 """
 
+import os
 from minbpe import BasicTokenizer, RegexTokenizer
 
 # open some text and train a vocab of 512 tokens
 text = open("tests/taylorswift.txt", "r", encoding="utf-8").read()
 
+# create a directory for models so we don't pollute the current directory
+os.makedirs("models", exist_ok=True)
+
 for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):
 
     # construct the Tokenizer object and kick off verbose training
     tokenizer = TokenizerClass()
     tokenizer.train(text, 512, verbose=True)
-    # writes two files in current directory: name.model, and name.vocab
-    tokenizer.save(name)
+    # writes two files in the models directory: name.model, and name.vocab
+    prefix = os.path.join("models", name)
+    tokenizer.save(prefix)