add small warnings about gpt4 tokenizer just being a pretrained tokenizer
karpathy committed Feb 18, 2024
1 parent 9ab2500 commit e0ed1bc
Showing 1 changed file with 7 additions and 1 deletion.
8 changes: 7 additions & 1 deletion minbpe/gpt4.py
@@ -1,5 +1,7 @@
 """
-Implements the GPT-4 Tokenizer with a light wrapper around the RegexTokenizer.
+Implements the GPT-4 Tokenizer as a light wrapper around the RegexTokenizer.
+Note that this is a pretrained tokenizer. By default and inside init(), it
+loads the pretrained tokenizer from the `cl100k_base` tokenizer of tiktoken.
 """
 
 import tiktoken
@@ -78,6 +80,10 @@ def decode(self, ids):
         text = text_bytes.decode("utf-8", errors="replace")
         return text
 
+    # this is a pretrained tokenizer, it is not intended to be trained
+    def train(self, text, vocab_size, verbose=False):
+        raise NotImplementedError
+
     # save/load would require some thought.
     # we'd have to change save/load of base to add support for byte_shuffle...
     # alternatively, we could move byte_shuffle to base class, but that would
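To ground the new docstring note, here is a small sketch of where the pretrained vocabulary comes from, using tiktoken's public API; the exact wiring inside GPT4Tokenizer's __init__ may differ from this:

    # sketch: the pretrained ranks behind GPT4Tokenizer come from tiktoken's
    # cl100k_base encoding; __init__ loads these ranks, it never trains them
    import tiktoken

    enc = tiktoken.get_encoding("cl100k_base")
    print(enc.n_vocab)                # size of the pretrained vocabulary
    print(enc.encode("hello world"))  # token ids under the pretrained merges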
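And a minimal usage sketch of the wrapper itself, assuming the GPT4Tokenizer export from the minbpe package: encode/decode work out of the box because the tokenizer is pretrained, while the train override added in this commit raises NotImplementedError:

    from minbpe import GPT4Tokenizer

    tokenizer = GPT4Tokenizer()  # loads the pretrained cl100k_base ranks
    ids = tokenizer.encode("hello world")
    assert tokenizer.decode(ids) == "hello world"

    try:
        tokenizer.train("some text", vocab_size=512)  # not supported
    except NotImplementedError:
        print("GPT4Tokenizer is pretrained; train() is intentionally disabled")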
