add caution around save/load of gpt4 tokenizer

miknoj · Feb 18, 2024 · f9e19ea · f9e19ea
1 parent a04b85a
commit f9e19ea
Showing 1 changed file with 11 additions and 0 deletions.
diff --git a/minbpe/gpt4.py b/minbpe/gpt4.py
@@ -77,3 +77,14 @@ def decode(self, ids):
         text_bytes = bytes(self.inverse_byte_shuffle[b] for b in text_bytes)
         text = text_bytes.decode("utf-8", errors="replace")
         return text
+
+    # Supporting save/load here would require some thought.
+    # We'd have to change save/load in the base class to handle byte_shuffle.
+    # Alternatively, we could move byte_shuffle into the base class, but that
+    # would mean uglifying our beautiful Tokenizer just to support the GPT-4
+    # tokenizer and its weird historical quirks around byte_shuffle.
+    def save(self, file_prefix):  # intentionally unsupported: always raises; file_prefix is never used
+        raise NotImplementedError("GPT4Tokenizer cannot be saved.")
+
+    def load(self, model_file):  # intentionally unsupported: always raises; model_file is never read
+        raise NotImplementedError("GPT4Tokenizer cannot be loaded.")