From f9e19ea00d51f7d449d0cf90a5529ade6081ac04 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy <andrej.karpathy@gmail.com>
Date: Sat, 17 Feb 2024 18:16:10 -0800
Subject: [PATCH] add caution around save/load of gpt4 tokenizer

---
 minbpe/gpt4.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/minbpe/gpt4.py b/minbpe/gpt4.py
index 16d64ded..49bad7cb 100644
--- a/minbpe/gpt4.py
+++ b/minbpe/gpt4.py
@@ -77,3 +77,14 @@ def decode(self, ids):
         text_bytes = bytes(self.inverse_byte_shuffle[b] for b in text_bytes)
         text = text_bytes.decode("utf-8", errors="replace")
         return text
+
+    # save/load would require some thought.
+    # we'd have to change save/load of base to add support for byte_shuffle...
+    # alternatively, we could move byte_shuffle to the base class, but that
+    # would make our beautiful Tokenizer ugly just to support the GPT-4
+    # tokenizer and its weird historical quirks around byte_shuffle.
+    def save(self, file_prefix):  # always raises; file_prefix is ignored (byte_shuffle is not serialized)
+        raise NotImplementedError("GPT4Tokenizer cannot be saved.")
+
+    def load(self, model_file):  # always raises; model_file is ignored (byte_shuffle cannot be restored)
+        raise NotImplementedError("GPT4Tokenizer cannot be loaded.")