From f9e19ea00d51f7d449d0cf90a5529ade6081ac04 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 17 Feb 2024 18:16:10 -0800
Subject: [PATCH] add caution around save/load of gpt4 tokenizer

---
Note (review): this hunk appends `save` and `load` methods directly after
`decode`. Both unconditionally raise NotImplementedError. The in-code comment
gives the reason: supporting them would mean adding byte_shuffle handling to
the base save/load, or moving byte_shuffle into the base class.

 minbpe/gpt4.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/minbpe/gpt4.py b/minbpe/gpt4.py
index 16d64ded..49bad7cb 100644
--- a/minbpe/gpt4.py
+++ b/minbpe/gpt4.py
@@ -77,3 +77,14 @@ def decode(self, ids):
         text_bytes = bytes(self.inverse_byte_shuffle[b] for b in text_bytes)
         text = text_bytes.decode("utf-8", errors="replace")
         return text
+
+    # save/load would require some thought.
+    # we'd have to change save/load of base to add support for byte_shuffle...
+    # alternatively, we could move byte_shuffle to base class, but that would
+    # mean that we're making ugly our beautiful Tokenizer just to support
+    # the GPT-4 tokenizer and its weird historical quirks around byte_shuffle.
+    def save(self, file_prefix):
+        raise NotImplementedError("GPT4Tokenizer cannot be saved.")
+
+    def load(self, model_file):
+        raise NotImplementedError("GPT4Tokenizer cannot be loaded.")