diff --git a/README.md b/README.md
index ae677760..3472b766 100644
--- a/README.md
+++ b/README.md
@@ -11,9 +11,16 @@ There are two Tokenizers in this repository, both of which can perform the 3 pri
 
 Finally, the script [train.py](train.py) trains both of these tokenizers on the input text [taylorswift.txt](taylorswift.txt) (this is the Wikipedia entry for her kek) and saves the vocab to disk for visualization. This script runs in about 25 seconds on my (M1) MacBook.
 
-## call for action
+### reproducing tiktoken GPT-4
 
-Similar to my earlier repo [llama2.c](https://github.com/karpathy/llama2.c), here I will list (and accept PRs for) any versions of this code that might implement or specialize this algorithm for different use cases, or implement it in different languages (e.g. C, Rust, JavaScript, etc.). One of these forks could then become a standard implementation people wish to use for Tokenization in LLMs, to deprecate the use of sentencepiece. I will then try to keep this repo as a small, clean reference for tokenization algorithms.
+The correctness of this code is also established by exactly reproducing the results of the [tiktoken](https://github.com/openai/tiktoken) library, i.e. its encoding/decoding of text with the GPT-4 tokenizer. In particular, we can take the `_mergeable_ranks` from the GPT-4 tokenizer:
+
+```
+enc = tiktoken.get_encoding("cl100k_base")
+mergeable_ranks = enc._mergeable_ranks
+```
+
+And use them to construct a `RegexTokenizer` that will exactly reproduce the tokenization of GPT-4. Run and step through the file [test_gpt4.py](test_gpt4.py) for details.
 
 ## License
 
diff --git a/bpe_regex.py b/bpe_regex.py
index 4fa31942..91cc43f8 100644
--- a/bpe_regex.py
+++ b/bpe_regex.py
@@ -55,6 +55,9 @@ def __init__(self):
         self.merges = {}
         self.vocab = {idx: bytes([idx]) for idx in range(256)}
         self.pattern = re.compile(GPT4_SPLIT_PATTERN)
+        # ugly optional part, only needed for GPT-4 tiktoken compatibility
+        self.byte_shuffle = None
+        self.inverse_byte_shuffle = None
 
     def train(self, text, vocab_size, verbose=False):
         assert vocab_size >= 256
@@ -97,13 +100,19 @@ def train(self, text, vocab_size, verbose=False):
     def decode(self, ids):
         # given ids (list of integers), return Python string
         text_bytes = b"".join(self.vocab[idx] for idx in ids)
+        if self.inverse_byte_shuffle is not None:
+            text_bytes = bytes(self.inverse_byte_shuffle[b] for b in text_bytes)
         text = text_bytes.decode("utf-8", errors="replace")
         return text
 
     def _encode_chunk(self, text):
         # given a string text, return the token ids
         text_bytes = text.encode("utf-8") # raw bytes
-        ids = list(text_bytes) # list of integers in range 0..255
+        # needed to repro GPT-4, because OpenAI shuffles the order of its 1-byte tokens
+        if self.byte_shuffle is not None:
+            text_bytes = bytes(self.byte_shuffle[b] for b in text_bytes)
+        # let's begin. first, convert all bytes to integers in range 0..255
+        ids = list(text_bytes)
         while len(ids) >= 2:
             # find the pair with the lowest merge index
             stats = get_stats(ids)
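As an aside (commentary, not part of the patch): here is a minimal sketch of the byte permutation that the new `byte_shuffle`/`inverse_byte_shuffle` fields compensate for, assuming `tiktoken` is installed. In `cl100k_base` the 256 single-byte tokens occupy ranks 0..255, but not in natural byte order, which is exactly why `_encode_chunk` has to remap raw bytes before merging (and `decode` has to undo it):

```python
import tiktoken

enc = tiktoken.get_encoding("cl100k_base")
ranks = enc._mergeable_ranks

# the 256 single-byte tokens take up ranks 0..255, but in a shuffled order
single_byte_ranks = [ranks[bytes([b])] for b in range(256)]
assert sorted(single_byte_ranks) == list(range(256))  # a permutation of 0..255
assert single_byte_ranks != list(range(256))          # ...and not the identity

# byte_shuffle maps each raw byte to its 1-byte token id; the inverse undoes it
byte_shuffle = {b: ranks[bytes([b])] for b in range(256)}
inverse_byte_shuffle = {v: k for k, v in byte_shuffle.items()}
assert all(inverse_byte_shuffle[byte_shuffle[b]] == b for b in range(256))
```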
+""" + +import tiktoken +from bpe_regex import Tokenizer as RegexTokenizer + +# get the official tokenizer and its merges +enc = tiktoken.get_encoding("cl100k_base") +# mergeable_ranks is the variable thing we need from the official tokenizer +mergeable_ranks = enc._mergeable_ranks + +# ----------------------------------------------------------------------------- +""" +now comes a bit tricky part. +- the `merges` that tiktoken has above contain first the 255 raw bytes, but + for some reason these bytes are permuted in a different order. This is + non-sensical, and I think historical, but for that reason we have to here + use that custom byte order manually and it looks weird but it's ok. +- second, the `merges` are already the byte sequences in their merged state. + so we have to recover the original pairings. We can do this by doing + a small BPE training run on all the tokens, in their order. +also see https://github.com/openai/tiktoken/issues/60 +""" + +def bpe(mergeable_ranks, token, max_rank): + parts = [bytes([b]) for b in token] + while True: + min_idx = None + min_rank = None + for i, pair in enumerate(zip(parts[:-1], parts[1:])): + rank = mergeable_ranks.get(pair[0] + pair[1]) + if rank is not None and (min_rank is None or rank < min_rank): + min_idx = i + min_rank = rank + if min_rank is None or (max_rank is not None and min_rank >= max_rank): + break + assert min_idx is not None + parts = parts[:min_idx] + [parts[min_idx] + parts[min_idx + 1]] + parts[min_idx + 2:] + return parts + +merges = {} +for token, rank in mergeable_ranks.items(): + if len(token) == 1: + continue # skip raw bytes + pair = tuple(bpe(mergeable_ranks, token, max_rank=rank)) + assert len(pair) == 2 + # recover the integer ranks of the pair + ix0 = mergeable_ranks[pair[0]] + ix1 = mergeable_ranks[pair[1]] + merges[(ix0, ix1)] = rank + +# ----------------------------------------------------------------------------- +# now create our own tokenizer. bit hacky +tokenizer = RegexTokenizer() +# override the merges +tokenizer.merges = merges +# and finally keep in mind we have to shuffle the bytes +tokenizer.byte_shuffle = {i: mergeable_ranks[bytes([i])] for i in range(256)} +tokenizer.inverse_byte_shuffle = {v: k for k, v in tokenizer.byte_shuffle.items()} +# re-construct the vocab +vocab = {idx: bytes([idx]) for idx in range(256)} +for (p0, p1), idx in merges.items(): + vocab[idx] = vocab[p0] + vocab[p1] +tokenizer.vocab = vocab + +# ----------------------------------------------------------------------------- +# let's take it for a spin! +text = "hello world!!!? (안녕하세요!) lol123 😉" +print(text) +print(enc.encode(text)) # tiktoken +print(tokenizer.encode(text)) # ours +print(tokenizer.decode(tokenizer.encode(text))) # ours back to text + +# two quick tests: equality (to tiktoken) and identity +print("OK" if enc.encode(text) == tokenizer.encode(text) else "FAIL") +print("OK" if text == tokenizer.decode(tokenizer.encode(text)) else "FAIL") + +# let's also tokenize all of taylor swift \ No newline at end of file