From 37b63c28f77f9d01a2e86a55b8ac99826265a889 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 19 Feb 2024 06:54:49 -0800
Subject: [PATCH] fix pytest -v, it was printing all of taylor swift text

---
 README.md               |  4 ++--
 tests/test_tokenizer.py | 17 ++++++++++++++---
 2 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 7466f0ef..2ec5833a 100644
--- a/README.md
+++ b/README.md
@@ -55,10 +55,10 @@ print(tokenizer.encode(text))
 We use the pytest library for tests. All of them are located in the `tests/` directory. First `pip install pytest` if you haven't already, then:
 
 ```bash
-$ pytest .
+$ pytest -v .
 ```
 
-to run the tests.
+to run the tests. (-v is verbose, slightly prettier).
 
 ## todos
 
diff --git a/tests/test_tokenizer.py b/tests/test_tokenizer.py
index d41065af..439db6e8 100644
--- a/tests/test_tokenizer.py
+++ b/tests/test_tokenizer.py
@@ -8,14 +8,23 @@
 
 # common test data
 # a few strings to test the tokenizers on
-dirname = os.path.dirname(os.path.abspath(__file__))
-taylorswift_file = os.path.join(dirname, "taylorswift.txt")
 test_strings = [
     "", # empty string
     "?", # single character
     "hello world!!!? (안녕하세요!) lol123 😉", # fun small string
-    open(taylorswift_file, "r", encoding="utf-8").read(), # big string
+    "FILE:taylorswift.txt", # FILE: is handled as a special string in unpack()
 ]
+def unpack(text):
+    # we do this because `pytest -v .` prints the arguments to console, and we don't
+    # want to print the entire contents of the file, it creates a mess. So here we go.
+    if text.startswith("FILE:"):
+        dirname = os.path.dirname(os.path.abspath(__file__))
+        taylorswift_file = os.path.join(dirname, text[5:])
+        contents = open(taylorswift_file, "r", encoding="utf-8").read()
+        return contents
+    else:
+        return text
+
 specials_string = """
 <|endoftext|>Hello world this is one document
 <|endoftext|>And this is another document
@@ -43,6 +52,7 @@
 @pytest.mark.parametrize("tokenizer_factory", [BasicTokenizer, RegexTokenizer, GPT4Tokenizer])
 @pytest.mark.parametrize("text", test_strings)
 def test_encode_decode_identity(tokenizer_factory, text):
+    text = unpack(text)
     tokenizer = tokenizer_factory()
     ids = tokenizer.encode(text)
     decoded = tokenizer.decode(ids)
@@ -51,6 +61,7 @@ def test_encode_decode_identity(tokenizer_factory, text):
 # test that our tokenizer matches the official GPT-4 tokenizer
 @pytest.mark.parametrize("text", test_strings)
 def test_gpt4_tiktoken_equality(text):
+    text = unpack(text)
     tokenizer = GPT4Tokenizer()
     enc = tiktoken.get_encoding("cl100k_base")
     tiktoken_ids = enc.encode(text)
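
Note for readers of this patch: the fix works because pytest builds the test ID for each parametrized case directly from the argument value, so passing a file's full contents as a parameter makes `pytest -v` dump that entire text to the console. Below is a minimal standalone sketch of the `FILE:` indirection the patch introduces; `notes.txt` is a hypothetical fixture file used only for illustration, not part of this repo.

```python
# Sketch of the FILE: indirection from the patch above.
# Assumes a small notes.txt fixture file next to this test module.
import os

import pytest

test_strings = [
    "hello world",    # short literal: fine to show verbatim in the test ID
    "FILE:notes.txt", # marker: `pytest -v` prints this, not the file contents
]

def unpack(text):
    # expand FILE: markers lazily, inside the test body, so the
    # parametrized test IDs stay short in `pytest -v` output
    if text.startswith("FILE:"):
        dirname = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(dirname, text[5:]), encoding="utf-8") as f:
            return f.read()
    return text

@pytest.mark.parametrize("text", test_strings)
def test_utf8_roundtrip(text):
    text = unpack(text)  # resolve the marker only once the test runs
    assert text.encode("utf-8").decode("utf-8") == text
```

An alternative would be parametrize's `ids=` argument, which overrides the displayed ID while still passing the full string as the parameter; the marker approach here has the extra property of deferring the file read from collection time to test time.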