Skip to content

Commit

Permalink
Fix `pytest -v`: it was printing the entire Taylor Swift text
Browse files Browse the repository at this point in the history
  • Loading branch information
karpathy committed Feb 19, 2024
1 parent e82c123 commit 37b63c2
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ print(tokenizer.encode(text))
We use the pytest library for tests. All of them are located in the `tests/` directory. First `pip install pytest` if you haven't already, then:

```bash
$ pytest .
$ pytest -v .
```

to run the tests.
to run the tests. (The `-v` flag enables verbose output, which prints each test case and is slightly prettier.)

## todos

Expand Down
17 changes: 14 additions & 3 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,23 @@
# common test data

# a few strings to test the tokenizers on
dirname = os.path.dirname(os.path.abspath(__file__))
taylorswift_file = os.path.join(dirname, "taylorswift.txt")
test_strings = [
"", # empty string
"?", # single character
"hello world!!!? (안녕하세요!) lol123 😉", # fun small string
open(taylorswift_file, "r", encoding="utf-8").read(), # big string
"FILE:taylorswift.txt", # FILE: is handled as a special string in unpack()
]
def unpack(text):
    """Resolve a test-string entry into the actual text to tokenize.

    Entries prefixed with "FILE:" name a file (relative to this test file)
    whose contents are the real test string. We keep the short placeholder in
    `test_strings` because `pytest -v` prints each parametrized argument to
    the console, and we don't want to print the entire contents of the file —
    it creates a mess.

    Returns the file contents for "FILE:..." entries, otherwise `text` itself.
    """
    prefix = "FILE:"
    if text.startswith(prefix):
        dirname = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(dirname, text[len(prefix):])
        # context manager closes the handle promptly; the original
        # open(...).read() leaked the file object until garbage collection
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return text

specials_string = """
<|endoftext|>Hello world this is one document
<|endoftext|>And this is another document
Expand Down Expand Up @@ -43,6 +52,7 @@
@pytest.mark.parametrize("tokenizer_factory", [BasicTokenizer, RegexTokenizer, GPT4Tokenizer])
@pytest.mark.parametrize("text", test_strings)
def test_encode_decode_identity(tokenizer_factory, text):
text = unpack(text)
tokenizer = tokenizer_factory()
ids = tokenizer.encode(text)
decoded = tokenizer.decode(ids)
Expand All @@ -51,6 +61,7 @@ def test_encode_decode_identity(tokenizer_factory, text):
# test that our tokenizer matches the official GPT-4 tokenizer
@pytest.mark.parametrize("text", test_strings)
def test_gpt4_tiktoken_equality(text):
text = unpack(text)
tokenizer = GPT4Tokenizer()
enc = tiktoken.get_encoding("cl100k_base")
tiktoken_ids = enc.encode(text)
Expand Down

0 comments on commit 37b63c2

Please sign in to comment.