Skip to content

Commit

Permalink
Fix `pytest -v`: it was printing the entire Taylor Swift text
Browse files Browse the repository at this point in the history
  • Loading branch information
karpathy committed Feb 19, 2024
1 parent e82c123 commit 37b63c2
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 5 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,10 +55,10 @@ print(tokenizer.encode(text))
We use the pytest library for tests. All of them are located in the `tests/` directory. First `pip install pytest` if you haven't already, then:

```bash
$ pytest .
$ pytest -v .
```

to run the tests.
to run the tests. (The `-v` flag enables verbose output, which prints each test case and is slightly prettier.)

## todos

Expand Down
17 changes: 14 additions & 3 deletions tests/test_tokenizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,23 @@
# common test data

# a few strings to test the tokenizers on
dirname = os.path.dirname(os.path.abspath(__file__))
taylorswift_file = os.path.join(dirname, "taylorswift.txt")
test_strings = [
"", # empty string
"?", # single character
"hello world!!!? (안녕하세요!) lol123 😉", # fun small string
open(taylorswift_file, "r", encoding="utf-8").read(), # big string
"FILE:taylorswift.txt", # FILE: is handled as a special string in unpack()
]
def unpack(text):
    """Resolve a test-string entry into the actual text to tokenize.

    Entries prefixed with "FILE:" name a file (relative to this test file)
    whose contents are the real test string. We keep the short placeholder in
    `test_strings` because `pytest -v` prints each parametrized argument to
    the console, and we don't want to print the entire contents of the file —
    it creates a mess.

    Returns the file contents for "FILE:..." entries, otherwise `text` itself.
    """
    prefix = "FILE:"
    if text.startswith(prefix):
        dirname = os.path.dirname(os.path.abspath(__file__))
        file_path = os.path.join(dirname, text[len(prefix):])
        # context manager closes the handle promptly; the original
        # open(...).read() leaked the file object until garbage collection
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read()
    return text

specials_string = """
<|endoftext|>Hello world this is one document
<|endoftext|>And this is another document
Expand Down Expand Up @@ -43,6 +52,7 @@
@pytest.mark.parametrize("tokenizer_factory", [BasicTokenizer, RegexTokenizer, GPT4Tokenizer])
@pytest.mark.parametrize("text", test_strings)
def test_encode_decode_identity(tokenizer_factory, text):
text = unpack(text)
tokenizer = tokenizer_factory()
ids = tokenizer.encode(text)
decoded = tokenizer.decode(ids)
Expand All @@ -51,6 +61,7 @@ def test_encode_decode_identity(tokenizer_factory, text):
# test that our tokenizer matches the official GPT-4 tokenizer
@pytest.mark.parametrize("text", test_strings)
def test_gpt4_tiktoken_equality(text):
text = unpack(text)
tokenizer = GPT4Tokenizer()
enc = tiktoken.get_encoding("cl100k_base")
tiktoken_ids = enc.encode(text)
Expand Down

0 comments on commit 37b63c2

Please sign in to comment.