Commit

Merge pull request karpathy#13 from gklab/spelling_error
Adjust comments & block commit of the models folder
karpathy authored Feb 18, 2024
2 parents b52bf9b + cf120a4 commit ff20c92
Showing 5 changed files with 5 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 .DS_Store
+models/**/*
2 changes: 1 addition & 1 deletion minbpe/basic.py
@@ -35,7 +35,7 @@ def train(self, text, vocab_size, verbose=False):
             pair = max(stats, key=stats.get)
             # mint a new token: assign it the next available id
             idx = 256 + i
-            # replace all occurences of pair in ids with idx
+            # replace all occurrences of pair in ids with idx
             ids = merge(ids, pair, idx)
             # save the merge
             merges[pair] = idx
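The hunk above calls a `merge` helper that is not shown in this diff. As a hedged sketch of what the fixed comment describes ("replace all occurrences of pair in ids with idx"), an illustrative reimplementation might look like this (an assumption about the helper's behavior, not the repository's exact code):

```python
def merge(ids, pair, idx):
    """Replace every consecutive occurrence of `pair` in `ids` with `idx`."""
    out = []
    i = 0
    while i < len(ids):
        # match the pair only when both elements are present at this position
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            out.append(idx)
            i += 2  # skip both elements of the matched pair
        else:
            out.append(ids[i])
            i += 1
    return out

# e.g. merging the pair (1, 2) into a new token id 256:
print(merge([1, 2, 3, 1, 2], (1, 2), 256))  # [256, 3, 256]
```

Scanning left to right and consuming two elements per match means overlapping occurrences are resolved greedily, which is the usual BPE convention.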
2 changes: 1 addition & 1 deletion minbpe/regex.py
@@ -52,7 +52,7 @@ def train(self, text, vocab_size, verbose=False):
             pair = max(stats, key=stats.get)
             # mint a new token: assign it the next available id
             idx = 256 + i
-            # replace all occurences of pair in ids with idx
+            # replace all occurrences of pair in ids with idx
             ids = [merge(chunk_ids, pair, idx) for chunk_ids in ids]
             # save the merge
             merges[pair] = idx
2 changes: 1 addition & 1 deletion tests/test_tokenizer.py
@@ -39,7 +39,7 @@ def test_wikipedia_example(tokenizer_factory):
     Quick unit test, following along the Wikipedia example:
     https://en.wikipedia.org/wiki/Byte_pair_encoding
-    According to Wikipedia, running bpe on the the input string:
+    According to Wikipedia, running bpe on the input string:
     "aaabdaaabac"
     for 3 merges will result in string:
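The docstring above points at Wikipedia's worked example ("aaabdaaabac", 3 merges). As a sanity-check sketch of the training loop seen in the earlier hunks — `get_stats` and `merge` here are illustrative reimplementations, not the repository's code — the 11 input bytes compress to 5 tokens:

```python
def get_stats(ids):
    """Count consecutive pairs of token ids (assumed helper, not repo code)."""
    counts = {}
    for a, b in zip(ids, ids[1:]):
        counts[(a, b)] = counts.get((a, b), 0) + 1
    return counts

def merge(ids, pair, idx):
    """Replace all occurrences of pair in ids with idx (assumed helper)."""
    out, i = [], 0
    while i < len(ids):
        if i < len(ids) - 1 and (ids[i], ids[i + 1]) == pair:
            out.append(idx)
            i += 2
        else:
            out.append(ids[i])
            i += 1
    return out

# start from raw UTF-8 bytes; new token ids begin at 256, as in the diff
ids = list("aaabdaaabac".encode("utf-8"))
merges = {}
for i in range(3):  # "for 3 merges", per the docstring
    stats = get_stats(ids)
    pair = max(stats, key=stats.get)  # most frequent pair
    idx = 256 + i
    ids = merge(ids, pair, idx)
    merges[pair] = idx

print(len(ids))  # 5 (the 11 input bytes compress to 5 tokens)
```

The first merge picks `(97, 97)` ("aa", which occurs four times), matching the Wikipedia walkthrough's first replacement.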
2 changes: 1 addition & 1 deletion train.py
@@ -9,7 +9,7 @@
 # open some text and train a vocab of 512 tokens
 text = open("tests/taylorswift.txt", "r", encoding="utf-8").read()

-# create a directory for models so we don't pollute the current directory
+# create a directory for models, so we don't pollute the current directory
 os.makedirs("models", exist_ok=True)

 for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):
