From 7843c9627b34eb15c67799b335cd6017c13e8aeb Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sun, 18 Feb 2024 07:24:17 -0800
Subject: [PATCH] optimization: allow get_stats to update an existing counts
 dict. train.py runtime goes from 32s to 22s doing this. ty @gklab for
 original suggestion in a PR

---
 minbpe/base.py  |  5 +++--
 minbpe/regex.py | 10 ++++------
 train.py        |  5 +++++
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/minbpe/base.py b/minbpe/base.py
index 61c2759a..872c34e1 100644
--- a/minbpe/base.py
+++ b/minbpe/base.py
@@ -10,12 +10,13 @@
 # -----------------------------------------------------------------------------
 # a few helper functions useful for both BasicTokenizer and RegexTokenizer

-def get_stats(ids):
+def get_stats(ids, counts=None):
     """
     Given a list of integers, return a dictionary of counts of consecutive pairs
     Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+    Optionally allows to update an existing dictionary of counts
     """
-    counts = {}
+    counts = {} if counts is None else counts
     for pair in zip(ids, ids[1:]): # iterate consecutive elements
         counts[pair] = counts.get(pair, 0) + 1
     return counts
diff --git a/minbpe/regex.py b/minbpe/regex.py
index 8678d477..5aa5b22b 100644
--- a/minbpe/regex.py
+++ b/minbpe/regex.py
@@ -41,13 +41,11 @@ def train(self, text, vocab_size, verbose=False):
         merges = {} # (int, int) -> int
         vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
         for i in range(num_merges):
-            # count up the number of times every consecutive pair appears
-            chunk_stats = [get_stats(chunk_ids) for chunk_ids in ids]
-            # combine the pair counts from all chunks by summing them up
+            # count the number of times every consecutive pair appears
             stats = {}
-            for chstat in chunk_stats:
-                for pair, count in chstat.items():
-                    stats[pair] = stats.get(pair, 0) + count
+            for chunk_ids in ids:
+                # passing in stats will update it in place, adding up counts
+                get_stats(chunk_ids, stats)
             # find the pair with the highest count
             pair = max(stats, key=stats.get)
             # mint a new token: assign it the next available id
diff --git a/train.py b/train.py
index b8729ff3..b2768d3e 100644
--- a/train.py
+++ b/train.py
@@ -4,6 +4,7 @@
 """

 import os
+import time
 from minbpe import BasicTokenizer, RegexTokenizer

 # open some text and train a vocab of 512 tokens
@@ -12,6 +13,7 @@
 # create a directory for models, so we don't pollute the current directory
 os.makedirs("models", exist_ok=True)

+t0 = time.time()
 for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):

     # construct the Tokenizer object and kick off verbose training
@@ -20,3 +22,6 @@
     # writes two files in the models directory: name.model, and name.vocab
     prefix = os.path.join("models", name)
     tokenizer.save(prefix)
+t1 = time.time()
+
+print(f"Training took {t1 - t0:.2f} seconds")
\ No newline at end of file
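
A minimal usage sketch (not part of the patch itself) illustrating what the change buys: the old training loop built one counts dict per chunk and then merged them, while the patched get_stats accepts an existing dict and accumulates into it in place, skipping the intermediate dicts and the merge pass. The toy chunk data below is made up for illustration.

    # sketch only; assumes the patched minbpe/base.py shown above
    from minbpe.base import get_stats

    chunks = [[1, 2, 3, 1, 2], [2, 3, 2, 3]]  # hypothetical per-chunk token ids

    # before: one dict per chunk, then merged by summing
    stats_old = {}
    for chunk_stats in (get_stats(chunk) for chunk in chunks):
        for pair, count in chunk_stats.items():
            stats_old[pair] = stats_old.get(pair, 0) + count

    # after: a single dict, updated in place by each call
    stats_new = {}
    for chunk in chunks:
        get_stats(chunk, stats_new)

    assert stats_old == stats_new  # same pair counts, fewer allocations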