From 7843c9627b34eb15c67799b335cd6017c13e8aeb Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sun, 18 Feb 2024 07:24:17 -0800
Subject: [PATCH] optimization: allow get_stats to update an existing counts
 dict. train.py runtime goes from 32s to 22s doing this. ty @gklab for
 original suggestion in a PR

---
 minbpe/base.py  |  5 +++--
 minbpe/regex.py | 10 ++++------
 train.py        |  5 +++++
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/minbpe/base.py b/minbpe/base.py
index 61c2759a..872c34e1 100644
--- a/minbpe/base.py
+++ b/minbpe/base.py
@@ -10,12 +10,13 @@
 # -----------------------------------------------------------------------------
 # a few helper functions useful for both BasicTokenizer and RegexTokenizer

-def get_stats(ids):
+def get_stats(ids, counts=None):
     """
     Given a list of integers, return a dictionary of counts of consecutive pairs
     Example: [1, 2, 3, 1, 2] -> {(1, 2): 2, (2, 3): 1, (3, 1): 1}
+    Optionally allows to update an existing dictionary of counts
     """
-    counts = {}
+    counts = {} if counts is None else counts
     for pair in zip(ids, ids[1:]): # iterate consecutive elements
         counts[pair] = counts.get(pair, 0) + 1
     return counts
diff --git a/minbpe/regex.py b/minbpe/regex.py
index 8678d477..5aa5b22b 100644
--- a/minbpe/regex.py
+++ b/minbpe/regex.py
@@ -41,13 +41,11 @@ def train(self, text, vocab_size, verbose=False):
         merges = {} # (int, int) -> int
         vocab = {idx: bytes([idx]) for idx in range(256)} # idx -> bytes
         for i in range(num_merges):
-            # count up the number of times every consecutive pair appears
-            chunk_stats = [get_stats(chunk_ids) for chunk_ids in ids]
-            # combine the pair counts from all chunks by summing them up
+            # count the number of times every consecutive pair appears
             stats = {}
-            for chstat in chunk_stats:
-                for pair, count in chstat.items():
-                    stats[pair] = stats.get(pair, 0) + count
+            for chunk_ids in ids:
+                # passing in stats will update it in place, adding up counts
+                get_stats(chunk_ids, stats)
             # find the pair with the highest count
             pair = max(stats, key=stats.get)
             # mint a new token: assign it the next available id
diff --git a/train.py b/train.py
index b8729ff3..b2768d3e 100644
--- a/train.py
+++ b/train.py
@@ -4,6 +4,7 @@
 """

 import os
+import time
 from minbpe import BasicTokenizer, RegexTokenizer

 # open some text and train a vocab of 512 tokens
@@ -12,6 +13,7 @@
 # create a directory for models, so we don't pollute the current directory
 os.makedirs("models", exist_ok=True)

+t0 = time.time()
 for TokenizerClass, name in zip([BasicTokenizer, RegexTokenizer], ["basic", "regex"]):

     # construct the Tokenizer object and kick off verbose training
@@ -20,3 +22,6 @@
     # writes two files in the models directory: name.model, and name.vocab
     prefix = os.path.join("models", name)
     tokenizer.save(prefix)
+t1 = time.time()
+
+print(f"Training took {t1 - t0:.2f} seconds")
\ No newline at end of file
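
A minimal usage sketch (not part of the patch itself) illustrating what the change buys: the old training loop built one counts dict per chunk and then merged them, while the patched get_stats accepts an existing dict and accumulates into it in place, skipping the intermediate dicts and the merge pass. The toy chunk data below is made up for illustration.

    # sketch only; assumes the patched minbpe/base.py shown above
    from minbpe.base import get_stats

    chunks = [[1, 2, 3, 1, 2], [2, 3, 2, 3]]  # hypothetical per-chunk token ids

    # before: one dict per chunk, then merged by summing
    stats_old = {}
    for chunk_stats in (get_stats(chunk) for chunk in chunks):
        for pair, count in chunk_stats.items():
            stats_old[pair] = stats_old.get(pair, 0) + count

    # after: a single dict, updated in place by each call
    stats_new = {}
    for chunk in chunks:
        get_stats(chunk, stats_new)

    assert stats_old == stats_new  # same pair counts, fewer allocations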