diff --git a/minbpe/regex.py b/minbpe/regex.py index 0c282571..9ed78e43 100644 --- a/minbpe/regex.py +++ b/minbpe/regex.py @@ -146,7 +146,7 @@ def encode(self, text, allowed_special="none_raise"): return self.encode_ordinary(text) # otherwise, we have to be careful with potential special tokens in text # we handle special tokens by splitting the text - # based on the occurence of any exact match with any of the special tokens + # based on the occurrence of any exact match with any of the special tokens # we can use re.split for this. note that surrounding the pattern with () # makes it into a capturing group, so the special tokens will be included special_pattern = "(" + "|".join(re.escape(k) for k in special) + ")"