Fix Split Operation For Merging Modes (#311)
* Fix Split Operation For Merging Modes

* Format Tests

* Update Stats

* Update Stats
apaniukov authored Nov 7, 2024
1 parent 306dcd8 commit 45f441a
Showing 9 changed files with 4,332 additions and 4,310 deletions.
80 changes: 40 additions & 40 deletions README.md
@@ -603,163 +603,163 @@ This report is autogenerated and includes tokenizers and detokenizers tests. The
<tr>
<td >SentencePiece</td>
<td >NousResearch/Llama-2-13b-hf</td>
-<td >96.73</td>
+<td >97.55</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >NousResearch/Llama-2-13b-hf_legacy</td>
-<td >95.92</td>
+<td >NousResearch/Llama-2-13b-hf_legacy_sp_backend</td>
+<td >97.55</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >NousResearch/Llama-2-13b-hf_sp_backend</td>
-<td >95.10</td>
+<td >94.29</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0</td>
-<td >96.76</td>
+<td >94.33</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy</td>
+<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_legacy_sp_backend</td>
<td >95.14</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >TinyLlama/TinyLlama-1.1B-Chat-v1.0_sp_backend</td>
-<td >94.33</td>
+<td >96.76</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >baichuan-inc/Baichuan2-7B-Chat_legacy</td>
+<td >baichuan-inc/Baichuan2-7B-Chat_legacy_sp_backend</td>
<td >100.00</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >camembert-base</td>
-<td >52.24</td>
+<td >camembert-base_legacy_sp_backend</td>
+<td >75.51</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >camembert-base_legacy</td>
-<td >75.51</td>
+<td >camembert-base_sp_backend</td>
+<td >52.24</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >facebook/musicgen-small</td>
-<td >83.67</td>
+<td >facebook/musicgen-small_legacy_sp_backend</td>
+<td >78.37</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >facebook/musicgen-small_legacy</td>
-<td >78.37</td>
+<td >facebook/musicgen-small_sp_backend</td>
+<td >83.67</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct</td>
-<td >95.95</td>
+<td >95.14</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >microsoft/Phi-3-mini-128k-instruct_legacy</td>
+<td >microsoft/Phi-3-mini-128k-instruct_legacy_sp_backend</td>
<td >94.33</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >microsoft/Phi-3-mini-128k-instruct_sp_backend</td>
-<td >95.14</td>
+<td >95.95</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >microsoft/deberta-v3-base</td>
-<td >96.73</td>
+<td >microsoft/deberta-v3-base_legacy_sp_backend</td>
+<td >100.00</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >microsoft/deberta-v3-base_legacy</td>
-<td >100.00</td>
+<td >microsoft/deberta-v3-base_sp_backend</td>
+<td >96.73</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >mlx-community/quantized-gemma-7b-it</td>
-<td >96.76</td>
+<td >97.57</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >mlx-community/quantized-gemma-7b-it_legacy</td>
+<td >mlx-community/quantized-gemma-7b-it_legacy_sp_backend</td>
<td >97.57</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
<td >mlx-community/quantized-gemma-7b-it_sp_backend</td>
-<td >97.57</td>
+<td >96.76</td>
<td >247</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >rinna/bilingual-gpt-neox-4b</td>
-<td >82.04</td>
+<td >rinna/bilingual-gpt-neox-4b_legacy_sp_backend</td>
+<td >86.12</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >rinna/bilingual-gpt-neox-4b_legacy</td>
-<td >86.12</td>
+<td >rinna/bilingual-gpt-neox-4b_sp_backend</td>
+<td >80.41</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >t5-base</td>
-<td >85.31</td>
+<td >t5-base_legacy_sp_backend</td>
+<td >80.00</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >t5-base_legacy</td>
-<td >80.00</td>
+<td >t5-base_sp_backend</td>
+<td >85.31</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >xlm-roberta-base</td>
+<td >xlm-roberta-base_legacy_sp_backend</td>
<td >95.10</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >xlm-roberta-base_legacy</td>
+<td >xlm-roberta-base_sp_backend</td>
<td >95.10</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >xlnet-base-cased</td>
-<td >64.49</td>
+<td >xlnet-base-cased_legacy_sp_backend</td>
+<td >57.96</td>
<td >245</td>
</tr>
<tr>
<td >SentencePiece</td>
-<td >xlnet-base-cased_legacy</td>
-<td >57.96</td>
+<td >xlnet-base-cased_sp_backend</td>
+<td >64.49</td>
<td >245</td>
</tr>
<tr>
2 changes: 1 addition & 1 deletion python/openvino_tokenizers/convert_tokenizer.py
@@ -16,8 +16,8 @@
    TokenzierConversionParams,
    change_inputs_type,
    change_outputs_type,
-    update_rt_info_with_params,
    update_rt_info_with_environment,
+    update_rt_info_with_params,
)


2 changes: 1 addition & 1 deletion python/openvino_tokenizers/tokenizer_pipeline.py
@@ -218,7 +218,7 @@ def replace_spaces_metaspace(cls, replace_term=r"▁") -> "RegexNormalizationStep":

    @classmethod
    def prepend_regex(cls, string: str) -> "RegexNormalizationStep":
-        return cls(regex_search_pattern=r"(^)(.+)", replace_term=rf"{string}$2")
+        return cls(regex_search_pattern=r"(^)(.)", replace_term=rf"{string}$2")

    @classmethod
    def prepend_with_check_regex(cls, string: str, check_string: str) -> "RegexNormalizationStep":
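For intuition about this one-character change: both patterns prepend `string` at the start of the input, but the new pattern captures only the first character rather than the whole remainder, so the match window stays minimal. A rough sketch with Python's `re` module (the op itself runs on a different regex engine, so only the capturing semantics are comparable):

import re

# Rough illustration with Python's `re`; only the capturing behaviour is
# comparable to the engine used by the normalization op.
old = re.sub(r"(^)(.+)", r"▁\2", "hello")  # group 2 captures "hello"
new = re.sub(r"(^)(.)", r"▁\2", "hello")   # group 2 captures only "h"
print(old, new)  # ▁hello ▁hello, same output with a smaller match
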
2 changes: 1 addition & 1 deletion src/combine_segments.cpp
@@ -11,7 +11,7 @@ void CombineSegments::validate_and_infer_types() {
    OPENVINO_ASSERT(get_input_size() > 0);
    OPENVINO_ASSERT((get_input_size() - 1)%3 == 0);

-    // First come several ragged tensors each represented as 3 regular tesors
+    // First come several ragged tensors each represented as 3 regular tensors
    size_t num_inputs = (get_input_size() - 1)/3;
    PartialShape ps = PartialShape::dynamic();
    element::Type et = element::dynamic;
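For readers new to the code, a minimal sketch of the three-tensor ragged layout the fixed comment refers to, inferred from the begins/ends/chars triples used throughout these ops; the names and data here are illustrative only:

import numpy as np

# Illustrative only: a ragged tensor as (begins, ends, values), where row i
# of the ragged dimension is values[begins[i]:ends[i]].
begins = np.array([0, 2, 2])
ends = np.array([2, 2, 5])               # row 1 is empty (begin == end)
values = np.array([10, 11, 20, 21, 22])

rows = [values[b:e].tolist() for b, e in zip(begins, ends)]
print(rows)  # [[10, 11], [], [20, 21, 22]]
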
14 changes: 11 additions & 3 deletions src/regex_split.cpp
@@ -204,7 +204,7 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
        new_ragged_begins[seq] = ragged_offset;

        for(size_t ragged_col = ragged_begins[seq]; ragged_col < ragged_ends[seq]; ++ragged_col) {
-            auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]);
+            const auto str = std::string(chars + begins[ragged_col], chars + ends[ragged_col]);

            if (skips[ragged_col]) {
                new_begins[ragged_offset] = begins[ragged_col];
@@ -230,15 +230,15 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
                case (SplitMode::CONTIGUOUS):
                    OPENVINO_THROW("Prior to evaluate 'contiguous' mode should've been replaced with 'isolated'.");
                    break;
-                case (SplitMode::MERGED_WITH_NEXT):
+                case (SplitMode::MERGED_WITH_PREVIOUS):
                    if (invert == false && end != str.length()) {
                        last_begin = begin;
                        return;
                    } else if (invert == true) {
                        begin = last_begin;
                    }
                    break;
-                case (SplitMode::MERGED_WITH_PREVIOUS):
+                case (SplitMode::MERGED_WITH_NEXT):
                    if (invert == false) {
                        if (last_begin != -1) { begin = last_begin; }
                    } else {
@@ -248,6 +248,10 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
                    break;
            }

+            // Clamp begin and end to the string length
+            begin = std::max(0, begin);
+            end = std::min(static_cast<int>(str.length()), end);
+
            new_begins[ragged_offset] = begins[ragged_col] + begin;
            if (num_splits == m_max_splits) {
                end = str.length();
@@ -276,6 +280,10 @@ bool RegexSplit::evaluate(ov::TensorVector& outputs, const ov::TensorVector& inputs) const {
            if (start < str.length()) {
                if (has_skips) { new_skips[ragged_offset] = false; }
                add_split(start, str.length(), m_invert);
+            } else if (m_split_mode == SplitMode::MERGED_WITH_NEXT && last_begin != str.length()) {
+                // Add last split if the match was at the end of the string
+                if (has_skips) { new_skips[ragged_offset] = false; }
+                add_split(last_begin, str.length(), m_invert);
            }
        }
    }
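To make the fixed semantics concrete: with `mergedwithprevious` each match is glued onto the preceding fragment, with `mergedwithnext` onto the following one, and a match at the very end of the string must still yield a final split, which is the case the new `MERGED_WITH_NEXT` branch above handles. A plain-Python sketch that reproduces the expected tuples from the new layer tests below; it is an illustration of the behaviour, not the operator's implementation, and handles only a single literal delimiter:

import re

def merged_split(text: str, delim: str, behaviour: str) -> list[str]:
    # Illustration only: split `text` on a literal delimiter, merging each
    # delimiter with the next or the previous fragment.
    parts = [p for p in re.split(f"({re.escape(delim)})", text) if p]
    tokens: list[str] = []
    open_token = False  # mergedwithnext: last token still waits for text
    for part in parts:
        if part == delim:
            if behaviour == "mergedwithnext":
                tokens.append(part)  # delimiter opens a new token
                open_token = True
            elif tokens:             # mergedwithprevious: attach to previous
                tokens[-1] += part
            else:                    # delimiter at the start stays alone
                tokens.append(part)
        elif behaviour == "mergedwithnext" and open_token:
            tokens[-1] += part       # text joins the delimiter before it
            open_token = False
        else:
            tokens.append(part)
    return tokens or [text]

print(merged_split("▁one▁two▁three▁", "▁", "mergedwithnext"))
# ['▁one', '▁two', '▁three', '▁']  -- trailing match still emitted
print(merged_split("▁one▁two▁three▁", "▁", "mergedwithprevious"))
# ['▁', 'one▁', 'two▁', 'three▁']
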
9 changes: 4 additions & 5 deletions tests/conftest.py
@@ -40,6 +40,7 @@ def add_tokenizer_type(row):

    results_df = get_session_results_df(session)
    results_df["Tokenizer Type"] = results_df.apply(add_tokenizer_type, axis=1)
+    results_df = results_df[results_df.status != "skipped"]  # filter skipped tests
    results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_bpe_tokenizers_param, inplace=True)
    results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_sentencepiece_tokenizers_param, inplace=True)
    results_df.hf_wordpiece_tokenizers_param.fillna(results_df.hf_tiktoken_tokenizers_param, inplace=True)
@@ -55,15 +56,13 @@ def add_tokenizer_type(row):
    results_df.hf_wordpiece_tokenizers_param.fillna(
        results_df.hf_tiktoken_tokenizers_with_padding_sides_param, inplace=True
    )
-    results_df.is_fast_tokenizer_param.fillna(True, inplace=True)
-    results_df = results_df[results_df.status != "skipped"]  # filter skipped tests
    results_df.status = (results_df.status == "passed").astype(int)
+    results_df = results_df.dropna(subset=['hf_wordpiece_tokenizers_param'])
    results_df["Model"] = (
        results_df.hf_wordpiece_tokenizers_param
-        + results_df.is_fast_tokenizer_param.apply(lambda x: "" if x else "_legacy")
-        + results_df.is_sentencepiece_backend_param.apply(lambda x: "" if x else "_sp_backend")
+        + ["_legacy" * value for value in results_df.index.str.contains("Slow")]
+        + ["_sp_backend" * value for value in results_df.index.str.contains("sp_backend")]
    )

    results_df = results_df[["Tokenizer Type", "Model", "test_string", "status"]]
    grouped_by_model = results_df.groupby(["Tokenizer Type", "Model"]).agg({"status": ["mean", "count"]}).reset_index()
    grouped_by_model.columns = ["Tokenizer Type", "Model", "Output Matched, %", "Number of Tests"]
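A side note on the new model-name construction: multiplying a string by a boolean yields either the string or `""`, since Python booleans behave as the integers 1 and 0, so each suffix is appended only where the test ID contains the marker. A standalone illustration with hypothetical test IDs:

import pandas as pd

# Hypothetical test IDs; only the shape of the expression matters here.
index = pd.Index(["test_sp[Slow-x]", "test_sp[Fast-x]", "test_sp[sp_backend-x]"])
legacy = ["_legacy" * value for value in index.str.contains("Slow")]
backend = ["_sp_backend" * value for value in index.str.contains("sp_backend")]
print(legacy)   # ['_legacy', '', '']
print(backend)  # ['', '', '_sp_backend']
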
14 changes: 14 additions & 0 deletions tests/layer_tests.py
@@ -170,6 +170,20 @@ def create_splitting_model(layer: PreTokenizatinStep) -> ov.CompiledModel:
        ("Hello world!", ("Hello", "world!"), RegexSplitStep.bert_whitespace_splitter()),
        ("", ("",), RegexSplitStep.whitespace_splitter()),
        *[(prompt, tuple(re_clip_splitter.findall(prompt)), clip_splitter) for prompt in text2image_prompts],
+        (
+            "▁one▁two▁three▁",
+            ("▁one", "▁two", "▁three", "▁"),
+            RegexSplitStep(split_pattern="▁", behaviour="mergedwithnext"),
+        ),
+        ("▁", ("▁",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithnext")),
+        ("No split pattern", ("No split pattern",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithnext")),
+        (
+            "▁one▁two▁three▁",
+            ("▁", "one▁", "two▁", "three▁"),
+            RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious"),
+        ),
+        ("▁", ("▁",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious")),
+        ("No split pattern", ("No split pattern",), RegexSplitStep(split_pattern="▁", behaviour="mergedwithprevious")),
    ],
)
def test_regex_split(test_string, expected, layer):
3 changes: 2 additions & 1 deletion tests/pass_rates.json
@@ -1,3 +1,4 @@
{
-    "tests/tokenizers_test.py::test_": 0.9247631283121889
+    "tests/tokenizers_test.py::test_": 0.9247631283121889,
+    "tokenizers_test.py::test_": 0.9247631283121889
}