Skip to content

Commit

Permalink
optimize text parser (infiniflow#2144)
Browse files Browse the repository at this point in the history
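### What problem does this PR solve?

Rewrites `parser_txt` in `deepdoc/parser/txt_parser.py`: instead of splitting the text with a regular expression and halving oversized sections, it scans the text once and merges delimiter-terminated pieces into chunks, starting a new chunk once the current one exceeds `chunk_token_num` tokens.
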
### Type of change

- [x] Performance Improvement
  • Loading branch information
KevinHuSh authored Aug 28, 2024
1 parent 54f7c6e commit a0b7c78
Showing 1 changed file with 26 additions and 10 deletions.
36 changes: 26 additions & 10 deletions deepdoc/parser/txt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,30 @@ def __call__(self, fnm, binary=None, chunk_token_num=128, delimiter="\n!?;。;
def parser_txt(cls, txt, chunk_token_num=128, delimiter="\n!?;。;!?"):
    """Split plain text into chunks made of delimiter-terminated pieces.

    The text is scanned once. Each piece runs up to and including the next
    character found in ``delimiter``. Pieces are appended to the current
    chunk until that chunk's running token count exceeds
    ``chunk_token_num``; the next piece then starts a new chunk. A chunk can
    therefore overshoot the limit by up to one piece.

    Args:
        txt: The text to split. Must be a ``str``.
        chunk_token_num: Token count above which the current chunk is
            closed. Coerced with ``int()``.
        delimiter: String whose individual characters each end a piece.

    Returns:
        A list of ``[chunk_text, ""]`` pairs, in document order. Empty
        input yields ``[["", ""]]``.

    Raises:
        TypeError: If ``txt`` is not a ``str``.
    """
    if not isinstance(txt, str):
        raise TypeError("txt type should be str!")

    # The earlier regex-based implementation coerced the limit with int();
    # keep doing so in case a caller passes a numeric string.
    limit = int(chunk_token_num)
    chunks = [""]
    token_counts = [0]

    def add_chunk(piece):
        # Token counts come from num_tokens_from_string, which is defined
        # outside this block.
        n_tokens = num_tokens_from_string(piece)
        if token_counts[-1] > limit:
            chunks.append(piece)
            token_counts.append(n_tokens)
        else:
            chunks[-1] += piece
            token_counts[-1] += n_tokens

    # NOTE: the scan starts one character past the start of each piece, so
    # the first character of a piece is never tested as a delimiter. A run
    # of consecutive delimiters is thus split as "x\n" + "\n..." rather than
    # into single-character pieces. Kept as-is to preserve chunk boundaries.
    start, pos, length = 0, 1, len(txt)
    while pos < length:
        if txt[pos] in delimiter:
            add_chunk(txt[start:pos + 1])
            start = pos + 1
            pos = start + 1
        else:
            pos += 1

    # Flush only a non-empty tail; adding "" could append a spurious empty
    # chunk when the last chunk is already over the limit.
    if start < length:
        add_chunk(txt[start:])

    return [[chunk, ""] for chunk in chunks]

0 comments on commit a0b7c78

Please sign in to comment.