Editing a chunk should update it instead of inserting a new one (#3709)

### What problem does this PR solve?

Editing a chunk should update the existing chunk instead of inserting a new one. Closes #3679.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Zhichang Yu
2024-11-28 13:00:38 +08:00
committed by GitHub
parent 9f57534843
commit bc701d7b4c
19 changed files with 51 additions and 46 deletions

View File

@ -192,7 +192,7 @@ class RagTokenizer:
# if split chars is part of token
res = []
tks = re.sub(r"[ ]+", " ", tks).split(" ")
tks = re.sub(r"[ ]+", " ", tks).split()
s = 0
while True:
if s >= len(tks):
@ -329,7 +329,7 @@ class RagTokenizer:
return self.merge_(res)
def fine_grained_tokenize(self, tks):
tks = tks.split(" ")
tks = tks.split()
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
if zh_num < len(tks) * 0.2:
res = []
@ -393,7 +393,7 @@ def is_alphabet(s):
def naiveQie(txt):
tks = []
for t in txt.split(" "):
for t in txt.split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]
) and re.match(r".*[a-zA-Z]$", t):
tks.append(" ")