Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Zhichang Yu
2024-11-28 13:00:38 +08:00
committed by GitHub
parent 9f57534843
commit bc701d7b4c
19 changed files with 51 additions and 46 deletions

View File

@ -99,7 +99,7 @@ class Dealer:
txt = re.sub(p, r, txt)
res = []
for t in rag_tokenizer.tokenize(txt).split(" "):
for t in rag_tokenizer.tokenize(txt).split():
tk = t
if (stpwd and tk in self.stop_words) or (
re.match(r"[0-9]$", tk) and not num):
@ -150,7 +150,7 @@ class Dealer:
def split(self, txt):
tks = []
for t in re.sub(r"[ \t]+", " ", txt).split(" "):
for t in re.sub(r"[ \t]+", " ", txt).split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
re.match(r".*[a-zA-Z]$", t) and tks and \
self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
@ -198,7 +198,7 @@ class Dealer:
s = 0
if not s and len(t) >= 4:
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
if len(s) > 1:
s = np.min([freq(tt) for tt in s]) / 6.
else:
@ -214,7 +214,7 @@ class Dealer:
elif re.match(r"[a-z. -]+$", t):
return 300
elif len(t) >= 4:
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
if len(s) > 1:
return max(3, np.min([df(tt) for tt in s]) / 6.)