Edit chunk shall update instead of insert it (#3709)

### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679 

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Zhichang Yu
2024-11-28 13:00:38 +08:00
committed by GitHub
parent 9f57534843
commit bc701d7b4c
19 changed files with 51 additions and 46 deletions

View File

@ -325,12 +325,12 @@ def remove_contents_table(sections, eng=False):
sections.pop(i)
if i >= len(sections):
break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
while not prefix:
sections.pop(i)
if i >= len(sections):
break
prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
sections.pop(i)
if i >= len(sections) or not prefix:
break
@ -389,7 +389,7 @@ def title_frequency(bull, sections):
def not_title(txt):
if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
return False
if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
if len(txt.split()) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
return True
return re.search(r"[,;,。;!!]", txt)

View File

@ -74,7 +74,7 @@ class FulltextQueryer:
if not self.isChinese(txt):
txt = FulltextQueryer.rmWWW(txt)
tks = rag_tokenizer.tokenize(txt).split(" ")
tks = rag_tokenizer.tokenize(txt).split()
keywords = [t for t in tks if t]
tks_w = self.tw.weights(tks, preprocess=False)
tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
@ -83,7 +83,7 @@ class FulltextQueryer:
syns = []
for tk, w in tks_w:
syn = self.syn.lookup(tk)
syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
syn = rag_tokenizer.tokenize(" ".join(syn)).split()
keywords.extend(syn)
syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
syns.append(" ".join(syn))
@ -114,7 +114,7 @@ class FulltextQueryer:
txt = FulltextQueryer.rmWWW(txt)
qs, keywords = [], []
for tt in self.tw.split(txt)[:256]: # .split(" "):
for tt in self.tw.split(txt)[:256]: # .split():
if not tt:
continue
keywords.append(tt)
@ -125,7 +125,7 @@ class FulltextQueryer:
tms = []
for tk, w in sorted(twts, key=lambda x: x[1] * -1):
sm = (
rag_tokenizer.fine_grained_tokenize(tk).split(" ")
rag_tokenizer.fine_grained_tokenize(tk).split()
if need_fine_grained_tokenize(tk)
else []
)
@ -194,7 +194,7 @@ class FulltextQueryer:
def toDict(tks):
d = {}
if isinstance(tks, str):
tks = tks.split(" ")
tks = tks.split()
for t, c in self.tw.weights(tks, preprocess=False):
if t not in d:
d[t] = 0

View File

@ -192,7 +192,7 @@ class RagTokenizer:
# if split chars is part of token
res = []
tks = re.sub(r"[ ]+", " ", tks).split(" ")
tks = re.sub(r"[ ]+", " ", tks).split()
s = 0
while True:
if s >= len(tks):
@ -329,7 +329,7 @@ class RagTokenizer:
return self.merge_(res)
def fine_grained_tokenize(self, tks):
tks = tks.split(" ")
tks = tks.split()
zh_num = len([1 for c in tks if c and is_chinese(c[0])])
if zh_num < len(tks) * 0.2:
res = []
@ -393,7 +393,7 @@ def is_alphabet(s):
def naiveQie(txt):
tks = []
for t in txt.split(" "):
for t in txt.split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]
) and re.match(r".*[a-zA-Z]$", t):
tks.append(" ")

View File

@ -114,7 +114,7 @@ class Dealer:
for k in keywords:
kwds.add(k)
for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
for kk in rag_tokenizer.fine_grained_tokenize(k).split():
if len(kk) < 2:
continue
if kk in kwds:
@ -186,7 +186,7 @@ class Dealer:
assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
len(ans_v[0]), len(chunk_v[0]))
chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
for ck in chunks]
cites = {}
thr = 0.63
@ -195,7 +195,7 @@ class Dealer:
sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
chunk_v,
rag_tokenizer.tokenize(
self.qryr.rmWWW(pieces_[i])).split(" "),
self.qryr.rmWWW(pieces_[i])).split(),
chunks_tks,
tkweight, vtweight)
mx = np.max(sim) * 0.99
@ -244,8 +244,8 @@ class Dealer:
sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
ins_tw = []
for i in sres.ids:
content_ltks = sres.field[i][cfield].split(" ")
title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
content_ltks = sres.field[i][cfield].split()
title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
important_kwd = sres.field[i].get("important_kwd", [])
tks = content_ltks + title_tks + important_kwd
ins_tw.append(tks)
@ -265,8 +265,8 @@ class Dealer:
sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
ins_tw = []
for i in sres.ids:
content_ltks = sres.field[i][cfield].split(" ")
title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
content_ltks = sres.field[i][cfield].split()
title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
important_kwd = sres.field[i].get("important_kwd", [])
tks = content_ltks + title_tks + important_kwd
ins_tw.append(tks)
@ -279,8 +279,8 @@ class Dealer:
def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
return self.qryr.hybrid_similarity(ans_embd,
ins_embd,
rag_tokenizer.tokenize(ans).split(" "),
rag_tokenizer.tokenize(inst).split(" "))
rag_tokenizer.tokenize(ans).split(),
rag_tokenizer.tokenize(inst).split())
def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):

View File

@ -99,7 +99,7 @@ class Dealer:
txt = re.sub(p, r, txt)
res = []
for t in rag_tokenizer.tokenize(txt).split(" "):
for t in rag_tokenizer.tokenize(txt).split():
tk = t
if (stpwd and tk in self.stop_words) or (
re.match(r"[0-9]$", tk) and not num):
@ -150,7 +150,7 @@ class Dealer:
def split(self, txt):
tks = []
for t in re.sub(r"[ \t]+", " ", txt).split(" "):
for t in re.sub(r"[ \t]+", " ", txt).split():
if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
re.match(r".*[a-zA-Z]$", t) and tks and \
self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
@ -198,7 +198,7 @@ class Dealer:
s = 0
if not s and len(t) >= 4:
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
if len(s) > 1:
s = np.min([freq(tt) for tt in s]) / 6.
else:
@ -214,7 +214,7 @@ class Dealer:
elif re.match(r"[a-z. -]+$", t):
return 300
elif len(t) >= 4:
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
if len(s) > 1:
return max(3, np.min([df(tt) for tt in s]) / 6.)