Edit chunk shall update instead of insert it (#3709)
### What problem does this PR solve?

Edit chunk shall update instead of insert it. Close #3679

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
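Note: almost every hunk below replaces `txt.split(" ")` with `txt.split()`. A minimal sketch of the difference in Python's `str.split` (the sample strings are illustrative, not taken from the repository): splitting on a literal space keeps empty tokens for consecutive spaces and never splits on tabs or newlines, while the no-argument form splits on any whitespace run and drops empty strings, which is presumably why the token-count and tokenization code below switches to it.

```python
# Illustrative strings only (not from the repository): why .split() replaces .split(" ").
text = "deep  learning\tfor\nretrieval"   # double space, a tab and a newline

print(text.split(" "))   # ['deep', '', 'learning\tfor\nretrieval'] -> empty token, tab/newline untouched
print(text.split())      # ['deep', 'learning', 'for', 'retrieval'] -> any whitespace, no empty tokens

# Token-count checks such as `len(txt.split(" ")) > 32` can therefore over- or
# under-count whenever the text contains runs of spaces, tabs or newlines.
print(len(text.split(" ")), len(text.split()))   # 3 vs 4
```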
```diff
@@ -99,11 +99,11 @@ class Pdf(PdfParser):
             i += 1
             txt = b["text"].lower().strip()
             if re.match("(abstract|摘要)", txt):
-                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                if len(txt.split()) > 32 or len(txt) > 64:
                     abstr = txt + self._line_tag(b, zoomin)
                     break
                 txt = self.boxes[i]["text"].lower().strip()
-                if len(txt.split(" ")) > 32 or len(txt) > 64:
+                if len(txt.split()) > 32 or len(txt) > 64:
                     abstr = txt + self._line_tag(self.boxes[i], zoomin)
                 i += 1
                 break
```
```diff
@@ -33,7 +33,7 @@ def chunk(filename, binary, tenant_id, lang, callback=None, **kwargs):
     txt = "\n".join([t[0] for _, t in bxs if t[0]])
     eng = lang.lower() == "english"
     callback(0.4, "Finish OCR: (%s ...)" % txt[:12])
-    if (eng and len(txt.split(" ")) > 32) or len(txt) > 32:
+    if (eng and len(txt.split()) > 32) or len(txt) > 32:
         tokenize(doc, txt, eng)
         callback(0.8, "OCR results is too long to use CV LLM.")
         return [doc]
```
```diff
@@ -325,12 +325,12 @@ def remove_contents_table(sections, eng=False):
         sections.pop(i)
         if i >= len(sections):
             break
-        prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+        prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
         while not prefix:
             sections.pop(i)
             if i >= len(sections):
                 break
-            prefix = get(i)[:3] if not eng else " ".join(get(i).split(" ")[:2])
+            prefix = get(i)[:3] if not eng else " ".join(get(i).split()[:2])
         sections.pop(i)
         if i >= len(sections) or not prefix:
             break
```
```diff
@@ -389,7 +389,7 @@ def title_frequency(bull, sections):
 def not_title(txt):
     if re.match(r"第[零一二三四五六七八九十百0-9]+条", txt):
         return False
-    if len(txt.split(" ")) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
+    if len(txt.split()) > 12 or (txt.find(" ") < 0 and len(txt) >= 32):
         return True
     return re.search(r"[,;,。;!!]", txt)

```
```diff
@@ -74,7 +74,7 @@ class FulltextQueryer:

         if not self.isChinese(txt):
             txt = FulltextQueryer.rmWWW(txt)
-            tks = rag_tokenizer.tokenize(txt).split(" ")
+            tks = rag_tokenizer.tokenize(txt).split()
             keywords = [t for t in tks if t]
             tks_w = self.tw.weights(tks, preprocess=False)
             tks_w = [(re.sub(r"[ \\\"'^]", "", tk), w) for tk, w in tks_w]
```
```diff
@@ -83,7 +83,7 @@ class FulltextQueryer:
             syns = []
             for tk, w in tks_w:
                 syn = self.syn.lookup(tk)
-                syn = rag_tokenizer.tokenize(" ".join(syn)).split(" ")
+                syn = rag_tokenizer.tokenize(" ".join(syn)).split()
                 keywords.extend(syn)
                 syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn]
                 syns.append(" ".join(syn))
```
```diff
@@ -114,7 +114,7 @@ class FulltextQueryer:

         txt = FulltextQueryer.rmWWW(txt)
         qs, keywords = [], []
-        for tt in self.tw.split(txt)[:256]: # .split(" "):
+        for tt in self.tw.split(txt)[:256]: # .split():
             if not tt:
                 continue
             keywords.append(tt)
```
```diff
@@ -125,7 +125,7 @@ class FulltextQueryer:
             tms = []
             for tk, w in sorted(twts, key=lambda x: x[1] * -1):
                 sm = (
-                    rag_tokenizer.fine_grained_tokenize(tk).split(" ")
+                    rag_tokenizer.fine_grained_tokenize(tk).split()
                     if need_fine_grained_tokenize(tk)
                     else []
                 )
```
```diff
@@ -194,7 +194,7 @@ class FulltextQueryer:
         def toDict(tks):
             d = {}
             if isinstance(tks, str):
-                tks = tks.split(" ")
+                tks = tks.split()
             for t, c in self.tw.weights(tks, preprocess=False):
                 if t not in d:
                     d[t] = 0
```
```diff
@@ -192,7 +192,7 @@ class RagTokenizer:

         # if split chars is part of token
         res = []
-        tks = re.sub(r"[ ]+", " ", tks).split(" ")
+        tks = re.sub(r"[ ]+", " ", tks).split()
         s = 0
         while True:
             if s >= len(tks):
```
```diff
@@ -329,7 +329,7 @@ class RagTokenizer:
         return self.merge_(res)

     def fine_grained_tokenize(self, tks):
-        tks = tks.split(" ")
+        tks = tks.split()
         zh_num = len([1 for c in tks if c and is_chinese(c[0])])
         if zh_num < len(tks) * 0.2:
             res = []
```
```diff
@@ -393,7 +393,7 @@ def is_alphabet(s):

 def naiveQie(txt):
     tks = []
-    for t in txt.split(" "):
+    for t in txt.split():
         if tks and re.match(r".*[a-zA-Z]$", tks[-1]
                             ) and re.match(r".*[a-zA-Z]$", t):
             tks.append(" ")
```
```diff
@@ -114,7 +114,7 @@ class Dealer:

         for k in keywords:
             kwds.add(k)
-            for kk in rag_tokenizer.fine_grained_tokenize(k).split(" "):
+            for kk in rag_tokenizer.fine_grained_tokenize(k).split():
                 if len(kk) < 2:
                     continue
                 if kk in kwds:
```
```diff
@@ -186,7 +186,7 @@ class Dealer:
         assert len(ans_v[0]) == len(chunk_v[0]), "The dimension of query and chunk do not match: {} vs. {}".format(
             len(ans_v[0]), len(chunk_v[0]))

-        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split(" ")
+        chunks_tks = [rag_tokenizer.tokenize(self.qryr.rmWWW(ck)).split()
                       for ck in chunks]
         cites = {}
         thr = 0.63
```
```diff
@@ -195,7 +195,7 @@ class Dealer:
             sim, tksim, vtsim = self.qryr.hybrid_similarity(ans_v[i],
                                                             chunk_v,
                                                             rag_tokenizer.tokenize(
-                                                                self.qryr.rmWWW(pieces_[i])).split(" "),
+                                                                self.qryr.rmWWW(pieces_[i])).split(),
                                                             chunks_tks,
                                                             tkweight, vtweight)
             mx = np.max(sim) * 0.99
```
```diff
@@ -244,8 +244,8 @@ class Dealer:
                 sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(" ")
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
```
```diff
@@ -265,8 +265,8 @@ class Dealer:
                 sres.field[i]["important_kwd"] = [sres.field[i]["important_kwd"]]
         ins_tw = []
         for i in sres.ids:
-            content_ltks = sres.field[i][cfield].split(" ")
-            title_tks = [t for t in sres.field[i].get("title_tks", "").split(" ") if t]
+            content_ltks = sres.field[i][cfield].split()
+            title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
             tks = content_ltks + title_tks + important_kwd
             ins_tw.append(tks)
```
```diff
@@ -279,8 +279,8 @@ class Dealer:
     def hybrid_similarity(self, ans_embd, ins_embd, ans, inst):
         return self.qryr.hybrid_similarity(ans_embd,
                                            ins_embd,
-                                           rag_tokenizer.tokenize(ans).split(" "),
-                                           rag_tokenizer.tokenize(inst).split(" "))
+                                           rag_tokenizer.tokenize(ans).split(),
+                                           rag_tokenizer.tokenize(inst).split())

     def retrieval(self, question, embd_mdl, tenant_ids, kb_ids, page, page_size, similarity_threshold=0.2,
                   vector_similarity_weight=0.3, top=1024, doc_ids=None, aggs=True, rerank_mdl=None, highlight=False):
```
```diff
@@ -99,7 +99,7 @@ class Dealer:
             txt = re.sub(p, r, txt)

         res = []
-        for t in rag_tokenizer.tokenize(txt).split(" "):
+        for t in rag_tokenizer.tokenize(txt).split():
             tk = t
             if (stpwd and tk in self.stop_words) or (
                     re.match(r"[0-9]$", tk) and not num):
```
```diff
@@ -150,7 +150,7 @@ class Dealer:

     def split(self, txt):
         tks = []
-        for t in re.sub(r"[ \t]+", " ", txt).split(" "):
+        for t in re.sub(r"[ \t]+", " ", txt).split():
             if tks and re.match(r".*[a-zA-Z]$", tks[-1]) and \
                     re.match(r".*[a-zA-Z]$", t) and tks and \
                     self.ne.get(t, "") != "func" and self.ne.get(tks[-1], "") != "func":
```
```diff
@@ -198,7 +198,7 @@ class Dealer:
                 s = 0

             if not s and len(t) >= 4:
-                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
                 if len(s) > 1:
                     s = np.min([freq(tt) for tt in s]) / 6.
                 else:
```
```diff
@@ -214,7 +214,7 @@ class Dealer:
             elif re.match(r"[a-z. -]+$", t):
                 return 300
             elif len(t) >= 4:
-                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split(" ") if len(tt) > 1]
+                s = [tt for tt in rag_tokenizer.fine_grained_tokenize(t).split() if len(tt) > 1]
                 if len(s) > 1:
                     return max(3, np.min([df(tt) for tt in s]) / 6.)

```
```diff
@@ -85,6 +85,9 @@ class ESConnection(DocStoreConnection):
             logging.exception("ESConnection.createIndex error %s" % (indexName))

     def deleteIdx(self, indexName: str, knowledgebaseId: str):
+        if len(knowledgebaseId) > 0:
+            # The index need to be alive after any kb deletion since all kb under this tenant are in one index.
+            return
         try:
             self.es.indices.delete(index=indexName, allow_no_indices=True)
         except NotFoundError:
```
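The `deleteIdx` hunk above is the one change that is not a tokenization fix: because all knowledge bases of a tenant live in a single shared index, deleting one knowledge base must not drop the index itself. A minimal, self-contained sketch of that guard; the `FakeES` client and every name outside the hunk are hypothetical and exist only for this illustration:

```python
# Hedged sketch of the deleteIdx guard, not the project's actual doc-store client.
class FakeES:
    """Stand-in for an Elasticsearch client, used only for this illustration."""
    def __init__(self):
        self.alive = {"ragflow_tenant_abc"}

    def delete_index(self, name):
        self.alive.discard(name)


def delete_idx(es, index_name: str, knowledgebase_id: str):
    # Mirrors the guard in the hunk above: a non-empty knowledgebaseId means a single
    # KB was removed, and the tenant's shared index must stay alive.
    if len(knowledgebase_id) > 0:
        return
    es.delete_index(index_name)


es = FakeES()
delete_idx(es, "ragflow_tenant_abc", "kb_123")  # one KB deleted -> index kept
assert "ragflow_tenant_abc" in es.alive
delete_idx(es, "ragflow_tenant_abc", "")        # tenant-level cleanup -> index dropped
assert "ragflow_tenant_abc" not in es.alive
```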
```diff
@@ -400,7 +403,7 @@ class ESConnection(DocStoreConnection):
             if not hlts:
                 continue
             txt = "...".join([a for a in list(hlts.items())[0][1]])
-            if not is_english(txt.split(" ")):
+            if not is_english(txt.split()):
                 ans[d["_id"]] = txt
                 continue

```
```diff
@@ -419,7 +419,7 @@ class InfinityConnection(DocStoreConnection):
                 v = list(v)
             elif fieldnm == "important_kwd":
                 assert isinstance(v, str)
-                v = v.split(" ")
+                v = v.split()
             else:
                 if not isinstance(v, str):
                     v = str(v)
```