Feat: message management (#12083)

### What problem does this PR solve?

Message CRUD. Issue #4213

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-12-26 00:46:52 +08:00 · 2025-12-23 21:16:25 +08:00
parent bab6a4a219
commit 17b8bb62b6
49 changed files with 3480 additions and 1031 deletions
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@ -19,11 +19,12 @@ import json
 import re
 from collections import defaultdict

-from rag.utils.doc_store_conn import MatchTextExpr
+from common.query_base import QueryBase
+from common.doc_store.doc_store_base import MatchTextExpr
 from rag.nlp import rag_tokenizer, term_weight, synonym


-class FulltextQueryer:
+class FulltextQueryer(QueryBase):
    def __init__(self):
        self.tw = term_weight.Dealer()
        self.syn = synonym.Dealer()
@ -37,64 +38,19 @@ class FulltextQueryer:
            "content_sm_ltks",
        ]

-    @staticmethod
-    def sub_special_char(line):
-        return re.sub(r"([:\{\}/\[\]\-\*\"\(\)\|\+~\^])", r"\\\1", line).strip()
-
-    @staticmethod
-    def is_chinese(line):
-        arr = re.split(r"[ \t]+", line)
-        if len(arr) <= 3:
-            return True
-        e = 0
-        for t in arr:
-            if not re.match(r"[a-zA-Z]+$", t):
-                e += 1
-        return e * 1.0 / len(arr) >= 0.7
-
-    @staticmethod
-    def rmWWW(txt):
-        patts = [
-            (
-                r"是*(怎么办|什么样的|哪家|一下|那家|请问|啥样|咋样了|什么时候|何时|何地|何人|是否|是不是|多少|哪里|怎么|哪儿|怎么样|如何|哪些|是啥|啥是|啊|吗|呢|吧|咋|什么|有没有|呀|谁|哪位|哪个)是*",
-                "",
-            ),
-            (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
-            (
-                r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down|of|to|or|and|if) ",
-                " ")
-        ]
-        otxt = txt
-        for r, p in patts:
-            txt = re.sub(r, p, txt, flags=re.IGNORECASE)
-        if not txt:
-            txt = otxt
-        return txt
-
-    @staticmethod
-    def add_space_between_eng_zh(txt):
-        # (ENG/ENG+NUM) + ZH
-        txt = re.sub(r'([A-Za-z]+[0-9]+)([\u4e00-\u9fa5]+)', r'\1 \2', txt)
-        # ENG + ZH
-        txt = re.sub(r'([A-Za-z])([\u4e00-\u9fa5]+)', r'\1 \2', txt)
-        # ZH + (ENG/ENG+NUM)
-        txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z]+[0-9]+)', r'\1 \2', txt)
-        txt = re.sub(r'([\u4e00-\u9fa5]+)([A-Za-z])', r'\1 \2', txt)
-        return txt
-
    def question(self, txt, tbl="qa", min_match: float = 0.6):
        original_query = txt
-        txt = FulltextQueryer.add_space_between_eng_zh(txt)
+        txt = self.add_space_between_eng_zh(txt)
        txt = re.sub(
            r"[ :|\r\n\t,，。？?/`!！&^%%()\[\]{}<>]+",
            " ",
            rag_tokenizer.tradi2simp(rag_tokenizer.strQ2B(txt.lower())),
        ).strip()
        otxt = txt
-        txt = FulltextQueryer.rmWWW(txt)
+        txt = self.rmWWW(txt)

        if not self.is_chinese(txt):
-            txt = FulltextQueryer.rmWWW(txt)
+            txt = self.rmWWW(txt)
            tks = rag_tokenizer.tokenize(txt).split()
            keywords = [t for t in tks if t]
            tks_w = self.tw.weights(tks, preprocess=False)
@ -138,7 +94,7 @@ class FulltextQueryer:
                return False
            return True

-        txt = FulltextQueryer.rmWWW(txt)
+        txt = self.rmWWW(txt)
        qs, keywords = [], []
        for tt in self.tw.split(txt)[:256]:  # .split():
            if not tt:
@ -164,7 +120,7 @@ class FulltextQueryer:
                    )
                    for m in sm
                ]
-                sm = [FulltextQueryer.sub_special_char(m) for m in sm if len(m) > 1]
+                sm = [self.sub_special_char(m) for m in sm if len(m) > 1]
                sm = [m for m in sm if len(m) > 1]

                if len(keywords) < 32:
@ -172,7 +128,7 @@ class FulltextQueryer:
                    keywords.extend(sm)

                tk_syns = self.syn.lookup(tk)
-                tk_syns = [FulltextQueryer.sub_special_char(s) for s in tk_syns]
+                tk_syns = [self.sub_special_char(s) for s in tk_syns]
                if len(keywords) < 32:
                    keywords.extend([s for s in tk_syns if s])
                tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
@ -181,7 +137,7 @@ class FulltextQueryer:
                if len(keywords) >= 32:
                    break

-                tk = FulltextQueryer.sub_special_char(tk)
+                tk = self.sub_special_char(tk)
                if tk.find(" ") > 0:
                    tk = '"%s"' % tk
                if tk_syns:
@ -199,7 +155,7 @@ class FulltextQueryer:
            syns = " OR ".join(
                [
                    '"%s"'
-                    % rag_tokenizer.tokenize(FulltextQueryer.sub_special_char(s))
+                    % rag_tokenizer.tokenize(self.sub_special_char(s))
                    for s in syns
                ]
            )
@ -264,10 +220,10 @@ class FulltextQueryer:
        keywords = [f'"{k.strip()}"' for k in keywords]
        for tk, w in sorted(tks_w, key=lambda x: x[1] * -1)[:keywords_topn]:
            tk_syns = self.syn.lookup(tk)
-            tk_syns = [FulltextQueryer.sub_special_char(s) for s in tk_syns]
+            tk_syns = [self.sub_special_char(s) for s in tk_syns]
            tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
            tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]
-            tk = FulltextQueryer.sub_special_char(tk)
+            tk = self.sub_special_char(tk)
            if tk.find(" ") > 0:
                tk = '"%s"' % tk
            if tk_syns:
--- a/rag/nlp/search.py
+++ b/rag/nlp/search.py
@ -24,7 +24,7 @@ from dataclasses import dataclass
 from rag.prompts.generator import relevant_chunks_with_toc
 from rag.nlp import rag_tokenizer, query
 import numpy as np
-from rag.utils.doc_store_conn import DocStoreConnection, MatchDenseExpr, FusionExpr, OrderByExpr
+from common.doc_store.doc_store_base import MatchDenseExpr, FusionExpr, OrderByExpr, DocStoreConnection
 from common.string_utils import remove_redundant_spaces
 from common.float_utils import get_float
 from common.constants import PAGERANK_FLD, TAG_FLD
@ -155,7 +155,7 @@ class Dealer:
                    kwds.add(kk)

        logging.debug(f"TOTAL: {total}")
-        ids = self.dataStore.get_chunk_ids(res)
+        ids = self.dataStore.get_doc_ids(res)
        keywords = list(kwds)
        highlight = self.dataStore.get_highlight(res, keywords, "content_with_weight")
        aggs = self.dataStore.get_aggregation(res, "docnm_kwd")
@ -545,7 +545,7 @@ class Dealer:
        return res

    def all_tags(self, tenant_id: str, kb_ids: list[str], S=1000):
-        if not self.dataStore.indexExist(index_name(tenant_id), kb_ids[0]):
+        if not self.dataStore.index_exist(index_name(tenant_id), kb_ids[0]):
            return []
        res = self.dataStore.search([], [], {}, [], OrderByExpr(), 0, 0, index_name(tenant_id), kb_ids, ["tag_kwd"])
        return self.dataStore.get_aggregation(res, "tag_kwd")