Tagging (#4426)

### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-02 08:35:08 +08:00 · 2025-01-09 17:07:21 +08:00
parent f892d7d426
commit c5da3cdd97
30 changed files with 736 additions and 202 deletions
--- a/rag/nlp/query.py
+++ b/rag/nlp/query.py
@ -59,13 +59,15 @@ class FulltextQueryer:
                "",
            ),
            (r"(^| )(what|who|how|which|where|why)('re|'s)? ", " "),
-            (r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down|of|to|or|and|if) ", " ")
+            (
+                r"(^| )('s|'re|is|are|were|was|do|does|did|don't|doesn't|didn't|has|have|be|there|you|me|your|my|mine|just|please|may|i|should|would|wouldn't|will|won't|done|go|for|with|so|the|a|an|by|i'm|it's|he's|she's|they|they're|you're|as|by|on|in|at|up|out|down|of|to|or|and|if) ",
+                " ")
        ]
        for r, p in patts:
            txt = re.sub(r, p, txt, flags=re.IGNORECASE)
        return txt

-    def question(self, txt, tbl="qa", min_match:float=0.6):
+    def question(self, txt, tbl="qa", min_match: float = 0.6):
        txt = re.sub(
            r"[ :|\r\n\t,，。？?/`!！&^%%()\[\]{}<>]+",
            " ",
@ -90,7 +92,8 @@ class FulltextQueryer:
                syn = ["\"{}\"^{:.4f}".format(s, w / 4.) for s in syn if s.strip()]
                syns.append(" ".join(syn))

-            q = ["({}^{:.4f}".format(tk, w) + " {})".format(syn) for (tk, w), syn in zip(tks_w, syns) if tk and not re.match(r"[.^+\(\)-]", tk)]
+            q = ["({}^{:.4f}".format(tk, w) + " {})".format(syn) for (tk, w), syn in zip(tks_w, syns) if
+                 tk and not re.match(r"[.^+\(\)-]", tk)]
            for i in range(1, len(tks_w)):
                left, right = tks_w[i - 1][0].strip(), tks_w[i][0].strip()
                if not left or not right:
@ -155,7 +158,7 @@ class FulltextQueryer:
                if len(keywords) < 32:
                    keywords.extend([s for s in tk_syns if s])
                tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
-                tk_syns = [f"\"{s}\"" if s.find(" ")>0 else s for s in tk_syns]
+                tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]

                if len(keywords) >= 32:
                    break
@ -174,8 +177,6 @@ class FulltextQueryer:

            if len(twts) > 1:
                tms += ' ("%s"~2)^1.5' % rag_tokenizer.tokenize(tt)
-            if re.match(r"[0-9a-z ]+$", tt):
-                tms = f'("{tt}" OR "%s")' % rag_tokenizer.tokenize(tt)

            syns = " OR ".join(
                [
@ -232,3 +233,25 @@ class FulltextQueryer:
        for k, v in qtwt.items():
            q += v
        return s / q
+
+    def paragraph(self, content_tks: str, keywords: list = None, keywords_topn=30):
+        """Build a weighted full-text match expression from a tokenized passage.
+
+        The ``keywords_topn`` highest-weighted tokens of ``content_tks`` are
+        escaped, OR-ed with their synonyms (synonym group boosted by 0.2),
+        boosted by their own weight, and appended to the quoted
+        caller-supplied ``keywords``.
+
+        Args:
+            content_tks: Whitespace-separated token string, or an already
+                split sequence of tokens.
+            keywords: Extra terms that are always part of the query; each is
+                stripped and wrapped in double quotes. ``None`` means none.
+            keywords_topn: Maximum number of content tokens turned into
+                query terms.
+
+        Returns:
+            A ``MatchTextExpr`` over ``self.query_fields`` whose query text is
+            the space-joined list of terms.
+        """
+        if isinstance(content_tks, str):
+            # Split into whole tokens; iterating the string itself would
+            # yield (and weigh) single characters.
+            content_tks = content_tks.split()
+        tks_w = self.tw.weights(content_tks, preprocess=False)
+
+        # A fresh list is built here, so the caller's list is never mutated.
+        keywords = [f'"{k.strip()}"' for k in keywords or []]
+        for tk, w in sorted(tks_w, key=lambda x: x[1] * -1)[:keywords_topn]:
+            tk_syns = self.syn.lookup(tk)
+            tk_syns = [FulltextQueryer.subSpecialChar(s) for s in tk_syns]
+            tk_syns = [rag_tokenizer.fine_grained_tokenize(s) for s in tk_syns if s]
+            tk_syns = [f"\"{s}\"" if s.find(" ") > 0 else s for s in tk_syns]
+            tk = FulltextQueryer.subSpecialChar(tk)
+            if not tk:
+                # Skip before wrapping: an empty token combined with synonyms
+                # would otherwise produce a malformed "( OR ...)" clause.
+                continue
+            if tk.find(" ") > 0:
+                tk = f'"{tk}"'
+            if tk_syns:
+                # Plain f-string interpolation only: applying "%" formatting
+                # to text that already embeds the token breaks on a literal "%".
+                syn_expr = " ".join(tk_syns)
+                tk = f"({tk} OR ({syn_expr})^0.2)"
+            keywords.append(f"{tk}^{w}")
+
+        # NOTE(review): len(keywords) / 10 is a float (true division); confirm
+        # how the search backend interprets a fractional minimum_should_match.
+        return MatchTextExpr(self.query_fields, " ".join(keywords), 100,
+                             {"minimum_should_match": min(3, len(keywords) / 10)})