Feat: Add question parameter to edit chunk modal (#3875)

### What problem does this PR solve?

Close #3873

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Committed: 2024-12-05 14:51:19 +08:00 (via GitHub)
Commit: 56f473b680 (parent: b502dc7399)
8 changed files with 55 additions and 24 deletions


@@ -31,6 +31,7 @@ class FulltextQueryer:
         "title_sm_tks^5",
         "important_kwd^30",
         "important_tks^20",
+        "question_tks^20",
         "content_ltks^2",
         "content_sm_ltks",
     ]
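For readers unfamiliar with the `field^boost` notation above: these are Elasticsearch-style per-field weights, so a query term matching the new `question_tks` field contributes 20x the score of a plain `content_sm_ltks` match. A minimal sketch of how such weighted fields are typically used (the `multi_match` wrapper, query text, and surrounding dict are illustrative assumptions, not this project's actual query builder; the field list is abridged to the entries visible in the diff):

```python
# Hedged sketch: an Elasticsearch-style weighted full-text query over the fields
# shown in the diff. Only the dict construction is demonstrated; no client call
# is made, and the query text is a placeholder.
query_fields = [
    "title_sm_tks^5",
    "important_kwd^30",
    "important_tks^20",
    "question_tks^20",   # generated-question tokens, added by this PR
    "content_ltks^2",
    "content_sm_ltks",
]

search_body = {
    "query": {
        "multi_match": {
            "query": "how do I reset my password",
            "fields": query_fields,
            "type": "best_fields",
        }
    }
}
print(search_body["query"]["multi_match"]["fields"][3])  # question_tks^20
```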


@@ -74,7 +74,7 @@ class Dealer:
         offset, limit = pg * ps, (pg + 1) * ps
         src = req.get("fields", ["docnm_kwd", "content_ltks", "kb_id", "img_id", "title_tks", "important_kwd",
-                                 "doc_id", "position_list", "knowledge_graph_kwd",
+                                 "doc_id", "position_list", "knowledge_graph_kwd", "question_kwd", "question_tks",
                                  "available_int", "content_with_weight", "pagerank_fea"])
         kwds = set([])
@@ -251,8 +251,9 @@
         for i in sres.ids:
             content_ltks = sres.field[i][cfield].split()
             title_tks = [t for t in sres.field[i].get("title_tks", "").split() if t]
+            question_tks = [t for t in sres.field[i].get("question_tks", "").split() if t]
             important_kwd = sres.field[i].get("important_kwd", [])
-            tks = content_ltks + title_tks*2 + important_kwd*5
+            tks = content_ltks + title_tks*2 + important_kwd*5 + question_tks*6
             ins_tw.append(tks)
         sim, tksim, vtsim = self.qryr.hybrid_similarity(sres.query_vector,
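The list repetition in `tks` (`*2`, `*5`, `*6`) is how field weighting is applied on the re-rank side: duplicating a field's tokens makes its terms count proportionally more in any term-frequency-based similarity. A tiny illustration with made-up tokens (not the project's tokenizer output):

```python
from collections import Counter

# Made-up token lists standing in for one chunk's fields.
content_ltks = "reset account password in settings".split()
title_tks = "account help".split()
important_kwd = ["password"]
question_tks = "how do i reset my password".split()

# Weighted token bag mirroring the diff: title x2, keywords x5, generated questions x6.
tks = content_ltks + title_tks * 2 + important_kwd * 5 + question_tks * 6

bag = Counter(tks)
print(bag["password"])  # 12 = 1 (content) + 5 (keyword) + 6 (question)
print(bag["settings"])  # 1, plain content terms keep their base weight
```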
@@ -322,11 +323,14 @@
         sim = tsim = vsim = [1]*len(sres.ids)
         idx = list(range(len(sres.ids)))
 
+        def floor_sim(score):
+            return (int(score * 100.)%100)/100.
+
         dim = len(sres.query_vector)
         vector_column = f"q_{dim}_vec"
         zero_vector = [0.0] * dim
         for i in idx:
-            if sim[i] < similarity_threshold:
+            if floor_sim(sim[i]) < similarity_threshold:
                 break
             if len(ranks["chunks"]) >= page_size:
                 if aggs:
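`floor_sim` truncates a similarity score to two decimal places before comparing it with `similarity_threshold`. A quick numeric check of the expression (values chosen to avoid floating-point edge cases):

```python
def floor_sim(score):
    # Same expression as in the diff: drop everything past the second decimal.
    return (int(score * 100.) % 100) / 100.

print(floor_sim(0.75))        # 0.75 -> already two decimals, unchanged
print(floor_sim(0.759))       # 0.75 -> third decimal dropped before the threshold check
print(floor_sim(0.3) < 0.35)  # True, i.e. this chunk would be cut at threshold 0.35
```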
@@ -337,8 +341,6 @@
             dnm = chunk["docnm_kwd"]
             did = chunk["doc_id"]
             position_list = chunk.get("position_list", "[]")
-            if not position_list:
-                position_list = "[]"
             d = {
                 "chunk_id": id,
                 "content_ltks": chunk["content_ltks"],


@@ -255,13 +255,8 @@ def build_chunks(task, progress_callback):
         progress_callback(msg="Start to generate questions for every chunk ...")
         chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
         for d in docs:
-            qst = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"])
-            d["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + d["content_with_weight"]
-            qst = rag_tokenizer.tokenize(qst)
-            if "content_ltks" in d:
-                d["content_ltks"] += " " + qst
-            if "content_sm_ltks" in d:
-                d["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+            d["question_kwd"] = question_proposal(chat_mdl, d["content_with_weight"], task["parser_config"]["auto_questions"]).split("\n")
+            d["question_tks"] = rag_tokenizer.tokenize("\n".join(d["question_kwd"]))
         progress_callback(msg="Question generation completed in {:.2f}s".format(timer() - st))
     return docs
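The auto-question step no longer rewrites `content_with_weight`; instead each chunk gains two fields, `question_kwd` (one generated question per entry) and `question_tks` (the tokenized questions). A self-contained sketch of the resulting shape, with a made-up LLM response standing in for `question_proposal()` output and a trivial whitespace tokenizer standing in for `rag_tokenizer.tokenize`:

```python
def toy_tokenize(text: str) -> str:
    # Stand-in for rag_tokenizer.tokenize, purely for illustration.
    return " ".join(text.lower().replace("?", "").split())

# Fabricated example of a newline-separated question list from the LLM.
llm_output = "What is the refund window?\nHow do I request a refund?"

d = {"content_with_weight": "Refunds are accepted within 30 days of purchase."}
d["question_kwd"] = llm_output.split("\n")
d["question_tks"] = toy_tokenize("\n".join(d["question_kwd"]))

print(d["question_kwd"])  # ['What is the refund window?', 'How do I request a refund?']
print(d["question_tks"])  # 'what is the refund window how do i request a refund'
```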
@@ -275,9 +270,16 @@ def init_kb(row, vector_size: int):
 def embedding(docs, mdl, parser_config=None, callback=None):
     if parser_config is None:
         parser_config = {}
-    batch_size = 32
-    tts, cnts = [rmSpace(d["title_tks"]) for d in docs if d.get("title_tks")], [
-        re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", d["content_with_weight"]) for d in docs]
+    batch_size = 16
+    tts, cnts = [], []
+    for d in docs:
+        tts.append(rmSpace(d["title_tks"]))
+        c = "\n".join(d.get("question_kwd", []))
+        if not c:
+            c = d["content_with_weight"]
+        c = re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)
+        cnts.append(c)
     tk_count = 0
     if len(tts) == len(cnts):
         tts_ = np.array([])
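With the new fields in place, the embedding step now embeds the generated questions when a chunk has them and falls back to the chunk body otherwise. Below is a compact sketch of just that selection logic (the regex is copied from the diff; `rmSpace`, batching, and the model call are omitted, and the sample inputs are made up):

```python
import re

def text_for_embedding(d: dict) -> str:
    """Choose the text to embed for one chunk: generated questions if present,
    otherwise the chunk body, with simple HTML table tags stripped."""
    c = "\n".join(d.get("question_kwd", []))
    if not c:
        c = d["content_with_weight"]
    return re.sub(r"</?(table|td|caption|tr|th)( [^<>]{0,12})?>", " ", c)

# Falls back to the chunk body (tags stripped) when no questions were generated.
print(text_for_embedding({"content_with_weight": "<table><td>Q3 revenue: 1.2M</td></table>"}))
# Prefers the generated questions when they exist.
print(text_for_embedding({"content_with_weight": "ignored",
                          "question_kwd": ["What was Q3 revenue?"]}))
```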