add auto keywords and auto-question (#2965)

### What problem does this PR solve?

#2687

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2026-02-01 08:05:07 +08:00 · 2024-10-22 13:12:49 +08:00
parent 5aa9d7787e
commit 226bdd6e99
8 changed files with 119 additions and 61 deletions
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@@ -570,14 +570,3 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。；！？"):

    return cks, images

-
-def keyword_extraction(chat_mdl, content):
-    prompt = """
-You're a question analyzer. 
-1. Please give me the most important keyword/phrase of this question.
-Answer format: (in language of user's question)
- - keyword: 
-"""
-    kwd = chat_mdl.chat(prompt, [{"role": "user",  "content": content}], {"temperature": 0.2})
-    if isinstance(kwd, tuple): return kwd[0]
-    return kwd
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@@ -34,6 +34,7 @@ import pandas as pd
 from elasticsearch_dsl import Q

 from api.db import LLMType, ParserType
+from api.db.services.dialog_service import keyword_extraction, question_proposal
 from api.db.services.document_service import DocumentService
 from api.db.services.llm_service import LLMBundle
 from api.db.services.task_service import TaskService
@@ -198,6 +199,23 @@ def build(row):
        d["_id"] = md5.hexdigest()
        d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
        d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
+
+        if row["parser_config"].get("auto_keywords", 0):
+            chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
+            d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
+                                                    row["parser_config"]["auto_keywords"]).split(",")
+            d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
+
+        if row["parser_config"].get("auto_questions", 0):  # missing/falsy setting skips LLM question generation
+            chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
+            qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_questions"])  # fixed: was "auto_keywords" (copy-paste; KeyError when only questions are enabled)
+            ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]  # NOTE(review): this writes to ck, but d is what gets appended to docs -- confirm d sees the change
+            qst = rag_tokenizer.tokenize(qst)  # from here on qst holds the tokenized questions
+            if "content_ltks" in ck:
+                ck["content_ltks"] += " " + qst
+            if "content_sm_ltks" in ck:
+                ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
+
        if not d.get("image"):
            docs.append(d)
            continue