add auto keywords and auto-question (#2965)

### What problem does this PR solve? #2687 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-31 07:36:46 +08:00 · 2024-10-22 13:12:49 +08:00
parent 5aa9d7787e
commit 226bdd6e99
8 changed files with 119 additions and 61 deletions
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@ -28,7 +28,6 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
 from api.settings import chat_logger, retrievaler, kg_retrievaler
 from rag.app.resume import forbidden_select_fields4resume
-from rag.nlp import keyword_extraction
 from rag.nlp.search import index_name
 from rag.utils import rmSpace, num_tokens_from_string, encoder
 from api.utils.file_utils import get_project_base_directory
@ -80,6 +79,7 @@ class ConversationService(CommonService):

        return list(sessions.dicts())

+
 def message_fit_in(msg, max_length=4000):
    def count():
        nonlocal msg
@ -456,6 +456,58 @@ def rewrite(tenant_id, llm_id, question):
    return ans


+def keyword_extraction(chat_mdl, content, topn=3):
+    prompt = f"""
+Role: You're a text analyzer. 
+Task: extract the most important keywords/phrases of a given piece of text content.
+Requirements: 
+  - Summarize the text content, and give top {topn} important keywords/phrases.
+  - The keywords MUST be in language of the given piece of text content.
+  - The keywords are delimited by ENGLISH COMMA.
+  - Keywords ONLY in output.
+
+### Text Content 
+{content}
+
+"""
+    msg = [
+        {"role": "system", "content": prompt},
+        {"role": "user", "content": "Output: "}
+    ]
+    _, msg = message_fit_in(msg, chat_mdl.max_length)
+    kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
+    if isinstance(kwd, tuple): kwd = kwd[0]
+    if kwd.find("**ERROR**") >=0: return ""
+    return kwd
+
+
+def question_proposal(chat_mdl, content, topn=3):
+    prompt = f"""
+Role: You're a text analyzer. 
+Task:  propose {topn} questions about a given piece of text content.
+Requirements: 
+  - Understand and summarize the text content, and propose top {topn} important questions.
+  - The questions SHOULD NOT have overlapping meanings.
+  - The questions SHOULD cover the main content of the text as much as possible.
+  - The questions MUST be in language of the given piece of text content.
+  - One question per line.
+  - Question ONLY in output.
+
+### Text Content 
+{content}
+
+"""
+    msg = [
+        {"role": "system", "content": prompt},
+        {"role": "user", "content": "Output: "}
+    ]
+    _, msg = message_fit_in(msg, chat_mdl.max_length)
+    kwd = chat_mdl.chat(prompt, msg[1:], {"temperature": 0.2})
+    if isinstance(kwd, tuple): kwd = kwd[0]
+    if kwd.find("**ERROR**") >= 0: return ""
+    return kwd
+
+
 def full_question(tenant_id, llm_id, messages):
    if llm_id2llm_type(llm_id) == "image2text":
        chat_mdl = LLMBundle(tenant_id, LLMType.IMAGE2TEXT, llm_id)