add auto keywords and auto-question (#2965)

### What problem does this PR solve?

#2687

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2024-10-22 13:12:49 +08:00
committed by GitHub
parent 5aa9d7787e
commit 226bdd6e99
8 changed files with 119 additions and 61 deletions

View File

@ -570,14 +570,3 @@ def naive_merge_docx(sections, chunk_token_num=128, delimiter="\n。"):
return cks, images
def keyword_extraction(chat_mdl, content):
prompt = """
You're a question analyzer.
1. Please give me the most important keyword/phrase of this question.
Answer format: (in language of user's question)
- keyword:
"""
kwd = chat_mdl.chat(prompt, [{"role": "user", "content": content}], {"temperature": 0.2})
if isinstance(kwd, tuple): return kwd[0]
return kwd

View File

@ -34,6 +34,7 @@ import pandas as pd
from elasticsearch_dsl import Q
from api.db import LLMType, ParserType
from api.db.services.dialog_service import keyword_extraction, question_proposal
from api.db.services.document_service import DocumentService
from api.db.services.llm_service import LLMBundle
from api.db.services.task_service import TaskService
@ -198,6 +199,23 @@ def build(row):
d["_id"] = md5.hexdigest()
d["create_time"] = str(datetime.datetime.now()).replace("T", " ")[:19]
d["create_timestamp_flt"] = datetime.datetime.now().timestamp()
if row["parser_config"].get("auto_keywords", 0):
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
d["important_kwd"] = keyword_extraction(chat_mdl, ck["content_with_weight"],
row["parser_config"]["auto_keywords"]).split(",")
d["important_tks"] = rag_tokenizer.tokenize(" ".join(d["important_kwd"]))
if row["parser_config"].get("auto_questions", 0):
chat_mdl = LLMBundle(row["tenant_id"], LLMType.CHAT, llm_name=row["llm_id"], lang=row["language"])
qst = question_proposal(chat_mdl, ck["content_with_weight"], row["parser_config"]["auto_keywords"])
ck["content_with_weight"] = f"Question: \n{qst}\n\nAnswer:\n" + ck["content_with_weight"]
qst = rag_tokenizer.tokenize(qst)
if "content_ltks" in ck:
ck["content_ltks"] += " " + qst
if "content_sm_ltks" in ck:
ck["content_sm_ltks"] += " " + rag_tokenizer.fine_grained_tokenize(qst)
if not d.get("image"):
docs.append(d)
continue