Upgrade the laws parser for docx files (#1332)

### What problem does this PR solve?

### Type of change

- [x] Refactoring
2026-01-30 23:26:36 +08:00 · 2024-07-01 15:50:24 +08:00
parent 5eb21b9c7c
commit 92e9320657
4 changed files with 56 additions and 53 deletions
--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@@ -20,7 +20,7 @@ from flask_login import login_required, current_user
 from elasticsearch_dsl import Q

 from rag.app.qa import rmPrefix, beAdoc
-from rag.nlp import search, rag_tokenizer
+from rag.nlp import search, rag_tokenizer, keyword_extraction
 from rag.utils.es_conn import ELASTICSEARCH
 from rag.utils import rmSpace
 from api.db import LLMType, ParserType
@@ -268,6 +268,10 @@ def retrieval_test():
            rerank_mdl = TenantLLMService.model_instance(
                kb.tenant_id, LLMType.RERANK.value, llm_name=req["rerank_id"])

+        if req.get("keyword", False):
+            chat_mdl = TenantLLMService.model_instance(kb.tenant_id, LLMType.CHAT)
+            question += keyword_extraction(chat_mdl, question)
+
        ranks = retrievaler.retrieval(question, embd_mdl, kb.tenant_id, [kb_id], page, size,
                                      similarity_threshold, vector_similarity_weight, top,
                                      doc_ids, rerank_mdl=rerank_mdl)
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@@ -23,7 +23,7 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMService, TenantLLMService, LLMBundle
 from api.settings import chat_logger, retrievaler
 from rag.app.resume import forbidden_select_fields4resume
-from rag.nlp.rag_tokenizer import is_chinese
+from rag.nlp import keyword_extraction
 from rag.nlp.search import index_name
 from rag.utils import rmSpace, num_tokens_from_string, encoder

@@ -121,6 +121,8 @@ def chat(dialog, messages, stream=True, **kwargs):
    if "knowledge" not in [p["key"] for p in prompt_config["parameters"]]:
        kbinfos = {"total": 0, "chunks": [], "doc_aggs": []}
    else:
+        if prompt_config.get("keyword", False):
+            questions[-1] += keyword_extraction(chat_mdl, questions[-1])
        kbinfos = retrievaler.retrieval(" ".join(questions), embd_mdl, dialog.tenant_id, dialog.kb_ids, 1, dialog.top_n,
                                        dialog.similarity_threshold,
                                        dialog.vector_similarity_weight,