### What problem does this PR solve?

#4367

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Kevin Hu
Date: 2025-01-09 17:07:21 +08:00
Committed by: GitHub
Parent commit: f892d7d426
Commit: c5da3cdd97
30 changed files with 736 additions and 202 deletions


@@ -73,7 +73,8 @@ def create(tenant_id):
     chunk_method:
       type: string
       enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
-             "presentation", "picture", "one", "knowledge_graph", "email"]
+             "presentation", "picture", "one", "knowledge_graph", "email", "tag"
+             ]
       description: Chunking method.
     parser_config:
       type: object
@@ -108,6 +109,7 @@ def create(tenant_id):
         "one",
         "knowledge_graph",
         "email",
+        "tag"
     ]
     check_validation = valid(
         permission,
@@ -302,7 +304,8 @@ def update(tenant_id, dataset_id):
     chunk_method:
       type: string
       enum: ["naive", "manual", "qa", "table", "paper", "book", "laws",
-             "presentation", "picture", "one", "knowledge_graph", "email"]
+             "presentation", "picture", "one", "knowledge_graph", "email", "tag"
+             ]
       description: Updated chunking method.
     parser_config:
       type: object
@@ -339,6 +342,7 @@ def update(tenant_id, dataset_id):
         "one",
         "knowledge_graph",
         "email",
+        "tag"
     ]
     check_validation = valid(
         permission,
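The hunks above add "tag" to the `chunk_method` enum in the API docstrings and to the validated value list in the dataset create and update handlers. As a minimal sketch of exercising the new value over HTTP with `requests`: the base URL, bearer-token header, endpoint path, and the `name` field below are assumptions about the deployment, not taken from this diff.

```python
import requests

# Assumed endpoint and auth scheme for the RAGFlow HTTP API; adjust to your deployment.
BASE_URL = "http://localhost:9380"   # assumed deployment URL
API_KEY = "YOUR_API_KEY"             # hypothetical placeholder

resp = requests.post(
    f"{BASE_URL}/api/v1/datasets",
    headers={"Authorization": f"Bearer {API_KEY}"},
    json={
        "name": "tagged_kb",      # assumed field name
        "chunk_method": "tag",    # newly accepted value from this change
    },
)
resp.raise_for_status()
print(resp.json())
```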


@@ -16,6 +16,7 @@
 from flask import request, jsonify
 from api.db import LLMType, ParserType
+from api.db.services.dialog_service import label_question
 from api.db.services.knowledgebase_service import KnowledgebaseService
 from api.db.services.llm_service import LLMBundle
 from api import settings
@@ -54,7 +55,8 @@ def retrieval(tenant_id):
         page_size=top,
         similarity_threshold=similarity_threshold,
         vector_similarity_weight=0.3,
-        top=top
+        top=top,
+        rank_feature=label_question(question, [kb])
     )
     records = []
     for c in ranks["chunks"]:
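In the retrieval endpoint above, `label_question(question, [kb])` is now passed as `rank_feature`, so tag signals derived from the question can influence chunk ranking. The sketch below is an illustration of that idea only, not RAGFlow's actual implementation; the tag-weight mapping and the boost formula are assumptions.

```python
# Toy illustration: re-rank retrieved chunks with question-derived tag weights.
# All names and the boost formula below are assumptions for illustration only.

def toy_label_question(question: str, kb_tags: list[str]) -> dict[str, float]:
    """Assign a weight to each knowledge-base tag that appears in the question."""
    q = question.lower()
    return {tag: 1.0 for tag in kb_tags if tag.lower() in q}

def rerank_with_tags(chunks: list[dict], rank_feature: dict[str, float],
                     tag_weight: float = 0.2) -> list[dict]:
    """Add a tag-match bonus to each chunk's similarity score and re-sort."""
    for c in chunks:
        bonus = sum(rank_feature.get(t, 0.0) for t in c.get("tags", []))
        c["score"] = c["similarity"] + tag_weight * bonus
    return sorted(chunks, key=lambda c: c["score"], reverse=True)

# Example: the tag bonus lifts the on-topic chunk above a slightly higher raw score.
chunks = [
    {"content": "GPU setup guide", "similarity": 0.62, "tags": ["gpu"]},
    {"content": "Billing FAQ", "similarity": 0.64, "tags": ["billing"]},
]
feature = toy_label_question("How do I configure the GPU driver?", ["gpu", "billing"])
print(rerank_with_tags(chunks, feature)[0]["content"])  # -> "GPU setup guide"
```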


@@ -16,7 +16,7 @@
 import pathlib
 import datetime
-from api.db.services.dialog_service import keyword_extraction
+from api.db.services.dialog_service import keyword_extraction, label_question
 from rag.app.qa import rmPrefix, beAdoc
 from rag.nlp import rag_tokenizer
 from api.db import LLMType, ParserType
@@ -276,6 +276,7 @@ def update_doc(tenant_id, dataset_id, document_id):
         "one",
         "knowledge_graph",
         "email",
+        "tag"
     }
     if req.get("chunk_method") not in valid_chunk_method:
         return get_error_data_result(
@@ -1355,6 +1356,7 @@ def retrieval_test(tenant_id):
         doc_ids,
         rerank_mdl=rerank_mdl,
         highlight=highlight,
+        rank_feature=label_question(question, kbs)
     )
     for c in ranks["chunks"]:
         c.pop("vector", None)
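The last two hunks let a document be switched to the new "tag" chunking method in `update_doc` and feed `label_question(question, kbs)` into `retrieval_test` as a ranking feature. A minimal sketch of driving both over HTTP follows; the routes, auth header, and request field names are assumptions inferred from the handler signatures, not taken verbatim from this diff.

```python
import requests

BASE_URL = "http://localhost:9380"   # assumed deployment URL
API_KEY = "YOUR_API_KEY"             # hypothetical placeholder
HEADERS = {"Authorization": f"Bearer {API_KEY}"}

dataset_id, document_id = "<dataset_id>", "<document_id>"  # placeholders

# Switch an existing document to the newly accepted "tag" chunking method
# (route assumed from the update_doc(tenant_id, dataset_id, document_id) signature).
requests.put(
    f"{BASE_URL}/api/v1/datasets/{dataset_id}/documents/{document_id}",
    headers=HEADERS,
    json={"chunk_method": "tag"},
).raise_for_status()

# Run a retrieval request; with this change the question is also labeled
# against the datasets' tags and used as a ranking feature server-side.
resp = requests.post(
    f"{BASE_URL}/api/v1/retrieval",
    headers=HEADERS,
    json={"question": "How do I configure the GPU driver?",
          "dataset_ids": [dataset_id]},     # assumed field names
)
resp.raise_for_status()
print(resp.json())
```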