Optimize doc ID check: skip the DB query when there are no doc IDs to validate (#12500)

### What problem does this PR solve?

When a knowledge base (KB) contains many documents (say, 50,000) and a retrieval is made against that KB without specifying any document IDs, querying the database for all of the KB's documents is unnecessary. Skipping that query improves performance.

### Type of change

- [x] Performance Improvement
2026-01-23 03:26:53 +08:00 · 2026-01-08 13:22:58 +08:00
parent 2fd4a3134d
commit f4e2783eb4
1 changed files with 9 additions and 8 deletions
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -647,10 +647,10 @@ async def metadata_batch_update(dataset_id, tenant_id):
    for d in deletes:
        if not isinstance(d, dict) or not d.get("key"):
            return get_error_data_result(message="Each delete requires key.")
-
-    kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
-    target_doc_ids = set(kb_doc_ids)
+   
    if document_ids:
+        kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
+        target_doc_ids = set(kb_doc_ids)
        invalid_ids = set(document_ids) - set(kb_doc_ids)
        if invalid_ids:
            return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}")
@@ -1519,11 +1519,12 @@ async def retrieval_test(tenant_id):
    toc_enhance = req.get("toc_enhance", False)
    langs = req.get("cross_languages", [])
    if not isinstance(doc_ids, list):
-        return get_error_data_result("`documents` should be a list")
-    doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
-    for doc_id in doc_ids:
-        if doc_id not in doc_ids_list:
-            return get_error_data_result(f"The datasets don't own the document {doc_id}")
+        return get_error_data_result("`documents` should be a list")   
+    if doc_ids: 
+        doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
+        for doc_id in doc_ids:
+            if doc_id not in doc_ids_list:
+                return get_error_data_result(f"The datasets don't own the document {doc_id}")
    if not doc_ids:
        metadata_condition = req.get("metadata_condition", {}) or {}
        metas = DocumentService.get_meta_by_kbs(kb_ids)