From f4e2783eb478ae42d34ceaaa046087441628e689 Mon Sep 17 00:00:00 2001
From: Paul Lu <taotao.nov@gmail.com>
Date: Thu, 8 Jan 2026 13:22:58 +0800
Subject: [PATCH] optimize doc id check: do not query db when doc id to
 validate is empty (#12500)

### What problem does this PR solve?
when a kb contains many documents, say 50000, and the retrieval is only
made against some kb without specifying any doc ids, the query for all
docs from the db is not necessary, and can be omitted to improve
performance.

### Type of change

- [x] Performance Improvement
---
 api/apps/sdk/doc.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py
index d341cea55..d8afe5f27 100644
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@@ -647,10 +647,10 @@ async def metadata_batch_update(dataset_id, tenant_id):
     for d in deletes:
         if not isinstance(d, dict) or not d.get("key"):
             return get_error_data_result(message="Each delete requires key.")
-
-    kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
-    target_doc_ids = set(kb_doc_ids)
+   
     if document_ids:
+        kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id])
+        target_doc_ids = set(kb_doc_ids)
         invalid_ids = set(document_ids) - set(kb_doc_ids)
         if invalid_ids:
             return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}")
@@ -1519,11 +1519,12 @@ async def retrieval_test(tenant_id):
     toc_enhance = req.get("toc_enhance", False)
     langs = req.get("cross_languages", [])
     if not isinstance(doc_ids, list):
-        return get_error_data_result("`documents` should be a list")
-    doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
-    for doc_id in doc_ids:
-        if doc_id not in doc_ids_list:
-            return get_error_data_result(f"The datasets don't own the document {doc_id}")
+        return get_error_data_result("`documents` should be a list")   
+    if doc_ids: 
+        doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids)
+        for doc_id in doc_ids:
+            if doc_id not in doc_ids_list:
+                return get_error_data_result(f"The datasets don't own the document {doc_id}")
     if not doc_ids:
         metadata_condition = req.get("metadata_condition", {}) or {}
         metas = DocumentService.get_meta_by_kbs(kb_ids)