From f4e2783eb478ae42d34ceaaa046087441628e689 Mon Sep 17 00:00:00 2001 From: Paul Lu Date: Thu, 8 Jan 2026 13:22:58 +0800 Subject: [PATCH] optimize doc id check: do not query db when doc id to validate is empty (#12500) ### What problem does this PR solve? When a knowledge base (kb) contains many documents (say, 50,000) and a retrieval is run against that kb without specifying any document IDs, there is nothing to validate, so querying the DB for all of the kb's document IDs is unnecessary and can be skipped to improve performance. ### Type of change - [x] Performance Improvement --- api/apps/sdk/doc.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index d341cea55..d8afe5f27 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -647,10 +647,10 @@ async def metadata_batch_update(dataset_id, tenant_id): for d in deletes: if not isinstance(d, dict) or not d.get("key"): return get_error_data_result(message="Each delete requires key.") - - kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id]) - target_doc_ids = set(kb_doc_ids) + if document_ids: + kb_doc_ids = KnowledgebaseService.list_documents_by_ids([dataset_id]) + target_doc_ids = set(kb_doc_ids) invalid_ids = set(document_ids) - set(kb_doc_ids) if invalid_ids: return get_error_data_result(message=f"These documents do not belong to dataset {dataset_id}: {', '.join(invalid_ids)}") @@ -1519,11 +1519,12 @@ async def retrieval_test(tenant_id): toc_enhance = req.get("toc_enhance", False) langs = req.get("cross_languages", []) if not isinstance(doc_ids, list): - return get_error_data_result("`documents` should be a list") - doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids) - for doc_id in doc_ids: - if doc_id not in doc_ids_list: - return get_error_data_result(f"The datasets don't own the document {doc_id}") + return get_error_data_result("`documents` should be a list") + if doc_ids: + doc_ids_list = KnowledgebaseService.list_documents_by_ids(kb_ids) + for doc_id 
in doc_ids: + if doc_id not in doc_ids_list: + return get_error_data_result(f"The datasets don't own the document {doc_id}") if not doc_ids: metadata_condition = req.get("metadata_condition", {}) or {} metas = DocumentService.get_meta_by_kbs(kb_ids)