From b36d9744ae44d3843107a16438658003bd64a060 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mathias=20Panzenb=C3=B6ck?= <134175+panzi@users.noreply.github.com> Date: Tue, 27 Jan 2026 05:45:58 +0100 Subject: [PATCH] shortcut metadata_condition if there is none (#12835) ### What problem does this PR solve? If no `metadata_condition` parameter is given then don't load the metadata of all documents into memory. Instead just pass `doc_ids` as `None` to the `retrieval()` method, which means to use all documents of the given datasets. This is relevant if you have *a lot* of documents! ### Type of change - [x] Performance Improvement --- api/apps/sdk/doc.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 2e97c1668..d073f334f 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -1526,14 +1526,18 @@ async def retrieval_test(tenant_id): if doc_id not in doc_ids_list: return get_error_data_result(f"The datasets don't own the document {doc_id}") if not doc_ids: - metadata_condition = req.get("metadata_condition", {}) or {} - metas = DocumentService.get_meta_by_kbs(kb_ids) - doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) - # If metadata_condition has conditions but no docs match, return empty result - if not doc_ids and metadata_condition.get("conditions"): - return get_result(data={"total": 0, "chunks": [], "doc_aggs": {}}) - if metadata_condition and not doc_ids: - doc_ids = ["-999"] + metadata_condition = req.get("metadata_condition") + if metadata_condition: + metas = DocumentService.get_meta_by_kbs(kb_ids) + doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) + # If metadata_condition has conditions but no docs match, return empty result + if not doc_ids and metadata_condition.get("conditions"): + return get_result(data={"total": 0, "chunks": [], "doc_aggs": {}}) + if metadata_condition and not doc_ids: + doc_ids = ["-999"] + else: + # If doc_ids is None all documents of the datasets are used + doc_ids = None similarity_threshold = float(req.get("similarity_threshold", 0.2)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) top = int(req.get("top_k", 1024))