From 3ee47e4af767afe7351fae6c26e0b9e3882bccd6 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Mon, 22 Dec 2025 09:35:11 +0800 Subject: [PATCH] Feat: document list and filter supports metadata filtering (#12053) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What problem does this PR solve? Document list and filter supports metadata filtering. **OR within the same field, AND across different fields** Example 1 (multi-field AND): ```markdown Doc1 metadata: { "a": "b", "as": ["a", "b", "c"] } Doc2 metadata: { "a": "x", "as": ["d"] } Query: metadata = { "a": ["b"], "as": ["d"] } Result: Doc1 matches a=b but not as=d → excluded Doc2 matches as=d but not a=b → excluded Final result: empty ``` Example 2 (same field OR): ```markdown Doc1 metadata: { "as": ["a", "b", "c"] } Doc2 metadata: { "as": ["d"] } Query: metadata = { "as": ["a", "d"] } Result: Doc1 matches as=a → included Doc2 matches as=d → included Final result: Doc1 + Doc2 ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/document_app.py | 40 +++++++++++++++++++++++++++-- api/db/services/document_service.py | 35 +++++++++++++++++++++++-- 2 files changed, 71 insertions(+), 4 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index 3c37b5eec..137ec9ac1 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -250,14 +250,50 @@ async def list_docs(): metadata_condition = req.get("metadata_condition", {}) or {} if metadata_condition and not isinstance(metadata_condition, dict): return get_data_error_result(message="metadata_condition must be an object.") + metadata = req.get("metadata", {}) or {} + if metadata and not isinstance(metadata, dict): + return get_data_error_result(message="metadata must be an object.") doc_ids_filter = None - if metadata_condition: + metas = None + if metadata_condition or metadata: metas = DocumentService.get_flatted_meta_by_kbs([kb_id]) - doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")) + + if metadata_condition: + doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))) if metadata_condition.get("conditions") and not doc_ids_filter: return get_json_result(data={"total": 0, "docs": []}) + if metadata: + metadata_doc_ids = None + for key, values in metadata.items(): + if not values: + continue + if not isinstance(values, list): + values = [values] + values = [str(v) for v in values if v is not None and str(v).strip()] + if not values: + continue + key_doc_ids = set() + for value in values: + key_doc_ids.update(metas.get(key, {}).get(value, [])) + if metadata_doc_ids is None: + metadata_doc_ids = key_doc_ids + else: + metadata_doc_ids &= key_doc_ids + if not metadata_doc_ids: + return get_json_result(data={"total": 0, "docs": []}) + if metadata_doc_ids is not None: + if doc_ids_filter is None: + doc_ids_filter = metadata_doc_ids + else: + doc_ids_filter &= metadata_doc_ids + if not doc_ids_filter: + return get_json_result(data={"total": 0, "docs": []}) + + if doc_ids_filter is not None: + doc_ids_filter = list(doc_ids_filter) + try: docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids_filter) diff --git a/api/db/services/document_service.py b/api/db/services/document_service.py index 8308ae4fb..7b3a253b7 100644 --- a/api/db/services/document_service.py +++ b/api/db/services/document_service.py @@ -180,6 +180,16 @@ class DocumentService(CommonService): "1": 2, "2": 2 } + "metadata": { + "key1": { + "key1_value1": 1, + "key1_value2": 2, + }, + "key2": { + "key2_value1": 2, + "key2_value2": 1, + }, + } }, total where "1" => RUNNING, "2" => CANCEL """ @@ -200,19 +210,40 @@ class DocumentService(CommonService): if suffix: query = query.where(cls.model.suffix.in_(suffix)) - rows = query.select(cls.model.run, cls.model.suffix) + rows = query.select(cls.model.run, cls.model.suffix, cls.model.meta_fields) total = rows.count() suffix_counter = {} run_status_counter = {} + metadata_counter = {} for row in rows: suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1 run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1 + meta_fields = row.meta_fields or {} + if isinstance(meta_fields, str): + try: + meta_fields = json.loads(meta_fields) + except Exception: + meta_fields = {} + if not isinstance(meta_fields, dict): + continue + for key, value in meta_fields.items(): + values = value if isinstance(value, list) else [value] + for vv in values: + if vv is None: + continue + if isinstance(vv, str) and not vv.strip(): + continue + sv = str(vv) + if key not in metadata_counter: + metadata_counter[key] = {} + metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1 return { "suffix": suffix_counter, - "run_status": run_status_counter + "run_status": run_status_counter, + "metadata": metadata_counter, }, total @classmethod