Feat: document list and filter supports metadata filtering (#12053)

### What problem does this PR solve?

Document list and filter supports metadata filtering.

**OR within the same field, AND across different fields**

Example 1 (multi-field AND):

```markdown
Doc1 metadata: { "a": "b", "as": ["a", "b", "c"] }
Doc2 metadata: { "a": "x", "as": ["d"] }

Query:

metadata = {
  "a": ["b"],
  "as": ["d"]
}

Result:

Doc1 matches a=b but not as=d → excluded
Doc2 matches as=d but not a=b → excluded

Final result: empty
```

Example 2 (same field OR):

```markdown
Doc1 metadata: { "as": ["a", "b", "c"] }
Doc2 metadata: { "as": ["d"] }

Query:

metadata = {
  "as": ["a", "d"]
}
Result:

Doc1 matches as=a → included
Doc2 matches as=d → included

Final result: Doc1 + Doc2
```

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-12-22 09:35:11 +08:00
committed by GitHub
parent 55c0468ac9
commit 3ee47e4af7
2 changed files with 71 additions and 4 deletions

View File

@ -250,14 +250,50 @@ async def list_docs():
metadata_condition = req.get("metadata_condition", {}) or {}
if metadata_condition and not isinstance(metadata_condition, dict):
return get_data_error_result(message="metadata_condition must be an object.")
metadata = req.get("metadata", {}) or {}
if metadata and not isinstance(metadata, dict):
return get_data_error_result(message="metadata must be an object.")
doc_ids_filter = None
if metadata_condition:
metas = None
if metadata_condition or metadata:
metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
if metadata_condition:
doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
if metadata_condition.get("conditions") and not doc_ids_filter:
return get_json_result(data={"total": 0, "docs": []})
if metadata:
metadata_doc_ids = None
for key, values in metadata.items():
if not values:
continue
if not isinstance(values, list):
values = [values]
values = [str(v) for v in values if v is not None and str(v).strip()]
if not values:
continue
key_doc_ids = set()
for value in values:
key_doc_ids.update(metas.get(key, {}).get(value, []))
if metadata_doc_ids is None:
metadata_doc_ids = key_doc_ids
else:
metadata_doc_ids &= key_doc_ids
if not metadata_doc_ids:
return get_json_result(data={"total": 0, "docs": []})
if metadata_doc_ids is not None:
if doc_ids_filter is None:
doc_ids_filter = metadata_doc_ids
else:
doc_ids_filter &= metadata_doc_ids
if not doc_ids_filter:
return get_json_result(data={"total": 0, "docs": []})
if doc_ids_filter is not None:
doc_ids_filter = list(doc_ids_filter)
try:
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids_filter)