mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-22 14:16:42 +08:00
Feat: document list and filter supports metadata filtering (#12053)
### What problem does this PR solve?
Document list and filter supports metadata filtering.
**OR within the same field, AND across different fields**
Example 1 (multi-field AND):
```markdown
Doc1 metadata: { "a": "b", "as": ["a", "b", "c"] }
Doc2 metadata: { "a": "x", "as": ["d"] }
Query:
metadata = {
"a": ["b"],
"as": ["d"]
}
Result:
Doc1 matches a=b but not as=d → excluded
Doc2 matches as=d but not a=b → excluded
Final result: empty
```
Example 2 (same field OR):
```markdown
Doc1 metadata: { "as": ["a", "b", "c"] }
Doc2 metadata: { "as": ["d"] }
Query:
metadata = {
"as": ["a", "d"]
}
Result:
Doc1 matches as=a → included
Doc2 matches as=d → included
Final result: Doc1 + Doc2
```
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -250,14 +250,50 @@ async def list_docs():
|
||||
metadata_condition = req.get("metadata_condition", {}) or {}
|
||||
if metadata_condition and not isinstance(metadata_condition, dict):
|
||||
return get_data_error_result(message="metadata_condition must be an object.")
|
||||
metadata = req.get("metadata", {}) or {}
|
||||
if metadata and not isinstance(metadata, dict):
|
||||
return get_data_error_result(message="metadata must be an object.")
|
||||
|
||||
doc_ids_filter = None
|
||||
if metadata_condition:
|
||||
metas = None
|
||||
if metadata_condition or metadata:
|
||||
metas = DocumentService.get_flatted_meta_by_kbs([kb_id])
|
||||
doc_ids_filter = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
|
||||
|
||||
if metadata_condition:
|
||||
doc_ids_filter = set(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
|
||||
if metadata_condition.get("conditions") and not doc_ids_filter:
|
||||
return get_json_result(data={"total": 0, "docs": []})
|
||||
|
||||
if metadata:
|
||||
metadata_doc_ids = None
|
||||
for key, values in metadata.items():
|
||||
if not values:
|
||||
continue
|
||||
if not isinstance(values, list):
|
||||
values = [values]
|
||||
values = [str(v) for v in values if v is not None and str(v).strip()]
|
||||
if not values:
|
||||
continue
|
||||
key_doc_ids = set()
|
||||
for value in values:
|
||||
key_doc_ids.update(metas.get(key, {}).get(value, []))
|
||||
if metadata_doc_ids is None:
|
||||
metadata_doc_ids = key_doc_ids
|
||||
else:
|
||||
metadata_doc_ids &= key_doc_ids
|
||||
if not metadata_doc_ids:
|
||||
return get_json_result(data={"total": 0, "docs": []})
|
||||
if metadata_doc_ids is not None:
|
||||
if doc_ids_filter is None:
|
||||
doc_ids_filter = metadata_doc_ids
|
||||
else:
|
||||
doc_ids_filter &= metadata_doc_ids
|
||||
if not doc_ids_filter:
|
||||
return get_json_result(data={"total": 0, "docs": []})
|
||||
|
||||
if doc_ids_filter is not None:
|
||||
doc_ids_filter = list(doc_ids_filter)
|
||||
|
||||
try:
|
||||
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix, doc_ids_filter)
|
||||
|
||||
|
||||
@ -180,6 +180,16 @@ class DocumentService(CommonService):
|
||||
"1": 2,
|
||||
"2": 2
|
||||
}
|
||||
"metadata": {
|
||||
"key1": {
|
||||
"key1_value1": 1,
|
||||
"key1_value2": 2,
|
||||
},
|
||||
"key2": {
|
||||
"key2_value1": 2,
|
||||
"key2_value2": 1,
|
||||
},
|
||||
}
|
||||
}, total
|
||||
where "1" => RUNNING, "2" => CANCEL
|
||||
"""
|
||||
@ -200,19 +210,40 @@ class DocumentService(CommonService):
|
||||
if suffix:
|
||||
query = query.where(cls.model.suffix.in_(suffix))
|
||||
|
||||
rows = query.select(cls.model.run, cls.model.suffix)
|
||||
rows = query.select(cls.model.run, cls.model.suffix, cls.model.meta_fields)
|
||||
total = rows.count()
|
||||
|
||||
suffix_counter = {}
|
||||
run_status_counter = {}
|
||||
metadata_counter = {}
|
||||
|
||||
for row in rows:
|
||||
suffix_counter[row.suffix] = suffix_counter.get(row.suffix, 0) + 1
|
||||
run_status_counter[str(row.run)] = run_status_counter.get(str(row.run), 0) + 1
|
||||
meta_fields = row.meta_fields or {}
|
||||
if isinstance(meta_fields, str):
|
||||
try:
|
||||
meta_fields = json.loads(meta_fields)
|
||||
except Exception:
|
||||
meta_fields = {}
|
||||
if not isinstance(meta_fields, dict):
|
||||
continue
|
||||
for key, value in meta_fields.items():
|
||||
values = value if isinstance(value, list) else [value]
|
||||
for vv in values:
|
||||
if vv is None:
|
||||
continue
|
||||
if isinstance(vv, str) and not vv.strip():
|
||||
continue
|
||||
sv = str(vv)
|
||||
if key not in metadata_counter:
|
||||
metadata_counter[key] = {}
|
||||
metadata_counter[key][sv] = metadata_counter[key].get(sv, 0) + 1
|
||||
|
||||
return {
|
||||
"suffix": suffix_counter,
|
||||
"run_status": run_status_counter
|
||||
"run_status": run_status_counter,
|
||||
"metadata": metadata_counter,
|
||||
}, total
|
||||
|
||||
@classmethod
|
||||
|
||||
Reference in New Issue
Block a user