mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-01-29 22:56:36 +08:00
Fix metadata filter (#12861)
### What problem does this PR solve?

##### Summary

This PR fixes a bug in the metadata filtering logic where the `contains` and `not contains` operators behaved identically to the `in` and `not in` operators. It also standardizes the syntax for string-based operators.

##### The Issue

On the main branch, the `contains` operator was implemented as:

`matched = input in value if not isinstance(input, list) else all(i in value for i in input)`

This logic is identical to the `in` operator: it checks whether the metadata (`input`) exists within the filter (`value`). For a "contains" search, the logic should be reversed: _we want to check whether the filter value exists within the metadata input_.

##### Solution Presented Here

The operators have been rewritten using `str.find()`:

- Contains: `str(input).find(value) >= 0`
- Not Contains: `str(input).find(value) == -1`

##### Advantage

This approach places the metadata (`input`) on the left side of the expression. This maintains stylistic consistency with the existing `start with` and `end with` operators in the same file, which also place the input on the left (e.g., `str(input).lower().startswith(...)`).

##### Considered Alternative

In a previous PR we considered using the standard Python `in` operator: `value in str(input)`. The `in` operator is approximately 15% faster because it uses optimized Python bytecode (`CONTAINS_OP`) and avoids an attribute lookup. However, following the rejection of that PR, we now propose the change presented here.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
This commit is contained in:
committed by
GitHub
parent
47e55ab324
commit
6305c7e411
@ -19,9 +19,6 @@ from typing import Any, Callable, Dict
|
||||
|
||||
import json_repair
|
||||
|
||||
from rag.prompts.generator import gen_meta_filter
|
||||
|
||||
|
||||
def convert_conditions(metadata_condition):
|
||||
if metadata_condition is None:
|
||||
metadata_condition = {}
|
||||
@ -62,9 +59,9 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
|
||||
matched = False
|
||||
try:
|
||||
if operator == "contains":
|
||||
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
|
||||
matched = str(input).find(value) >= 0 if not isinstance(input, list) else any(str(i).find(value) >= 0 for i in input)
|
||||
elif operator == "not contains":
|
||||
matched = input not in value if not isinstance(input, list) else all(i not in value for i in input)
|
||||
matched = str(input).find(value) == -1 if not isinstance(input, list) else all(str(i).find(value) == -1 for i in input)
|
||||
elif operator == "in":
|
||||
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
|
||||
elif operator == "not in":
|
||||
@ -133,6 +130,8 @@ async def apply_meta_data_filter(
|
||||
list of doc_ids, ["-999"] when manual filters yield no result, or None
|
||||
when auto/semi_auto filters return empty.
|
||||
"""
|
||||
from rag.prompts.generator import gen_meta_filter # move from the top of the file to avoid circular import
|
||||
|
||||
doc_ids = list(base_doc_ids) if base_doc_ids else []
|
||||
|
||||
if not meta_data_filter:
|
||||
|
||||
113
test/unit_test/common/test_metadata_filter_operators.py
Normal file
113
test/unit_test/common/test_metadata_filter_operators.py
Normal file
@ -0,0 +1,113 @@
|
||||
from common.metadata_utils import meta_filter
|
||||
|
||||
|
||||
def test_contains():
    """`contains` selects the document whose metadata value includes the filter text."""
    # Only the "hello earth" value has "earth" as a substring.
    version_index = {"hello earth": ["doc1"], "hello mars": ["doc2"]}
    condition = {"key": "version", "op": "contains", "value": "earth"}

    matched_docs = meta_filter({"version": version_index}, [condition])

    assert matched_docs == ["doc1"]
|
||||
|
||||
|
||||
def test_not_contains():
    """`not contains` selects the document whose metadata value lacks the filter text."""
    # "hello mars" is the only value without the substring "earth".
    version_index = {"hello earth": ["doc1"], "hello mars": ["doc2"]}
    condition = {"key": "version", "op": "not contains", "value": "earth"}

    matched_docs = meta_filter({"version": version_index}, [condition])

    assert matched_docs == ["doc2"]
|
||||
|
||||
|
||||
def test_in_operator():
    """`in` selects documents whose metadata value is listed in the filter value."""
    status_index = {"active": ["doc1"], "pending": ["doc2"], "done": ["doc3"]}
    # The candidate values are supplied as one comma-separated string.
    condition = {"key": "status", "op": "in", "value": "active,pending"}

    matched_docs = meta_filter({"status": status_index}, [condition])

    # Compared as a set: the order of the returned ids is not asserted.
    assert set(matched_docs) == {"doc1", "doc2"}
|
||||
|
||||
|
||||
def test_not_in_operator():
    """`not in` selects documents whose metadata value is absent from the filter value."""
    status_index = {"active": ["doc1"], "pending": ["doc2"], "done": ["doc3"]}
    # The excluded values are supplied as one comma-separated string.
    condition = {"key": "status", "op": "not in", "value": "active,pending"}

    matched_docs = meta_filter({"status": status_index}, [condition])

    assert matched_docs == ["doc3"]
|
||||
|
||||
|
||||
def test_start_with():
    """`start with` selects the document whose metadata value begins with the filter text."""
    name_index = {"prefix_value": ["doc1"], "other": ["doc2"]}
    condition = {"key": "name", "op": "start with", "value": "pre"}

    matched_docs = meta_filter({"name": name_index}, [condition])

    assert matched_docs == ["doc1"]
|
||||
|
||||
|
||||
def test_end_with():
    """`end with` selects the document whose metadata value ends with the filter text."""
    file_index = {"report.pdf": ["doc1"], "image.png": ["doc2"]}
    condition = {"key": "file", "op": "end with", "value": ".pdf"}

    matched_docs = meta_filter({"file": file_index}, [condition])

    assert matched_docs == ["doc1"]
|
||||
|
||||
|
||||
def test_empty():
    """`empty` selects the document whose metadata value is the empty string."""
    notes_index = {"": ["doc1"], "non-empty": ["doc2"]}
    condition = {"key": "notes", "op": "empty", "value": ""}

    matched_docs = meta_filter({"notes": notes_index}, [condition])

    assert matched_docs == ["doc1"]
|
||||
|
||||
|
||||
def test_not_empty():
    """`not empty` selects the document whose metadata value is a non-empty string."""
    notes_index = {"": ["doc1"], "non-empty": ["doc2"]}
    condition = {"key": "notes", "op": "not empty", "value": ""}

    matched_docs = meta_filter({"notes": notes_index}, [condition])

    assert matched_docs == ["doc2"]
|
||||
|
||||
|
||||
def test_equal():
    """`=` selects the document whose metadata value equals the filter value."""
    score_index = {"5": ["doc1"], "6": ["doc2"]}
    condition = {"key": "score", "op": "=", "value": "5"}

    matched_docs = meta_filter({"score": score_index}, [condition])

    assert matched_docs == ["doc1"]
|
||||
|
||||
|
||||
def test_not_equal():
    """`≠` selects the document whose metadata value differs from the filter value."""
    score_index = {"5": ["doc1"], "6": ["doc2"]}
    condition = {"key": "score", "op": "≠", "value": "5"}

    matched_docs = meta_filter({"score": score_index}, [condition])

    assert matched_docs == ["doc2"]
|
||||
|
||||
|
||||
def test_greater_than():
    """`>` selects the document whose metadata value exceeds the filter value."""
    # As plain strings "10" sorts before "5", so this expectation only holds
    # when the values are compared numerically.
    score_index = {"10": ["doc1"], "2": ["doc2"]}
    condition = {"key": "score", "op": ">", "value": "5"}

    matched_docs = meta_filter({"score": score_index}, [condition])

    assert matched_docs == ["doc1"]
|
||||
|
||||
|
||||
def test_less_than():
    """`<` selects the document whose metadata value is below the filter value."""
    # As plain strings "10" sorts before "5", so excluding doc1 only holds
    # when the values are compared numerically.
    score_index = {"10": ["doc1"], "2": ["doc2"]}
    condition = {"key": "score", "op": "<", "value": "5"}

    matched_docs = meta_filter({"score": score_index}, [condition])

    assert matched_docs == ["doc2"]
|
||||
|
||||
|
||||
def test_greater_than_or_equal():
    """`≥` selects documents whose metadata value is at least the filter value."""
    score_index = {"5": ["doc1"], "6": ["doc2"], "4": ["doc3"]}
    condition = {"key": "score", "op": "≥", "value": "5"}

    matched_docs = meta_filter({"score": score_index}, [condition])

    # Compared as a set: the order of the returned ids is not asserted.
    assert set(matched_docs) == {"doc1", "doc2"}
|
||||
|
||||
|
||||
def test_less_than_or_equal():
    """`≤` selects documents whose metadata value is at most the filter value."""
    score_index = {"5": ["doc1"], "6": ["doc2"], "4": ["doc3"]}
    condition = {"key": "score", "op": "≤", "value": "5"}

    matched_docs = meta_filter({"score": score_index}, [condition])

    # Compared as a set: the order of the returned ids is not asserted.
    assert set(matched_docs) == {"doc1", "doc3"}
|
||||
Reference in New Issue
Block a user