Fix metadata filter (#12861)

### What problem does this PR solve?

##### Summary
This PR fixes a bug in the metadata filtering logic where the `contains`
and `not contains` operators were behaving identically to the `in` and
`not in` operators. It also standardizes the syntax for string-based
operators.

##### The Issue
On the main branch, the contains operator was implemented as:
`matched = input in value if not isinstance(input, list) else all(i in
value for i in input)`
This logic is identical to the `in` operator. It checks if the metadata
(`input`) exists within the filter (`value`). For a "contains" search,
the logic should be reversed: _we want to check if the filter value
exists within the metadata input_.

##### Solution Presented Here
The operators have been rewritten using str.find():
Contains: `str(input).find(value) >= 0`
Not Contains: `str(input).find(value) == -1`

##### Advantage
This approach places the metadata (`input`) on the left side of the
expression. This maintains stylistic consistency with the existing
`start with` and `end with` operators in the same file, which also place
the input on the left (e.g., `str(input).lower().startswith(...)`).

##### Considered Alternative
In a previous PR we considered using the standard Python `in` operator:
`value in str(input)`.
The `in` operator is approximately 15% faster because it uses optimized
Python bytecode (CONTAINS_OP) and avoids an attribute lookup. However,
since that earlier PR was rejected, we now propose the change presented here.


### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
This commit is contained in:
Philipp Heyken Soares
2026-01-29 02:59:48 +01:00
committed by GitHub
parent 47e55ab324
commit 6305c7e411
2 changed files with 117 additions and 5 deletions

View File

@ -19,9 +19,6 @@ from typing import Any, Callable, Dict
import json_repair
from rag.prompts.generator import gen_meta_filter
def convert_conditions(metadata_condition):
if metadata_condition is None:
metadata_condition = {}
@ -62,9 +59,9 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
matched = False
try:
if operator == "contains":
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
matched = str(input).find(value) >= 0 if not isinstance(input, list) else any(str(i).find(value) >= 0 for i in input)
elif operator == "not contains":
matched = input not in value if not isinstance(input, list) else all(i not in value for i in input)
matched = str(input).find(value) == -1 if not isinstance(input, list) else all(str(i).find(value) == -1 for i in input)
elif operator == "in":
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
elif operator == "not in":
@ -133,6 +130,8 @@ async def apply_meta_data_filter(
list of doc_ids, ["-999"] when manual filters yield no result, or None
when auto/semi_auto filters return empty.
"""
from rag.prompts.generator import gen_meta_filter # move from the top of the file to avoid circular import
doc_ids = list(base_doc_ids) if base_doc_ids else []
if not meta_data_filter:

View File

@ -0,0 +1,113 @@
from common.metadata_utils import meta_filter
def test_contains():
    """`contains` selects documents whose metadata value includes the filter value."""
    # Layout: metadata key -> metadata value -> ids of documents carrying it.
    doc_index = {"version": {"hello earth": ["doc1"], "hello mars": ["doc2"]}}
    condition = {"key": "version", "op": "contains", "value": "earth"}
    # Only "hello earth" has "earth" as a substring.
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc1"]
def test_not_contains():
    """`not contains` selects documents whose metadata value lacks the filter value."""
    doc_index = {"version": {"hello earth": ["doc1"], "hello mars": ["doc2"]}}
    condition = {"key": "version", "op": "not contains", "value": "earth"}
    # "hello mars" is the only value without the substring "earth".
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc2"]
def test_in_operator():
    """`in` selects documents whose metadata value appears in the filter value."""
    doc_index = {"status": {"active": ["doc1"], "pending": ["doc2"], "done": ["doc3"]}}
    # The filter value is given as the comma-separated string "active,pending".
    condition = {"key": "status", "op": "in", "value": "active,pending"}
    matched_ids = meta_filter(doc_index, [condition])
    # Compared as a set: the order of the returned ids is not asserted.
    assert set(matched_ids) == {"doc1", "doc2"}
def test_not_in_operator():
    """`not in` selects documents whose metadata value is absent from the filter value."""
    doc_index = {"status": {"active": ["doc1"], "pending": ["doc2"], "done": ["doc3"]}}
    condition = {"key": "status", "op": "not in", "value": "active,pending"}
    # "done" is the only status not listed in "active,pending".
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc3"]
def test_start_with():
    """`start with` selects documents whose metadata value begins with the filter value."""
    doc_index = {"name": {"prefix_value": ["doc1"], "other": ["doc2"]}}
    condition = {"key": "name", "op": "start with", "value": "pre"}
    # Only "prefix_value" begins with "pre".
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc1"]
def test_end_with():
    """`end with` selects documents whose metadata value ends with the filter value."""
    doc_index = {"file": {"report.pdf": ["doc1"], "image.png": ["doc2"]}}
    condition = {"key": "file", "op": "end with", "value": ".pdf"}
    # Only "report.pdf" carries the ".pdf" suffix.
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc1"]
def test_empty():
    """`empty` selects documents whose metadata value is the empty string."""
    doc_index = {"notes": {"": ["doc1"], "non-empty": ["doc2"]}}
    condition = {"key": "notes", "op": "empty", "value": ""}
    # doc1 is the document stored under the empty-string value.
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc1"]
def test_not_empty():
    """`not empty` selects documents whose metadata value is a non-empty string."""
    doc_index = {"notes": {"": ["doc1"], "non-empty": ["doc2"]}}
    condition = {"key": "notes", "op": "not empty", "value": ""}
    # doc2 is the document stored under a non-empty value.
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc2"]
def test_equal():
    """`=` selects documents whose metadata value equals the filter value."""
    doc_index = {"score": {"5": ["doc1"], "6": ["doc2"]}}
    condition = {"key": "score", "op": "=", "value": "5"}
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc1"]
def test_not_equal():
    """The not-equal operator selects documents whose metadata value differs from the filter value."""
    metas = {"score": {"5": ["doc1"], "6": ["doc2"]}}
    # The operator literal had been reduced to an empty string, which names no
    # operator; "≠" is the not-equal symbol this test is meant to exercise.
    filters = [{"key": "score", "op": "≠", "value": "5"}]
    assert meta_filter(metas, filters) == ["doc2"]
def test_greater_than():
    """`>` selects documents whose metadata value is greater than the filter value."""
    doc_index = {"score": {"10": ["doc1"], "2": ["doc2"]}}
    condition = {"key": "score", "op": ">", "value": "5"}
    # Expecting doc1 means the comparison must be numeric: as plain strings,
    # "10" would sort before "5".
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc1"]
def test_less_than():
    """`<` selects documents whose metadata value is less than the filter value."""
    doc_index = {"score": {"10": ["doc1"], "2": ["doc2"]}}
    condition = {"key": "score", "op": "<", "value": "5"}
    # Expecting only doc2 means the comparison must be numeric: as plain
    # strings, "10" would also sort before "5".
    matched_ids = meta_filter(doc_index, [condition])
    assert matched_ids == ["doc2"]
def test_greater_than_or_equal():
    """The greater-or-equal operator selects documents whose metadata value is at least the filter value."""
    metas = {"score": {"5": ["doc1"], "6": ["doc2"], "4": ["doc3"]}}
    # The operator literal had been reduced to an empty string, which names no
    # operator; "≥" is the greater-or-equal symbol this test is meant to exercise.
    filters = [{"key": "score", "op": "≥", "value": "5"}]
    # Compared as a set: the order of the returned ids is not asserted.
    assert set(meta_filter(metas, filters)) == {"doc1", "doc2"}
def test_less_than_or_equal():
    """The less-or-equal operator selects documents whose metadata value is at most the filter value."""
    metas = {"score": {"5": ["doc1"], "6": ["doc2"], "4": ["doc3"]}}
    # The operator literal had been reduced to an empty string, which names no
    # operator; "≤" is the less-or-equal symbol this test is meant to exercise.
    filters = [{"key": "score", "op": "≤", "value": "5"}]
    # Compared as a set: the order of the returned ids is not asserted.
    assert set(meta_filter(metas, filters)) == {"doc1", "doc3"}