mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-01 16:15:07 +08:00
Fix metadata filter (#12861)
### What problem does this PR solve?

##### Summary

This PR fixes a bug in the metadata filtering logic where the `contains` and `not contains` operators were behaving identically to the `in` and `not in` operators. It also standardizes the syntax for string-based operators.

##### The Issue

On the main branch, the `contains` operator was implemented as:

`matched = input in value if not isinstance(input, list) else all(i in value for i in input)`

This logic is identical to the `in` operator. It checks if the metadata (`input`) exists within the filter (`value`). For a "contains" search, the logic should be reversed: _we want to check if the filter value exists within the metadata input_.

##### Solution Presented Here

The operators have been rewritten using `str.find()`:

- Contains: `str(input).find(value) >= 0`
- Not Contains: `str(input).find(value) == -1`

##### Advantage

This approach places the metadata (`input`) on the left side of the expression. This maintains stylistic consistency with the existing `start with` and `end with` operators in the same file, which also place the input on the left (e.g., `str(input).lower().startswith(...)`).

##### Considered Alternative

In a previous PR, we considered using the standard Python `in` operator: `value in str(input)`. The `in` operator is approximately 15% faster because it uses optimized Python bytecode (`CONTAINS_OP`) and avoids an attribute lookup. However, following the rejection of that PR, we now propose the change presented here.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
- [ ] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

---------

Co-authored-by: Philipp Heyken Soares <philipp.heyken-soares@am.ai>
This commit is contained in:
committed by
GitHub
parent
47e55ab324
commit
6305c7e411
@ -19,9 +19,6 @@ from typing import Any, Callable, Dict
|
||||
|
||||
import json_repair
|
||||
|
||||
from rag.prompts.generator import gen_meta_filter
|
||||
|
||||
|
||||
def convert_conditions(metadata_condition):
|
||||
if metadata_condition is None:
|
||||
metadata_condition = {}
|
||||
@ -62,9 +59,9 @@ def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
|
||||
matched = False
|
||||
try:
|
||||
if operator == "contains":
|
||||
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
|
||||
matched = str(input).find(value) >= 0 if not isinstance(input, list) else any(str(i).find(value) >= 0 for i in input)
|
||||
elif operator == "not contains":
|
||||
matched = input not in value if not isinstance(input, list) else all(i not in value for i in input)
|
||||
matched = str(input).find(value) == -1 if not isinstance(input, list) else all(str(i).find(value) == -1 for i in input)
|
||||
elif operator == "in":
|
||||
matched = input in value if not isinstance(input, list) else all(i in value for i in input)
|
||||
elif operator == "not in":
|
||||
@ -133,6 +130,8 @@ async def apply_meta_data_filter(
|
||||
list of doc_ids, ["-999"] when manual filters yield no result, or None
|
||||
when auto/semi_auto filters return empty.
|
||||
"""
|
||||
from rag.prompts.generator import gen_meta_filter # move from the top of the file to avoid circular import
|
||||
|
||||
doc_ids = list(base_doc_ids) if base_doc_ids else []
|
||||
|
||||
if not meta_data_filter:
|
||||
|
||||
Reference in New Issue
Block a user