Feat: add or logic operations for meta data filters. (#11404)

### What problem does this PR solve?

#11376 #11387

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-11-20 14:31:12 +08:00
committed by GitHub
parent d2b1da0e26
commit 06cef71ba6
11 changed files with 129 additions and 48 deletions

View File

@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC):
metas = DocumentService.get_meta_by_kbs(kb_ids) metas = DocumentService.get_meta_by_kbs(kb_ids)
if self._param.meta_data_filter.get("method") == "auto": if self._param.meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT) chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
filters = gen_meta_filter(chat_mdl, metas, query) filters: dict = gen_meta_filter(chat_mdl, metas, query)
doc_ids.extend(meta_filter(metas, filters)) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif self._param.meta_data_filter.get("method") == "manual": elif self._param.meta_data_filter.get("method") == "manual":
@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC):
out_parts.append(s[last:]) out_parts.append(s[last:])
flt["value"] = "".join(out_parts) flt["value"] = "".join(out_parts)
doc_ids.extend(meta_filter(metas, filters)) doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None

View File

@ -305,12 +305,12 @@ async def retrieval_test():
metas = DocumentService.get_meta_by_kbs(kb_ids) metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto": if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters = gen_meta_filter(chat_mdl, metas, question) filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters)) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None

View File

@ -159,10 +159,10 @@ async def webhook(tenant_id: str, agent_id: str):
data=False, message=str(e), data=False, message=str(e),
code=RetCode.EXCEPTION_ERROR) code=RetCode.EXCEPTION_ERROR)
def sse(): async def sse():
nonlocal canvas nonlocal canvas
try: try:
for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req): async for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n" yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n"
cvs.dsl = json.loads(str(canvas)) cvs.dsl = json.loads(str(canvas))

View File

@ -120,7 +120,7 @@ async def retrieval(tenant_id):
retrieval_setting = req.get("retrieval_setting", {}) retrieval_setting = req.get("retrieval_setting", {})
similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0)) similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
top = int(retrieval_setting.get("top_k", 1024)) top = int(retrieval_setting.get("top_k", 1024))
metadata_condition = req.get("metadata_condition", {}) metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs([kb_id]) metas = DocumentService.get_meta_by_kbs([kb_id])
doc_ids = [] doc_ids = []
@ -132,7 +132,7 @@ async def retrieval(tenant_id):
embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id) embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
if metadata_condition: if metadata_condition:
doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition))) doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
if not doc_ids and metadata_condition: if not doc_ids and metadata_condition:
doc_ids = ["-999"] doc_ids = ["-999"]
ranks = settings.retriever.retrieval( ranks = settings.retriever.retrieval(

View File

@ -1442,9 +1442,9 @@ async def retrieval_test(tenant_id):
if doc_id not in doc_ids_list: if doc_id not in doc_ids_list:
return get_error_data_result(f"The datasets don't own the document {doc_id}") return get_error_data_result(f"The datasets don't own the document {doc_id}")
if not doc_ids: if not doc_ids:
metadata_condition = req.get("metadata_condition", {}) metadata_condition = req.get("metadata_condition", {}) or {}
metas = DocumentService.get_meta_by_kbs(kb_ids) metas = DocumentService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition)) doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
similarity_threshold = float(req.get("similarity_threshold", 0.2)) similarity_threshold = float(req.get("similarity_threshold", 0.2))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024)) top = int(req.get("top_k", 1024))

View File

@ -428,17 +428,15 @@ async def agents_completion_openai_compatibility(tenant_id, agent_id):
return resp return resp
else: else:
# For non-streaming, just return the response directly # For non-streaming, just return the response directly
response = next( async for response in completion_openai(
completion_openai(
tenant_id, tenant_id,
agent_id, agent_id,
question, question,
session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""), session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""),
stream=False, stream=False,
**req, **req,
) ):
) return jsonify(response)
return jsonify(response)
@manager.route("/agents/<agent_id>/completions", methods=["POST"]) # noqa: F821 @manager.route("/agents/<agent_id>/completions", methods=["POST"]) # noqa: F821
@ -977,12 +975,12 @@ async def retrieval_test_embedded():
metas = DocumentService.get_meta_by_kbs(kb_ids) metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto": if meta_data_filter.get("method") == "auto":
chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", "")) chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
filters = gen_meta_filter(chat_mdl, metas, question) filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters)) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None

View File

@ -177,7 +177,7 @@ class UserCanvasService(CommonService):
return True return True
def completion(tenant_id, agent_id, session_id=None, **kwargs): async def completion(tenant_id, agent_id, session_id=None, **kwargs):
query = kwargs.get("query", "") or kwargs.get("question", "") query = kwargs.get("query", "") or kwargs.get("question", "")
files = kwargs.get("files", []) files = kwargs.get("files", [])
inputs = kwargs.get("inputs", {}) inputs = kwargs.get("inputs", {})
@ -219,7 +219,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
"id": message_id "id": message_id
}) })
txt = "" txt = ""
for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs): async for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
ans["session_id"] = session_id ans["session_id"] = session_id
if ans["event"] == "message": if ans["event"] == "message":
txt += ans["data"]["content"] txt += ans["data"]["content"]
@ -237,7 +237,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
API4ConversationService.append_message(conv["id"], conv) API4ConversationService.append_message(conv["id"], conv)
def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs): async def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
tiktoken_encoder = tiktoken.get_encoding("cl100k_base") tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
prompt_tokens = len(tiktoken_encoder.encode(str(question))) prompt_tokens = len(tiktoken_encoder.encode(str(question)))
user_id = kwargs.get("user_id", "") user_id = kwargs.get("user_id", "")
@ -245,7 +245,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
if stream: if stream:
completion_tokens = 0 completion_tokens = 0
try: try:
for ans in completion( async for ans in completion(
tenant_id=tenant_id, tenant_id=tenant_id,
agent_id=agent_id, agent_id=agent_id,
session_id=session_id, session_id=session_id,
@ -304,7 +304,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
try: try:
all_content = "" all_content = ""
reference = {} reference = {}
for ans in completion( async for ans in completion(
tenant_id=tenant_id, tenant_id=tenant_id,
agent_id=agent_id, agent_id=agent_id,
session_id=session_id, session_id=session_id,

View File

@ -287,7 +287,7 @@ def convert_conditions(metadata_condition):
] ]
def meta_filter(metas: dict, filters: list[dict]): def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
doc_ids = set([]) doc_ids = set([])
def filter_out(v2docs, operator, value): def filter_out(v2docs, operator, value):
@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]):
if not doc_ids: if not doc_ids:
doc_ids = set(ids) doc_ids = set(ids)
else: else:
doc_ids = doc_ids & set(ids) if logic == "and":
doc_ids = doc_ids & set(ids)
else:
doc_ids = doc_ids | set(ids)
if not doc_ids: if not doc_ids:
return [] return []
return list(doc_ids) return list(doc_ids)
@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs):
if dialog.meta_data_filter: if dialog.meta_data_filter:
metas = DocumentService.get_meta_by_kbs(dialog.kb_ids) metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
if dialog.meta_data_filter.get("method") == "auto": if dialog.meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, questions[-1]) filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1])
attachments.extend(meta_filter(metas, filters)) attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not attachments: if not attachments:
attachments = None attachments = None
elif dialog.meta_data_filter.get("method") == "manual": elif dialog.meta_data_filter.get("method") == "manual":
attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"])) attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and")))
if not attachments: if not attachments:
attachments = None attachments = None
@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
if meta_data_filter: if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids) metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto": if meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, question) filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters)) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
if meta_data_filter: if meta_data_filter:
metas = DocumentService.get_meta_by_kbs(kb_ids) metas = DocumentService.get_meta_by_kbs(kb_ids)
if meta_data_filter.get("method") == "auto": if meta_data_filter.get("method") == "auto":
filters = gen_meta_filter(chat_mdl, metas, question) filters: dict = gen_meta_filter(chat_mdl, metas, question)
doc_ids.extend(meta_filter(metas, filters)) doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None
elif meta_data_filter.get("method") == "manual": elif meta_data_filter.get("method") == "manual":
doc_ids.extend(meta_filter(metas, meta_data_filter["manual"])) doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
if not doc_ids: if not doc_ids:
doc_ids = None doc_ids = None

View File

@ -2085,6 +2085,7 @@ curl --request POST \
"dataset_ids": ["b2a62730759d11ef987d0242ac120004"], "dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
"document_ids": ["77df9ef4759a11ef8bdd0242ac120004"], "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
"metadata_condition": { "metadata_condition": {
"logic": "and",
"conditions": [ "conditions": [
{ {
"name": "author", "name": "author",

View File

@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL) return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)
def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict:
sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render( sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
current_date=datetime.datetime.today().strftime('%Y-%m-%d'), current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
metadata_keys=json.dumps(meta_data), metadata_keys=json.dumps(meta_data),
@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL) ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
try: try:
ans = json_repair.loads(ans) ans = json_repair.loads(ans)
assert isinstance(ans, list), ans assert isinstance(ans, dict), ans
assert "conditions" in ans and isinstance(ans["conditions"], list), ans
return ans return ans
except Exception: except Exception:
logging.exception(f"Loading json failure: {ans}") logging.exception(f"Loading json failure: {ans}")
return []
return {"conditions": []}
def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None): def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):

View File

@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an
} }
2. **Output Requirements**: 2. **Output Requirements**:
- Always output a JSON array of filter objects - Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or').
- Each object must have: - Each filter object in conditions must have:
"key": (metadata attribute name), "key": (metadata attribute name),
"value": (string value to compare), "value": (string value to compare),
"op": (operator from allowed list) "op": (operator from allowed list)
- Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions)
3. **Operator Guide**: 3. **Operator Guide**:
- Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"] - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an
- Attribute doesn't exist in metadata - Attribute doesn't exist in metadata
- Value has no match in metadata - Value has no match in metadata
5. **Example**: 5. **Example A**:
- User query: "上市日期七月份的有哪些商品不要蓝色的" - User query: "上市日期七月份的有哪些商品不要蓝色的"
- Metadata: { "color": {...}, "listing_date": {...} } - Metadata: { "color": {...}, "listing_date": {...} }
- Output: - Output:
[ {
"logic": "and",
"conditions": [
{"key": "listing_date", "value": "2025-07-01", "op": "≥"}, {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
{"key": "listing_date", "value": "2025-08-01", "op": "<"}, {"key": "listing_date", "value": "2025-08-01", "op": "<"},
{"key": "color", "value": "blue", "op": "≠"} {"key": "color", "value": "blue", "op": "≠"}
] ]
}
6. **Final Output**: 6. **Example B**:
- ONLY output valid JSON array - User query: "Both blue and red are acceptable."
- Metadata: { "color": {...}, "listing_date": {...} }
- Output:
{
"logic": "or",
"conditions": [
{"key": "color", "value": "blue", "op": "="},
{"key": "color", "value": "red", "op": "="}
]
}
7. **Final Output**:
- ONLY output valid JSON dictionary
- NO additional text/explanations - NO additional text/explanations
- Json schema is as following:
```json
{
"type": "object",
"properties": {
"logic": {
"type": "string",
"description": "Logic relationship between all the conditions, the default is 'and'.",
"enum": [
"and",
"or"
]
},
"conditions": {
"type": "array",
"items": {
"type": "object",
"properties": {
"key": {
"type": "string",
"description": "Metadata attribute name."
},
"value": {
"type": "string",
"description": "Value to compare."
},
"op": {
"type": "string",
"description": "Operator from allowed list.",
"enum": [
"contains",
"not contains",
"start with",
"end with",
"empty",
"not empty",
"=",
"≠",
">",
"<",
"≥",
"≤"
]
}
},
"required": [
"key",
"value",
"op"
],
"additionalProperties": false
}
}
},
"required": [
"conditions"
],
"additionalProperties": false
}
```
**Current Task**: **Current Task**:
- Today's date: {{current_date}} - Today's date: {{ current_date }}
- Available metadata keys: {{metadata_keys}} - Available metadata keys: {{ metadata_keys }}
- User query: "{{user_question}}" - User query: "{{ user_question }}"