Feat: add or logic operations for meta data filters. (#11404)

### What problem does this PR solve? #11376 #11387 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-02-05 10:05:05 +08:00 · 2025-11-20 14:31:12 +08:00
parent d2b1da0e26
commit 06cef71ba6
11 changed files with 129 additions and 48 deletions
--- a/agent/tools/retrieval.py
+++ b/agent/tools/retrieval.py
@ -132,8 +132,8 @@ class Retrieval(ToolBase, ABC):
            metas = DocumentService.get_meta_by_kbs(kb_ids)
            if self._param.meta_data_filter.get("method") == "auto":
                chat_mdl = LLMBundle(self._canvas.get_tenant_id(), LLMType.CHAT)
-                filters = gen_meta_filter(chat_mdl, metas, query)
-                doc_ids.extend(meta_filter(metas, filters))
+                filters: dict = gen_meta_filter(chat_mdl, metas, query)
+                doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
                if not doc_ids:
                    doc_ids = None
            elif self._param.meta_data_filter.get("method") == "manual":
@ -165,7 +165,7 @@ class Retrieval(ToolBase, ABC):

                    out_parts.append(s[last:])
                    flt["value"] = "".join(out_parts)
-                doc_ids.extend(meta_filter(metas, filters))
+                doc_ids.extend(meta_filter(metas, filters, self._param.meta_data_filter.get("logic", "and")))
                if not doc_ids:
                    doc_ids = None

--- a/api/apps/chunk_app.py
+++ b/api/apps/chunk_app.py
@ -305,12 +305,12 @@ async def retrieval_test():
        metas = DocumentService.get_meta_by_kbs(kb_ids)
        if meta_data_filter.get("method") == "auto":
            chat_mdl = LLMBundle(current_user.id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
            if not doc_ids:
                doc_ids = None
        elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
            if not doc_ids:
                doc_ids = None

--- a/api/apps/sdk/agents.py
+++ b/api/apps/sdk/agents.py
@ -159,10 +159,10 @@ async def webhook(tenant_id: str, agent_id: str):
            data=False, message=str(e),
            code=RetCode.EXCEPTION_ERROR)

-    def sse():
+    async def sse():
        nonlocal canvas
        try:
-            for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
+            async for ans in canvas.run(query=req.get("query", ""), files=req.get("files", []), user_id=req.get("user_id", tenant_id), webhook_payload=req):
                yield "data:" + json.dumps(ans, ensure_ascii=False) + "\n\n"

            cvs.dsl = json.loads(str(canvas))
--- a/api/apps/sdk/dify_retrieval.py
+++ b/api/apps/sdk/dify_retrieval.py
@ -120,7 +120,7 @@ async def retrieval(tenant_id):
    retrieval_setting = req.get("retrieval_setting", {})
    similarity_threshold = float(retrieval_setting.get("score_threshold", 0.0))
    top = int(retrieval_setting.get("top_k", 1024))
-    metadata_condition = req.get("metadata_condition", {})
+    metadata_condition = req.get("metadata_condition", {}) or {}
    metas = DocumentService.get_meta_by_kbs([kb_id])

    doc_ids = []
@ -132,7 +132,7 @@ async def retrieval(tenant_id):

        embd_mdl = LLMBundle(kb.tenant_id, LLMType.EMBEDDING.value, llm_name=kb.embd_id)
        if metadata_condition:
-            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition)))
+            doc_ids.extend(meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and")))
        if not doc_ids and metadata_condition:
            doc_ids = ["-999"]
        ranks = settings.retriever.retrieval(
--- a/api/apps/sdk/doc.py
+++ b/api/apps/sdk/doc.py
@ -1442,9 +1442,9 @@ async def retrieval_test(tenant_id):
        if doc_id not in doc_ids_list:
            return get_error_data_result(f"The datasets don't own the document {doc_id}")
    if not doc_ids:
-        metadata_condition = req.get("metadata_condition", {})
+        metadata_condition = req.get("metadata_condition", {}) or {}
        metas = DocumentService.get_meta_by_kbs(kb_ids)
-        doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
+        doc_ids = meta_filter(metas, convert_conditions(metadata_condition), metadata_condition.get("logic", "and"))
    similarity_threshold = float(req.get("similarity_threshold", 0.2))
    vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
    top = int(req.get("top_k", 1024))
--- a/api/apps/sdk/session.py
+++ b/api/apps/sdk/session.py
@ -428,17 +428,15 @@ async def agents_completion_openai_compatibility(tenant_id, agent_id):
        return resp
    else:
        # For non-streaming, just return the response directly
-        response = next(
-            completion_openai(
+        async for response in completion_openai(
                tenant_id,
                agent_id,
                question,
                session_id=req.pop("session_id", req.get("id", "")) or req.get("metadata", {}).get("id", ""),
                stream=False,
                **req,
-            )
-        )
-        return jsonify(response)
+            ):
+            return jsonify(response)


@manager.route("/agents/<agent_id>/completions", methods=["POST"])  # noqa: F821
@ -977,12 +975,12 @@ async def retrieval_test_embedded():
        metas = DocumentService.get_meta_by_kbs(kb_ids)
        if meta_data_filter.get("method") == "auto":
            chat_mdl = LLMBundle(tenant_id, LLMType.CHAT, llm_name=search_config.get("chat_id", ""))
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
            if not doc_ids:
                doc_ids = None
        elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
            if not doc_ids:
                doc_ids = None

--- a/api/db/services/canvas_service.py
+++ b/api/db/services/canvas_service.py
@ -177,7 +177,7 @@ class UserCanvasService(CommonService):
        return True


-def completion(tenant_id, agent_id, session_id=None, **kwargs):
+async def completion(tenant_id, agent_id, session_id=None, **kwargs):
    query = kwargs.get("query", "") or kwargs.get("question", "")
    files = kwargs.get("files", [])
    inputs = kwargs.get("inputs", {})
@ -219,7 +219,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
        "id": message_id
    })
    txt = ""
-    for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
+    async for ans in canvas.run(query=query, files=files, user_id=user_id, inputs=inputs):
        ans["session_id"] = session_id
        if ans["event"] == "message":
            txt += ans["data"]["content"]
@ -237,7 +237,7 @@ def completion(tenant_id, agent_id, session_id=None, **kwargs):
    API4ConversationService.append_message(conv["id"], conv)


-def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
+async def completion_openai(tenant_id, agent_id, question, session_id=None, stream=True, **kwargs):
    tiktoken_encoder = tiktoken.get_encoding("cl100k_base")
    prompt_tokens = len(tiktoken_encoder.encode(str(question)))
    user_id = kwargs.get("user_id", "")
@ -245,7 +245,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
    if stream:
        completion_tokens = 0
        try:
-            for ans in completion(
+            async for ans in completion(
                tenant_id=tenant_id,
                agent_id=agent_id,
                session_id=session_id,
@ -304,7 +304,7 @@ def completion_openai(tenant_id, agent_id, question, session_id=None, stream=Tru
        try:
            all_content = ""
            reference = {}
-            for ans in completion(
+            async for ans in completion(
                tenant_id=tenant_id,
                agent_id=agent_id,
                session_id=session_id,
--- a/api/db/services/dialog_service.py
+++ b/api/db/services/dialog_service.py
@ -287,7 +287,7 @@ def convert_conditions(metadata_condition):
    ]


-def meta_filter(metas: dict, filters: list[dict]):
+def meta_filter(metas: dict, filters: list[dict], logic: str = "and"):
    doc_ids = set([])

    def filter_out(v2docs, operator, value):
@ -331,7 +331,10 @@ def meta_filter(metas: dict, filters: list[dict]):
            if not doc_ids:
                doc_ids = set(ids)
            else:
-                doc_ids = doc_ids & set(ids)
+                if logic == "and":
+                    doc_ids = doc_ids & set(ids)
+                else:
+                    doc_ids = doc_ids | set(ids)
            if not doc_ids:
                return []
    return list(doc_ids)
@ -407,12 +410,12 @@ def chat(dialog, messages, stream=True, **kwargs):
    if dialog.meta_data_filter:
        metas = DocumentService.get_meta_by_kbs(dialog.kb_ids)
        if dialog.meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, questions[-1])
-            attachments.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, questions[-1])
+            attachments.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
            if not attachments:
                attachments = None
        elif dialog.meta_data_filter.get("method") == "manual":
-            attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"]))
+            attachments.extend(meta_filter(metas, dialog.meta_data_filter["manual"], dialog.meta_data_filter.get("logic", "and")))
            if not attachments:
                attachments = None

@ -778,12 +781,12 @@ def ask(question, kb_ids, tenant_id, chat_llm_name=None, search_config={}):
    if meta_data_filter:
        metas = DocumentService.get_meta_by_kbs(kb_ids)
        if meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
            if not doc_ids:
                doc_ids = None
        elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
            if not doc_ids:
                doc_ids = None

@ -853,12 +856,12 @@ def gen_mindmap(question, kb_ids, tenant_id, search_config={}):
    if meta_data_filter:
        metas = DocumentService.get_meta_by_kbs(kb_ids)
        if meta_data_filter.get("method") == "auto":
-            filters = gen_meta_filter(chat_mdl, metas, question)
-            doc_ids.extend(meta_filter(metas, filters))
+            filters: dict = gen_meta_filter(chat_mdl, metas, question)
+            doc_ids.extend(meta_filter(metas, filters["conditions"], filters.get("logic", "and")))
            if not doc_ids:
                doc_ids = None
        elif meta_data_filter.get("method") == "manual":
-            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"]))
+            doc_ids.extend(meta_filter(metas, meta_data_filter["manual"], meta_data_filter.get("logic", "and")))
            if not doc_ids:
                doc_ids = None

--- a/docs/references/http_api_reference.md
+++ b/docs/references/http_api_reference.md
@ -2085,6 +2085,7 @@ curl --request POST \
          "dataset_ids": ["b2a62730759d11ef987d0242ac120004"],
          "document_ids": ["77df9ef4759a11ef8bdd0242ac120004"],
          "metadata_condition": {
+            "logic": "and",
            "conditions": [
              {
                "name": "author",
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@ -429,7 +429,7 @@ def rank_memories(chat_mdl, goal:str, sub_goal:str, tool_call_summaries: list[st
    return re.sub(r"^.*</think>", "", ans, flags=re.DOTALL)


-def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
+def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> dict:
    sys_prompt = PROMPT_JINJA_ENV.from_string(META_FILTER).render(
        current_date=datetime.datetime.today().strftime('%Y-%m-%d'),
        metadata_keys=json.dumps(meta_data),
@ -440,11 +440,13 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
    ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
    try:
        ans = json_repair.loads(ans)
-        assert isinstance(ans, list), ans
+        assert isinstance(ans, dict), ans
+        assert "conditions" in ans and isinstance(ans["conditions"], list), ans
        return ans
    except Exception:
        logging.exception(f"Loading json failure: {ans}")
-    return []
+
+    return {"conditions": []}


 def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
--- a/rag/prompts/meta_filter.md
+++ b/rag/prompts/meta_filter.md
@ -9,11 +9,13 @@ You are a metadata filtering condition generator. Analyze the user's question an
     }

 2. **Output Requirements**:
-   - Always output a JSON array of filter objects
-   - Each object must have:
+   - Always output a JSON dictionary with only 2 keys: 'conditions'(filter objects) and 'logic' between the conditions ('and' or 'or').
+   - Each filter object in conditions must have:
        "key": (metadata attribute name),
        "value": (string value to compare),
        "op": (operator from allowed list)
+   - Logic between all the conditions: 'and'(Intersection of results for each condition) / 'or' (union of results for all conditions)
+

 3. **Operator Guide**:
   - Use these operators only: ["contains", "not contains", "start with", "end with", "empty", "not empty", "=", "≠", ">", "<", "≥", "≤"]
@ -32,22 +34,97 @@ You are a metadata filtering condition generator. Analyze the user's question an
        - Attribute doesn't exist in metadata
        - Value has no match in metadata

-5. **Example**:
+5. **Example A**:
   - User query: "上市日期七月份的有哪些商品，不要蓝色的"
   - Metadata: { "color": {...}, "listing_date": {...} }
   - Output: 
-        [
+   {
+        "logic": "and",
+        "conditions": [
          {"key": "listing_date", "value": "2025-07-01", "op": "≥"},
          {"key": "listing_date", "value": "2025-08-01", "op": "<"},
          {"key": "color", "value": "blue", "op": "≠"}
        ]
+   }

-6. **Final Output**:
-   - ONLY output valid JSON array
+6. **Example B**:
+   - User query: "Both blue and red are acceptable."
+   - Metadata: { "color": {...}, "listing_date": {...} }
+   - Output: 
+   {
+        "logic": "or",
+        "conditions": [
+          {"key": "color", "value": "blue", "op": "="},
+          {"key": "color", "value": "red", "op": "="}
+        ]
+   }
+
+7. **Final Output**:
+   - ONLY output valid JSON dictionary
   - NO additional text/explanations
+   - Json schema is as following:
+```json
+{
+  "type": "object",
+  "properties": {
+    "logic": {
+      "type": "string",
+      "description": "Logic relationship between all the conditions, the default is 'and'.",
+      "enum": [
+        "and",
+        "or"
+      ]
+    },
+    "conditions": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "key": {
+            "type": "string",
+            "description": "Metadata attribute name."
+          },
+          "value": {
+            "type": "string",
+            "description": "Value to compare."
+          },
+          "op": {
+            "type": "string",
+            "description": "Operator from allowed list.",
+            "enum": [
+              "contains",
+              "not contains",
+              "start with",
+              "end with",
+              "empty",
+              "not empty",
+              "=",
+              "≠",
+              ">",
+              "<",
+              "≥",
+              "≤"
+            ]
+          }
+        },
+        "required": [
+          "key",
+          "value",
+          "op"
+        ],
+        "additionalProperties": false
+      }
+    }
+  },
+  "required": [
+    "conditions"
+  ],
+  "additionalProperties": false
+}
+```

 **Current Task**:
- Today's date: {{current_date}}
- Available metadata keys: {{metadata_keys}}
- User query: "{{user_question}}"
+- Today's date: {{ current_date }}
+- Available metadata keys: {{ metadata_keys }}
+- User query: "{{ user_question }}"