From 677c99b090f76e4a1cc94447841b0adab3b03afc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=A4=A9=E6=B5=B7=E8=92=BC=E7=81=86?= Date: Fri, 5 Sep 2025 11:12:15 +0800 Subject: [PATCH] Feat: Add metadata filtering function for /api/v1/retrieval (#9877) - Added the metadata_condition parameter in the document retrieval interface to filter document metadata - Updated the API documentation and added explanations for the metadata_condition parameter ### What problem does this PR solve? Allow the /api/v1/retrieval API to use metadata filters as well. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/sdk/doc.py | 9 +++++++++ docs/references/http_api_reference.md | 6 ++++-- docs/references/python_api_reference.md | 13 +++++-------- sdk/python/ragflow_sdk/ragflow.py | 6 ++++-- 4 files changed, 22 insertions(+), 12 deletions(-) diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index 454633f07..50e6f7bba 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -35,6 +35,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.llm_service import LLMBundle from api.db.services.tenant_llm_service import TenantLLMService from api.db.services.task_service import TaskService, queue_tasks +from api.db.services.dialog_service import meta_filter +from api.apps.sdk.dify_retrieval import convert_conditions from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_result, server_error_response, token_required from rag.app.qa import beAdoc, rmPrefix from rag.app.tag import label_question @@ -1350,6 +1352,9 @@ def retrieval_test(tenant_id): highlight: type: boolean description: Whether to highlight matched content. + metadata_condition: + type: object + description: metadata filter condition. 
- in: header name: Authorization type: string @@ -1413,6 +1418,10 @@ def retrieval_test(tenant_id): for doc_id in doc_ids: if doc_id not in doc_ids_list: return get_error_data_result(f"The datasets don't own the document {doc_id}") + if not doc_ids: + metadata_condition = req.get("metadata_condition", {}) + metas = DocumentService.get_meta_by_kbs(kb_ids) + doc_ids = meta_filter(metas, convert_conditions(metadata_condition)) similarity_threshold = float(req.get("similarity_threshold", 0.2)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) top = int(req.get("top_k", 1024)) diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 6dbeaf444..791701ebf 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -1808,7 +1808,8 @@ Retrieves chunks from specified datasets. - `"rerank_id"`: `string` - `"keyword"`: `boolean` - `"highlight"`: `boolean` - - `"cross_languages"`: `list[string]` + - `"cross_languages"`: `list[string]` + - `"metadata_condition"`: `object` ##### Request example @@ -1855,7 +1856,8 @@ curl --request POST \ - `false`: Disable highlighting of matched terms (default). - `"cross_languages"`: (*Body parameter*) `list[string]` The languages that should be translated into, in order to achieve keywords retrievals in different languages. - +- `"metadata_condition"`: (*Body parameter*), `object` + The metadata condition for filtering chunks. 
#### Response Success: diff --git a/docs/references/python_api_reference.md b/docs/references/python_api_reference.md index d13b0516e..1d788f6ab 100644 --- a/docs/references/python_api_reference.md +++ b/docs/references/python_api_reference.md @@ -921,7 +921,7 @@ chunk.update({"content":"sdfx..."}) ### Retrieve chunks ```python -RAGFlow.retrieve(question:str="", dataset_ids:list[str]=None, document_ids=list[str]=None, page:int=1, page_size:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,highlight:bool=False) -> list[Chunk] +RAGFlow.retrieve(question:str="", dataset_ids:list[str]=None, document_ids=list[str]=None, page:int=1, page_size:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,cross_languages:list[str]=None,metadata_condition: dict=None) -> list[Chunk] ``` Retrieves chunks from specified datasets. @@ -971,17 +971,14 @@ Indicates whether to enable keyword-based matching: - `True`: Enable keyword-based matching. - `False`: Disable keyword-based matching (default). -##### highlight: `bool` - -Specifies whether to enable highlighting of matched terms in the results: - -- `True`: Enable highlighting of matched terms. -- `False`: Disable highlighting of matched terms (default). - ##### cross_languages: `list[string]` The languages that should be translated into, in order to achieve keywords retrievals in different languages. +##### metadata_condition: `dict` + +filter condition for meta_fields + #### Returns - Success: A list of `Chunk` objects representing the document chunks. 
diff --git a/sdk/python/ragflow_sdk/ragflow.py b/sdk/python/ragflow_sdk/ragflow.py index b38851aad..f200a6b5c 100644 --- a/sdk/python/ragflow_sdk/ragflow.py +++ b/sdk/python/ragflow_sdk/ragflow.py @@ -197,7 +197,8 @@ class RAGFlow: top_k=1024, rerank_id: str | None = None, keyword: bool = False, - cross_languages: list[str]|None = None + cross_languages: list[str]|None = None, + metadata_condition: dict | None = None, ): if document_ids is None: document_ids = [] @@ -212,7 +213,8 @@ class RAGFlow: "question": question, "dataset_ids": dataset_ids, "document_ids": document_ids, - "cross_languages": cross_languages + "cross_languages": cross_languages, + "metadata_condition": metadata_condition } # Send a POST request to the backend service (using requests library as an example, actual implementation may vary) res = self.post("/retrieval", json=data_json)