Feat: Add metadata filtering function for /api/v1/retrieval (#9877)

- Added the metadata_condition parameter to the document retrieval
  interface to filter documents by their metadata.
- Updated the API documentation and added explanations for the
  metadata_condition parameter.

### What problem does this PR solve?

Allows the /api/v1/retrieval API to use metadata filtering as well.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
天海蒼灆
2025-09-05 11:12:15 +08:00
committed by GitHub
parent 8e30a75e5c
commit 677c99b090
4 changed files with 22 additions and 12 deletions

View File

@ -35,6 +35,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from api.db.services.tenant_llm_service import TenantLLMService from api.db.services.tenant_llm_service import TenantLLMService
from api.db.services.task_service import TaskService, queue_tasks from api.db.services.task_service import TaskService, queue_tasks
from api.db.services.dialog_service import meta_filter
from api.apps.sdk.dify_retrieval import convert_conditions
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_result, server_error_response, token_required from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_result, server_error_response, token_required
from rag.app.qa import beAdoc, rmPrefix from rag.app.qa import beAdoc, rmPrefix
from rag.app.tag import label_question from rag.app.tag import label_question
@ -1350,6 +1352,9 @@ def retrieval_test(tenant_id):
highlight: highlight:
type: boolean type: boolean
description: Whether to highlight matched content. description: Whether to highlight matched content.
metadata_condition:
type: object
description: metadata filter condition.
- in: header - in: header
name: Authorization name: Authorization
type: string type: string
@ -1413,6 +1418,10 @@ def retrieval_test(tenant_id):
for doc_id in doc_ids: for doc_id in doc_ids:
if doc_id not in doc_ids_list: if doc_id not in doc_ids_list:
return get_error_data_result(f"The datasets don't own the document {doc_id}") return get_error_data_result(f"The datasets don't own the document {doc_id}")
if not doc_ids:
metadata_condition = req.get("metadata_condition", {})
metas = DocumentService.get_meta_by_kbs(kb_ids)
doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
similarity_threshold = float(req.get("similarity_threshold", 0.2)) similarity_threshold = float(req.get("similarity_threshold", 0.2))
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3)) vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
top = int(req.get("top_k", 1024)) top = int(req.get("top_k", 1024))

View File

@ -1808,7 +1808,8 @@ Retrieves chunks from specified datasets.
- `"rerank_id"`: `string` - `"rerank_id"`: `string`
- `"keyword"`: `boolean` - `"keyword"`: `boolean`
- `"highlight"`: `boolean` - `"highlight"`: `boolean`
- `"cross_languages"`: `list[string]` - `"cross_languages"`: `list[string]`
- `"metadata_condition"`: `object`
##### Request example ##### Request example
@ -1855,7 +1856,8 @@ curl --request POST \
- `false`: Disable highlighting of matched terms (default). - `false`: Disable highlighting of matched terms (default).
- `"cross_languages"`: (*Body parameter*) `list[string]` - `"cross_languages"`: (*Body parameter*) `list[string]`
The languages that should be translated into, in order to achieve keywords retrievals in different languages. The languages that should be translated into, in order to achieve keywords retrievals in different languages.
- `"metadata_condition"`: (*Body parameter*), `object`
The metadata condition for filtering chunks.
#### Response #### Response
Success: Success:

View File

@ -921,7 +921,7 @@ chunk.update({"content":"sdfx..."})
### Retrieve chunks ### Retrieve chunks
```python ```python
RAGFlow.retrieve(question:str="", dataset_ids:list[str]=None, document_ids=list[str]=None, page:int=1, page_size:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,highlight:bool=False) -> list[Chunk] RAGFlow.retrieve(question:str="", dataset_ids:list[str]=None, document_ids=list[str]=None, page:int=1, page_size:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,cross_languages:list[str]=None,metadata_condition: dict=None) -> list[Chunk]
``` ```
Retrieves chunks from specified datasets. Retrieves chunks from specified datasets.
@ -971,17 +971,14 @@ Indicates whether to enable keyword-based matching:
- `True`: Enable keyword-based matching. - `True`: Enable keyword-based matching.
- `False`: Disable keyword-based matching (default). - `False`: Disable keyword-based matching (default).
##### highlight: `bool`
Specifies whether to enable highlighting of matched terms in the results:
- `True`: Enable highlighting of matched terms.
- `False`: Disable highlighting of matched terms (default).
##### cross_languages: `list[string]` ##### cross_languages: `list[string]`
The languages that should be translated into, in order to achieve keywords retrievals in different languages. The languages that should be translated into, in order to achieve keywords retrievals in different languages.
##### metadata_condition: `dict`
filter condition for meta_fields
#### Returns #### Returns
- Success: A list of `Chunk` objects representing the document chunks. - Success: A list of `Chunk` objects representing the document chunks.

View File

@ -197,7 +197,8 @@ class RAGFlow:
top_k=1024, top_k=1024,
rerank_id: str | None = None, rerank_id: str | None = None,
keyword: bool = False, keyword: bool = False,
cross_languages: list[str]|None = None cross_languages: list[str]|None = None,
metadata_condition: dict | None = None,
): ):
if document_ids is None: if document_ids is None:
document_ids = [] document_ids = []
@ -212,7 +213,8 @@ class RAGFlow:
"question": question, "question": question,
"dataset_ids": dataset_ids, "dataset_ids": dataset_ids,
"document_ids": document_ids, "document_ids": document_ids,
"cross_languages": cross_languages "cross_languages": cross_languages,
"metadata_condition": metadata_condition
} }
# Send a POST request to the backend service (using requests library as an example, actual implementation may vary) # Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
res = self.post("/retrieval", json=data_json) res = self.post("/retrieval", json=data_json)