mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: Add metadata filtering function for /api/v1/retrieval (#9877)
-Added the metadata_condition parameter to the document retrieval interface to filter documents by their metadata -Updated the API documentation and added explanations for the metadata_condition parameter ### What problem does this PR solve? Makes the /api/v1/retrieval API also support metadata filtering ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -35,6 +35,8 @@ from api.db.services.knowledgebase_service import KnowledgebaseService
|
|||||||
from api.db.services.llm_service import LLMBundle
|
from api.db.services.llm_service import LLMBundle
|
||||||
from api.db.services.tenant_llm_service import TenantLLMService
|
from api.db.services.tenant_llm_service import TenantLLMService
|
||||||
from api.db.services.task_service import TaskService, queue_tasks
|
from api.db.services.task_service import TaskService, queue_tasks
|
||||||
|
from api.db.services.dialog_service import meta_filter
|
||||||
|
from api.apps.sdk.dify_retrieval import convert_conditions
|
||||||
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_result, server_error_response, token_required
|
from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_error_data_result, get_parser_config, get_result, server_error_response, token_required
|
||||||
from rag.app.qa import beAdoc, rmPrefix
|
from rag.app.qa import beAdoc, rmPrefix
|
||||||
from rag.app.tag import label_question
|
from rag.app.tag import label_question
|
||||||
@ -1350,6 +1352,9 @@ def retrieval_test(tenant_id):
|
|||||||
highlight:
|
highlight:
|
||||||
type: boolean
|
type: boolean
|
||||||
description: Whether to highlight matched content.
|
description: Whether to highlight matched content.
|
||||||
|
metadata_condition:
|
||||||
|
type: object
|
||||||
|
description: metadata filter condition.
|
||||||
- in: header
|
- in: header
|
||||||
name: Authorization
|
name: Authorization
|
||||||
type: string
|
type: string
|
||||||
@ -1413,6 +1418,10 @@ def retrieval_test(tenant_id):
|
|||||||
for doc_id in doc_ids:
|
for doc_id in doc_ids:
|
||||||
if doc_id not in doc_ids_list:
|
if doc_id not in doc_ids_list:
|
||||||
return get_error_data_result(f"The datasets don't own the document {doc_id}")
|
return get_error_data_result(f"The datasets don't own the document {doc_id}")
|
||||||
|
if not doc_ids:
|
||||||
|
metadata_condition = req.get("metadata_condition", {})
|
||||||
|
metas = DocumentService.get_meta_by_kbs(kb_ids)
|
||||||
|
doc_ids = meta_filter(metas, convert_conditions(metadata_condition))
|
||||||
similarity_threshold = float(req.get("similarity_threshold", 0.2))
|
similarity_threshold = float(req.get("similarity_threshold", 0.2))
|
||||||
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
|
vector_similarity_weight = float(req.get("vector_similarity_weight", 0.3))
|
||||||
top = int(req.get("top_k", 1024))
|
top = int(req.get("top_k", 1024))
|
||||||
|
|||||||
@ -1808,7 +1808,8 @@ Retrieves chunks from specified datasets.
|
|||||||
- `"rerank_id"`: `string`
|
- `"rerank_id"`: `string`
|
||||||
- `"keyword"`: `boolean`
|
- `"keyword"`: `boolean`
|
||||||
- `"highlight"`: `boolean`
|
- `"highlight"`: `boolean`
|
||||||
- `"cross_languages"`: `list[string]`
|
- `"cross_languages"`: `list[string]`
|
||||||
|
- `"metadata_condition"`: `object`
|
||||||
|
|
||||||
##### Request example
|
##### Request example
|
||||||
|
|
||||||
@ -1855,7 +1856,8 @@ curl --request POST \
|
|||||||
- `false`: Disable highlighting of matched terms (default).
|
- `false`: Disable highlighting of matched terms (default).
|
||||||
- `"cross_languages"`: (*Body parameter*) `list[string]`
|
- `"cross_languages"`: (*Body parameter*) `list[string]`
|
||||||
The languages that should be translated into, in order to achieve keywords retrievals in different languages.
|
The languages that should be translated into, in order to achieve keywords retrievals in different languages.
|
||||||
|
- `"metadata_condition"`: (*Body parameter*) `object`
|
||||||
|
The metadata condition for filtering chunks.
|
||||||
#### Response
|
#### Response
|
||||||
|
|
||||||
Success:
|
Success:
|
||||||
|
|||||||
@ -921,7 +921,7 @@ chunk.update({"content":"sdfx..."})
|
|||||||
### Retrieve chunks
|
### Retrieve chunks
|
||||||
|
|
||||||
```python
|
```python
|
||||||
RAGFlow.retrieve(question:str="", dataset_ids:list[str]=None, document_ids=list[str]=None, page:int=1, page_size:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,highlight:bool=False) -> list[Chunk]
|
RAGFlow.retrieve(question:str="", dataset_ids:list[str]=None, document_ids:list[str]=None, page:int=1, page_size:int=30, similarity_threshold:float=0.2, vector_similarity_weight:float=0.3, top_k:int=1024,rerank_id:str=None,keyword:bool=False,cross_languages:list[str]=None,metadata_condition: dict=None) -> list[Chunk]
|
||||||
```
|
```
|
||||||
|
|
||||||
Retrieves chunks from specified datasets.
|
Retrieves chunks from specified datasets.
|
||||||
@ -971,17 +971,14 @@ Indicates whether to enable keyword-based matching:
|
|||||||
- `True`: Enable keyword-based matching.
|
- `True`: Enable keyword-based matching.
|
||||||
- `False`: Disable keyword-based matching (default).
|
- `False`: Disable keyword-based matching (default).
|
||||||
|
|
||||||
##### highlight: `bool`
|
|
||||||
|
|
||||||
Specifies whether to enable highlighting of matched terms in the results:
|
|
||||||
|
|
||||||
- `True`: Enable highlighting of matched terms.
|
|
||||||
- `False`: Disable highlighting of matched terms (default).
|
|
||||||
|
|
||||||
##### cross_languages: `list[string]`
|
##### cross_languages: `list[string]`
|
||||||
|
|
||||||
The languages that should be translated into, in order to achieve keywords retrievals in different languages.
|
The languages that should be translated into, in order to achieve keywords retrievals in different languages.
|
||||||
|
|
||||||
|
##### metadata_condition: `dict`
|
||||||
|
|
||||||
|
The filter condition applied to the documents' `meta_fields`.
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
- Success: A list of `Chunk` objects representing the document chunks.
|
- Success: A list of `Chunk` objects representing the document chunks.
|
||||||
|
|||||||
@ -197,7 +197,8 @@ class RAGFlow:
|
|||||||
top_k=1024,
|
top_k=1024,
|
||||||
rerank_id: str | None = None,
|
rerank_id: str | None = None,
|
||||||
keyword: bool = False,
|
keyword: bool = False,
|
||||||
cross_languages: list[str]|None = None
|
cross_languages: list[str]|None = None,
|
||||||
|
metadata_condition: dict | None = None,
|
||||||
):
|
):
|
||||||
if document_ids is None:
|
if document_ids is None:
|
||||||
document_ids = []
|
document_ids = []
|
||||||
@ -212,7 +213,8 @@ class RAGFlow:
|
|||||||
"question": question,
|
"question": question,
|
||||||
"dataset_ids": dataset_ids,
|
"dataset_ids": dataset_ids,
|
||||||
"document_ids": document_ids,
|
"document_ids": document_ids,
|
||||||
"cross_languages": cross_languages
|
"cross_languages": cross_languages,
|
||||||
|
"metadata_condition": metadata_condition
|
||||||
}
|
}
|
||||||
# Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
|
# Send a POST request to the backend service (using requests library as an example, actual implementation may vary)
|
||||||
res = self.post("/retrieval", json=data_json)
|
res = self.post("/retrieval", json=data_json)
|
||||||
|
|||||||
Reference in New Issue
Block a user