mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: list documents supports range filtering (#9214)
### What problem does this PR solve?

Listing documents now supports filtering by a creation-time range via the new `create_time_from` and `create_time_to` parameters.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -206,6 +206,8 @@ def list_docs():
|
|||||||
desc = False
|
desc = False
|
||||||
else:
|
else:
|
||||||
desc = True
|
desc = True
|
||||||
|
create_time_from = int(request.args.get("create_time_from", 0))
|
||||||
|
create_time_to = int(request.args.get("create_time_to", 0))
|
||||||
|
|
||||||
req = request.get_json()
|
req = request.get_json()
|
||||||
|
|
||||||
@ -226,6 +228,14 @@ def list_docs():
|
|||||||
try:
|
try:
|
||||||
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix)
|
docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix)
|
||||||
|
|
||||||
|
if create_time_from or create_time_to:
|
||||||
|
filtered_docs = []
|
||||||
|
for doc in docs:
|
||||||
|
doc_create_time = doc.get("create_time", 0)
|
||||||
|
if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
|
||||||
|
filtered_docs.append(doc)
|
||||||
|
docs = filtered_docs
|
||||||
|
|
||||||
for doc_item in docs:
|
for doc_item in docs:
|
||||||
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
|
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
|
||||||
doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
|
doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}"
|
||||||
|
|||||||
@ -38,7 +38,7 @@ from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_
|
|||||||
from rag.app.qa import beAdoc, rmPrefix
|
from rag.app.qa import beAdoc, rmPrefix
|
||||||
from rag.app.tag import label_question
|
from rag.app.tag import label_question
|
||||||
from rag.nlp import rag_tokenizer, search
|
from rag.nlp import rag_tokenizer, search
|
||||||
from rag.prompts import keyword_extraction, cross_languages
|
from rag.prompts import cross_languages, keyword_extraction
|
||||||
from rag.utils import rmSpace
|
from rag.utils import rmSpace
|
||||||
from rag.utils.storage_factory import STORAGE_IMPL
|
from rag.utils.storage_factory import STORAGE_IMPL
|
||||||
|
|
||||||
@ -456,6 +456,18 @@ def list_docs(dataset_id, tenant_id):
|
|||||||
required: false
|
required: false
|
||||||
default: true
|
default: true
|
||||||
description: Order in descending.
|
description: Order in descending.
|
||||||
|
- in: query
|
||||||
|
name: create_time_from
|
||||||
|
type: integer
|
||||||
|
required: false
|
||||||
|
default: 0
|
||||||
|
description: Unix timestamp for filtering documents created after this time. 0 means no filter.
|
||||||
|
- in: query
|
||||||
|
name: create_time_to
|
||||||
|
type: integer
|
||||||
|
required: false
|
||||||
|
default: 0
|
||||||
|
description: Unix timestamp for filtering documents created before this time. 0 means no filter.
|
||||||
- in: header
|
- in: header
|
||||||
name: Authorization
|
name: Authorization
|
||||||
type: string
|
type: string
|
||||||
@ -517,6 +529,17 @@ def list_docs(dataset_id, tenant_id):
|
|||||||
desc = True
|
desc = True
|
||||||
docs, tol = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, id, name)
|
docs, tol = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, id, name)
|
||||||
|
|
||||||
|
create_time_from = int(request.args.get("create_time_from", 0))
|
||||||
|
create_time_to = int(request.args.get("create_time_to", 0))
|
||||||
|
|
||||||
|
if create_time_from or create_time_to:
|
||||||
|
filtered_docs = []
|
||||||
|
for doc in docs:
|
||||||
|
doc_create_time = doc.get("create_time", 0)
|
||||||
|
if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to):
|
||||||
|
filtered_docs.append(doc)
|
||||||
|
docs = filtered_docs
|
||||||
|
|
||||||
# rename key's name
|
# rename key's name
|
||||||
renamed_doc_list = []
|
renamed_doc_list = []
|
||||||
key_mapping = {
|
key_mapping = {
|
||||||
|
|||||||
@ -1118,14 +1118,14 @@ Failure:
|
|||||||
|
|
||||||
### List documents
|
### List documents
|
||||||
|
|
||||||
**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}`
|
**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
|
||||||
|
|
||||||
Lists documents in a specified dataset.
|
Lists documents in a specified dataset.
|
||||||
|
|
||||||
#### Request
|
#### Request
|
||||||
|
|
||||||
- Method: GET
|
- Method: GET
|
||||||
- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}`
|
- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}`
|
||||||
- Headers:
|
- Headers:
|
||||||
- `'content-Type: application/json'`
|
- `'content-Type: application/json'`
|
||||||
- `'Authorization: Bearer <YOUR_API_KEY>'`
|
- `'Authorization: Bearer <YOUR_API_KEY>'`
|
||||||
@ -1134,7 +1134,7 @@ Lists documents in a specified dataset.
|
|||||||
|
|
||||||
```bash
|
```bash
|
||||||
curl --request GET \
|
curl --request GET \
|
||||||
--url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name} \
|
--url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp} \
|
||||||
--header 'Authorization: Bearer <YOUR_API_KEY>'
|
--header 'Authorization: Bearer <YOUR_API_KEY>'
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -1156,6 +1156,10 @@ curl --request GET \
|
|||||||
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`.
|
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`.
|
||||||
- `id`: (*Filter parameter*), `string`
|
- `id`: (*Filter parameter*), `string`
|
||||||
The ID of the document to retrieve.
|
The ID of the document to retrieve.
|
||||||
|
- `create_time_from`: (*Filter parameter*), `integer`
|
||||||
|
Unix timestamp lower bound (inclusive): only documents created at or after this time are returned. `0` means no lower bound. Defaults to `0`.
|
||||||
|
- `create_time_to`: (*Filter parameter*), `integer`
|
||||||
|
Unix timestamp upper bound (inclusive): only documents created at or before this time are returned. `0` means no upper bound. Defaults to `0`.
|
||||||
|
|
||||||
#### Response
|
#### Response
|
||||||
|
|
||||||
|
|||||||
@ -507,7 +507,16 @@ print(doc)
|
|||||||
### List documents
|
### List documents
|
||||||
|
|
||||||
```python
|
```python
|
||||||
Dataset.list_documents(id:str =None, keywords: str=None, page: int=1, page_size:int = 30, order_by:str = "create_time", desc: bool = True) -> list[Document]
|
Dataset.list_documents(
|
||||||
|
id: str = None,
|
||||||
|
keywords: str = None,
|
||||||
|
page: int = 1,
|
||||||
|
page_size: int = 30,
|
||||||
|
orderby: str = "create_time",
|
||||||
|
desc: bool = True,
|
||||||
|
create_time_from: int = 0,
|
||||||
|
create_time_to: int = 0
|
||||||
|
) -> list[Document]
|
||||||
```
|
```
|
||||||
|
|
||||||
Lists documents in the current dataset.
|
Lists documents in the current dataset.
|
||||||
@ -541,6 +550,12 @@ The field by which documents should be sorted. Available options:
|
|||||||
|
|
||||||
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `True`.
|
Indicates whether the retrieved documents should be sorted in descending order. Defaults to `True`.
|
||||||
|
|
||||||
|
##### create_time_from: `int`
|
||||||
|
Unix timestamp lower bound (inclusive): only documents created at or after this time are returned. `0` means no lower bound. Defaults to `0`.
|
||||||
|
|
||||||
|
##### create_time_to: `int`
|
||||||
|
Unix timestamp upper bound (inclusive): only documents created at or before this time are returned. `0` means no upper bound. Defaults to `0`.
|
||||||
|
|
||||||
#### Returns
|
#### Returns
|
||||||
|
|
||||||
- Success: A list of `Document` objects.
|
- Success: A list of `Document` objects.
|
||||||
|
|||||||
@ -63,8 +63,30 @@ class DataSet(Base):
|
|||||||
return doc_list
|
return doc_list
|
||||||
raise Exception(res.get("message"))
|
raise Exception(res.get("message"))
|
||||||
|
|
||||||
def list_documents(self, id: str | None = None, name: str | None = None, keywords: str | None = None, page: int = 1, page_size: int = 30, orderby: str = "create_time", desc: bool = True):
|
def list_documents(
|
||||||
res = self.get(f"/datasets/{self.id}/documents", params={"id": id, "name": name, "keywords": keywords, "page": page, "page_size": page_size, "orderby": orderby, "desc": desc})
|
self,
|
||||||
|
id: str | None = None,
|
||||||
|
name: str | None = None,
|
||||||
|
keywords: str | None = None,
|
||||||
|
page: int = 1,
|
||||||
|
page_size: int = 30,
|
||||||
|
orderby: str = "create_time",
|
||||||
|
desc: bool = True,
|
||||||
|
create_time_from: int = 0,
|
||||||
|
create_time_to: int = 0,
|
||||||
|
):
|
||||||
|
params = {
|
||||||
|
"id": id,
|
||||||
|
"name": name,
|
||||||
|
"keywords": keywords,
|
||||||
|
"page": page,
|
||||||
|
"page_size": page_size,
|
||||||
|
"orderby": orderby,
|
||||||
|
"desc": desc,
|
||||||
|
"create_time_from": create_time_from,
|
||||||
|
"create_time_to": create_time_to,
|
||||||
|
}
|
||||||
|
res = self.get(f"/datasets/{self.id}/documents", params=params)
|
||||||
res = res.json()
|
res = res.json()
|
||||||
documents = []
|
documents = []
|
||||||
if res.get("code") == 0:
|
if res.get("code") == 0:
|
||||||
|
|||||||
Reference in New Issue
Block a user