From 60d652d2e1e8b765989bf90ea561283f61a075c5 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Mon, 4 Aug 2025 16:35:35 +0800 Subject: [PATCH] Feat: list documents supports range filtering (#9214) ### What problem does this PR solve? list_document supports range filtering. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/document_app.py | 10 +++++++++ api/apps/sdk/doc.py | 25 +++++++++++++++++++++- docs/references/http_api_reference.md | 10 ++++++--- docs/references/python_api_reference.md | 17 ++++++++++++++- sdk/python/ragflow_sdk/modules/dataset.py | 26 +++++++++++++++++++++-- 5 files changed, 81 insertions(+), 7 deletions(-) diff --git a/api/apps/document_app.py b/api/apps/document_app.py index aa5ee7d79..d7aa5d829 100644 --- a/api/apps/document_app.py +++ b/api/apps/document_app.py @@ -206,6 +206,8 @@ def list_docs(): desc = False else: desc = True + create_time_from = int(request.args.get("create_time_from", 0)) + create_time_to = int(request.args.get("create_time_to", 0)) req = request.get_json() @@ -226,6 +228,14 @@ def list_docs(): try: docs, tol = DocumentService.get_by_kb_id(kb_id, page_number, items_per_page, orderby, desc, keywords, run_status, types, suffix) + if create_time_from or create_time_to: + filtered_docs = [] + for doc in docs: + doc_create_time = doc.get("create_time", 0) + if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to): + filtered_docs.append(doc) + docs = filtered_docs + for doc_item in docs: if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX): doc_item["thumbnail"] = f"/v1/document/image/{kb_id}-{doc_item['thumbnail']}" diff --git a/api/apps/sdk/doc.py b/api/apps/sdk/doc.py index d81d16f30..24420f0f3 100644 --- a/api/apps/sdk/doc.py +++ b/api/apps/sdk/doc.py @@ -38,7 +38,7 @@ from api.utils.api_utils import check_duplicate_ids, construct_json_result, get_ from rag.app.qa 
import beAdoc, rmPrefix from rag.app.tag import label_question from rag.nlp import rag_tokenizer, search -from rag.prompts import keyword_extraction, cross_languages +from rag.prompts import cross_languages, keyword_extraction from rag.utils import rmSpace from rag.utils.storage_factory import STORAGE_IMPL @@ -456,6 +456,18 @@ def list_docs(dataset_id, tenant_id): required: false default: true description: Order in descending. + - in: query + name: create_time_from + type: integer + required: false + default: 0 + description: Unix timestamp for filtering documents created after this time. 0 means no filter. + - in: query + name: create_time_to + type: integer + required: false + default: 0 + description: Unix timestamp for filtering documents created before this time. 0 means no filter. - in: header name: Authorization type: string @@ -517,6 +529,17 @@ def list_docs(dataset_id, tenant_id): desc = True docs, tol = DocumentService.get_list(dataset_id, page, page_size, orderby, desc, keywords, id, name) + create_time_from = int(request.args.get("create_time_from", 0)) + create_time_to = int(request.args.get("create_time_to", 0)) + + if create_time_from or create_time_to: + filtered_docs = [] + for doc in docs: + doc_create_time = doc.get("create_time", 0) + if (create_time_from == 0 or doc_create_time >= create_time_from) and (create_time_to == 0 or doc_create_time <= create_time_to): + filtered_docs.append(doc) + docs = filtered_docs + # rename key's name renamed_doc_list = [] key_mapping = { diff --git a/docs/references/http_api_reference.md b/docs/references/http_api_reference.md index 83cb70ee7..1ac9fd80b 100644 --- a/docs/references/http_api_reference.md +++ b/docs/references/http_api_reference.md @@ -1118,14 +1118,14 @@ Failure: ### List documents -**GET** `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}` +**GET** 
`/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}` Lists documents in a specified dataset. #### Request - Method: GET -- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}` +- URL: `/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp}` - Headers: - `'content-Type: application/json'` - `'Authorization: Bearer '` @@ -1134,7 +1134,7 @@ Lists documents in a specified dataset. ```bash curl --request GET \ - --url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name} \ + --url http://{address}/api/v1/datasets/{dataset_id}/documents?page={page}&page_size={page_size}&orderby={orderby}&desc={desc}&keywords={keywords}&id={document_id}&name={document_name}&create_time_from={timestamp}&create_time_to={timestamp} \ --header 'Authorization: Bearer ' ``` @@ -1156,6 +1156,10 @@ curl --request GET \ Indicates whether the retrieved documents should be sorted in descending order. Defaults to `true`. - `id`: (*Filter parameter*), `string` The ID of the document to retrieve. +- `create_time_from`: (*Filter parameter*), `integer` + Unix timestamp for filtering documents created at or after this time. 0 means no filter. Defaults to `0`. +- `create_time_to`: (*Filter parameter*), `integer` + Unix timestamp for filtering documents created at or before this time. 0 means no filter. Defaults to `0`. 
#### Response diff --git a/docs/references/python_api_reference.md b/docs/references/python_api_reference.md index 12ff48678..f0ad981a3 100644 --- a/docs/references/python_api_reference.md +++ b/docs/references/python_api_reference.md @@ -507,7 +507,16 @@ print(doc) ### List documents ```python -Dataset.list_documents(id:str =None, keywords: str=None, page: int=1, page_size:int = 30, order_by:str = "create_time", desc: bool = True) -> list[Document] +Dataset.list_documents( + id: str = None, + keywords: str = None, + page: int = 1, + page_size: int = 30, + order_by: str = "create_time", + desc: bool = True, + create_time_from: int = 0, + create_time_to: int = 0 +) -> list[Document] ``` Lists documents in the current dataset. @@ -541,6 +550,12 @@ The field by which documents should be sorted. Available options: Indicates whether the retrieved documents should be sorted in descending order. Defaults to `True`. +##### create_time_from: `int` +Unix timestamp for filtering documents created at or after this time. 0 means no filter. Defaults to 0. + +##### create_time_to: `int` +Unix timestamp for filtering documents created at or before this time. 0 means no filter. Defaults to 0. + #### Returns - Success: A list of `Document` objects. 
diff --git a/sdk/python/ragflow_sdk/modules/dataset.py b/sdk/python/ragflow_sdk/modules/dataset.py index fc0bc8f5b..b4367ac3b 100644 --- a/sdk/python/ragflow_sdk/modules/dataset.py +++ b/sdk/python/ragflow_sdk/modules/dataset.py @@ -63,8 +63,30 @@ class DataSet(Base): return doc_list raise Exception(res.get("message")) - def list_documents(self, id: str | None = None, name: str | None = None, keywords: str | None = None, page: int = 1, page_size: int = 30, orderby: str = "create_time", desc: bool = True): - res = self.get(f"/datasets/{self.id}/documents", params={"id": id, "name": name, "keywords": keywords, "page": page, "page_size": page_size, "orderby": orderby, "desc": desc}) + def list_documents( + self, + id: str | None = None, + name: str | None = None, + keywords: str | None = None, + page: int = 1, + page_size: int = 30, + orderby: str = "create_time", + desc: bool = True, + create_time_from: int = 0, + create_time_to: int = 0, + ): + params = { + "id": id, + "name": name, + "keywords": keywords, + "page": page, + "page_size": page_size, + "orderby": orderby, + "desc": desc, + "create_time_from": create_time_from, + "create_time_to": create_time_to, + } + res = self.get(f"/datasets/{self.id}/documents", params=params) res = res.json() documents = [] if res.get("code") == 0: