update sdk document (#2374)

### What problem does this PR solve?

_Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
2026-01-28 14:16:34 +08:00 · 2024-09-12 14:19:45 +08:00
parent 6000c3e304
commit 6722b3d558
7 changed files with 472 additions and 2 deletions
--- a/sdk/python/ragflow/init.py
+++ b/sdk/python/ragflow/init.py
@ -5,4 +5,5 @@ __version__ = importlib.metadata.version("ragflow")
 from .ragflow import RAGFlow
 from .modules.dataset import DataSet
 from .modules.assistant import Assistant
-from .modules.session import Session
+from .modules.session import Session
+from .modules.document import Document
--- a/sdk/python/ragflow/modules/dataset.py
+++ b/sdk/python/ragflow/modules/dataset.py
@ -1,3 +1,7 @@
+from typing import Optional, List
+
+from .document import Document
+
 from .base import Base


@ -46,3 +50,39 @@ class DataSet(Base):
        res = res.json()
        if res.get("retmsg") == "success": return True
        raise Exception(res["retmsg"])
+
+    def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
+        """
+        Fetch the documents of this dataset from the server.
+
+        Args:
+            keywords (Optional[str]): Keyword filter forwarded to the server. Defaults to None.
+            offset (int): Pagination offset forwarded to the server. Defaults to 0.
+            limit (int): Page size forwarded to the server. Defaults to -1.
+
+        Returns:
+            List[Document]: One Document per entry of the response's "docs" list.
+
+        Raises:
+            Exception: Carrying the server's "retmsg" when it is not "success".
+        """
+        query = {"kb_id": self.id, "keywords": keywords, "offset": offset, "limit": limit}
+        body = self.get(f'/doc/dataset/{self.id}/documents', query).json()
+
+        # Anything other than an explicit "success" is reported to the caller.
+        status = body.get("retmsg")
+        if status != "success":
+            raise Exception(status)
+
+        # A missing "docs" key is treated as an empty result.
+        return [Document(self.rag, entry) for entry in body["data"].get("docs", [])]
--- a/sdk/python/ragflow/modules/document.py
+++ b/sdk/python/ragflow/modules/document.py
@ -0,0 +1,75 @@
+
+from .base import Base
+
+
+
+class Document(Base):
+    """Client-side handle for a single document stored on the server."""
+
+    def __init__(self, rag, res_dict):
+        """
+        Build a Document from a dictionary of field values.
+
+        Args:
+            rag: Client object handed through to ``Base.__init__``.
+            res_dict (dict): Field values for the document. Keys that do not
+                match one of the attributes declared below are ignored. The
+                dictionary itself is not modified.
+        """
+        self.id = ""
+        self.name = ""
+        self.thumbnail = None
+        self.kb_id = None
+        self.parser_method = ""
+        self.parser_config = {"pages": [[1, 1000000]]}
+        self.source_type = "local"
+        self.type = ""
+        self.created_by = ""
+        self.size = 0
+        self.token_num = 0
+        self.chunk_num = 0
+        self.progress = 0.0
+        self.progress_msg = ""
+        self.process_begin_at = None
+        self.process_duration = 0.0
+        # Filter into a new dict instead of popping from res_dict, so the
+        # caller's dictionary is left untouched.
+        known_fields = {k: v for k, v in res_dict.items() if k in self.__dict__}
+        super().__init__(rag, known_fields)
+
+    def save(self) -> bool:
+        """
+        Save the document details to the server.
+
+        Returns:
+            bool: True when the server answers with retmsg "success".
+
+        Raises:
+            Exception: Carrying the server's retmsg otherwise.
+        """
+        # parser_config defaults to a plain dict in __init__, which has no
+        # to_json(); only convert values that actually provide it.
+        parser_config = self.parser_config
+        if hasattr(parser_config, "to_json"):
+            parser_config = parser_config.to_json()
+        res = self.post('/doc/save',
+                        {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "kb_id": self.kb_id,
+                         "parser_id": self.parser_method, "parser_config": parser_config,
+                         "source_type": self.source_type, "type": self.type, "created_by": self.created_by,
+                         "size": self.size, "token_num": self.token_num, "chunk_num": self.chunk_num,
+                         "progress": self.progress, "progress_msg": self.progress_msg,
+                         # NOTE(review): the key spelling "process_duation" is kept as-is;
+                         # confirm the server's field name before changing it.
+                         "process_begin_at": self.process_begin_at, "process_duation": self.process_duration
+                         })
+        res = res.json()
+        if res.get("retmsg") == "success":
+            return True
+        raise Exception(res["retmsg"])
+
+    def delete(self) -> bool:
+        """
+        Delete the document from the server.
+
+        Returns:
+            bool: True when the server answers with retmsg "success".
+
+        Raises:
+            Exception: Carrying the server's retmsg otherwise.
+        """
+        res = self.rm('/doc/delete',
+                      {"doc_id": self.id})
+        res = res.json()
+        if res.get("retmsg") == "success":
+            return True
+        raise Exception(res["retmsg"])
+
+    def download(self) -> bytes:
+        """
+        Download the document content from the server.
+
+        :return: The downloaded document content in bytes.
+        :raises Exception: If the server responds with a status code other than 200.
+        """
+        # The URL is built from the knowledge base ID and the document ID.
+        # NOTE(review): the authorization header travels in the same dict as
+        # the other request values; confirm this is what Base.get expects.
+        res = self.get(f"/doc/{self.kb_id}/documents/{self.id}",
+                       {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})
+
+        # Check the response status code to ensure the request was successful
+        if res.status_code == 200:
+            # Return the document content as bytes
+            return res.content
+        else:
+            # Handle the error and raise an exception
+            raise Exception(
+                f"Failed to download document. Server responded with: {res.status_code}, {res.text}"
+            )
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@ -19,7 +19,7 @@ import requests

 from .modules.assistant import Assistant
 from .modules.dataset import DataSet
-
+from .modules.document import Document

 class RAGFlow:
    def __init__(self, user_key, base_url, version='v1'):
@ -142,3 +142,32 @@ class RAGFlow:
                result_list.append(Assistant(self, data))
            return result_list
        raise Exception(res["retmsg"])
+
+    def create_document(self, ds:DataSet, name: str, blob: bytes) -> bool:
+        """
+        Upload a file to a dataset as a new document.
+
+        Args:
+            ds (DataSet): Target dataset; its id is used in the URL and the form data.
+            name (str): File name sent with the upload.
+            blob (bytes): Raw file content.
+
+        Returns:
+            bool: True when the server answers HTTP 200 with retmsg "success".
+
+        Raises:
+            Exception: "Upload failed: ..." with the server's retmsg, or with
+                the raw response text when the body is not valid JSON.
+        """
+        url = f"/doc/dataset/{ds.id}/documents/upload"
+        files = {
+            'file': (name, blob)
+        }
+        data = {
+            'kb_id': ds.id
+        }
+        headers = {
+            'Authorization': f"Bearer {ds.rag.user_key}"
+        }
+
+        response = requests.post(self.api_url + url, data=data, files=files,
+                                 headers=headers)
+
+        # Decode the body once. A non-JSON body must surface as an upload
+        # failure rather than as a JSON decoding error.
+        try:
+            retmsg = response.json().get('retmsg')
+        except ValueError:
+            raise Exception(f"Upload failed: {response.text}")
+
+        if response.status_code == 200 and retmsg == 'success':
+            return True
+        raise Exception(f"Upload failed: {retmsg}")
+
+    def get_document(self, id: str = None, name: str = None) -> Document:
+        """
+        Look up a single document on the server.
+
+        Args:
+            id (str): Document id sent with the request. Defaults to None.
+            name (str): Document name sent with the request. Defaults to None.
+
+        Returns:
+            Document: Built from the "data" field of the response.
+
+        Raises:
+            Exception: Carrying the server's retmsg when it is not "success".
+        """
+        reply = self.get("/doc/infos", {"id": id, "name": name}).json()
+        if reply.get("retmsg") != "success":
+            raise Exception(reply["retmsg"])
+        return Document(self, reply['data'])
+