Refactor Document API (#2833)

### What problem does this PR solve?

Refactor Document API

### Type of change

- [x] Refactoring

Co-authored-by: liuhua <10215101452@stu.ecun.edu.cn>
2025-12-08 20:42:30 +08:00 · 2024-10-14 20:03:33 +08:00
parent df223eddf3
commit 6329427ad5
11 changed files with 393 additions and 418 deletions
--- a/sdk/python/ragflow/modules/base.py
+++ b/sdk/python/ragflow/modules/base.py
@ -18,8 +18,8 @@ class Base(object):
                    pr[name] = value
        return pr

-    def post(self, path, json, stream=False):
-        res = self.rag.post(path, json, stream=stream)
+    def post(self, path, json=None, stream=False, files=None):
+        res = self.rag.post(path, json, stream=stream,files=files)
        return res

    def get(self, path, params):
--- a/sdk/python/ragflow/modules/dataset.py
+++ b/sdk/python/ragflow/modules/dataset.py
@ -1,5 +1,7 @@
 from typing import Optional, List

+from transformers.models.bloom.modeling_bloom import bloom_gelu_back
+
 from .document import Document

 from .base import Base
@ -39,39 +41,27 @@ class DataSet(Base):
        if res.get("code") != 0:
            raise Exception(res["message"])

+    def upload_documents(self,document_list: List[dict]):
+        url = f"/dataset/{self.id}/document"
+        files = [("file",(ele["name"],ele["blob"])) for ele in document_list]
+        res = self.post(path=url,json=None,files=files)
+        res = res.json()
+        if res.get("code") != 0:
+            raise Exception(res.get("message"))

-    def list_docs(self, keywords: Optional[str] = None, offset: int = 0, limit: int = -1) -> List[Document]:
-        """
-        List the documents in the dataset, optionally filtering by keywords, with pagination support.
-
-        Args:
-            keywords (Optional[str]): A string of keywords to filter the documents. Defaults to None.
-            offset (int): The starting point for pagination. Defaults to 0.
-            limit (int): The maximum number of documents to return. Defaults to -1 (no limit).
-
-        Returns:
-            List[Document]: A list of Document objects.
-        """
-        # Construct the request payload for listing documents
-        payload = {
-            "knowledgebase_id": self.id,
-            "keywords": keywords,
-            "offset": offset,
-            "limit": limit
-        }
-
-        # Send the request to the server to list documents
-        res = self.get(f'/doc/dataset/{self.id}/documents', payload)
-        res_json = res.json()
-
-        # Handle response and error checking
-        if res_json.get("retmsg") != "success":
-            raise Exception(res_json.get("retmsg"))
-
-        # Parse the document data from the response
+    def list_documents(self, id: str = None, keywords: str = None, offset: int =1, limit: int = 1024, orderby: str = "create_time", desc: bool = True):
+        res = self.get(f"/dataset/{self.id}/info",params={"id": id,"keywords": keywords,"offset": offset,"limit": limit,"orderby": orderby,"desc": desc})
+        res = res.json()
        documents = []
-        for doc_data in res_json["data"].get("docs", []):
-            doc = Document(self.rag, doc_data)
-            documents.append(doc)
+        if res.get("code") == 0:
+            for document in res["data"].get("docs"):
+                documents.append(Document(self.rag,document))
+            return documents
+        raise Exception(res["message"])
+
+    def delete_documents(self,ids: List[str] = None):
+        res = self.rm(f"/dataset/{self.id}/document",{"ids":ids})
+        res = res.json()
+        if res.get("code") != 0:
+            raise Exception(res["message"])

-        return documents
--- a/sdk/python/ragflow/modules/document.py
+++ b/sdk/python/ragflow/modules/document.py
@ -29,18 +29,14 @@ class Document(Base):
                res_dict.pop(k)
        super().__init__(rag, res_dict)

-    def save(self) -> bool:
+    def update(self,update_message:dict) -> bool:
        """
        Save the document details to the server.
        """
-        res = self.post('/doc/save',
-                        {"id": self.id, "name": self.name, "thumbnail": self.thumbnail, "knowledgebase_id": self.knowledgebase_id,
-                         "parser_method": self.parser_method, "parser_config": self.parser_config.to_json(),
-                         })
+        res = self.post(f'/dataset/{self.knowledgebase_id}/info/{self.id}',update_message)
        res = res.json()
-        if res.get("retmsg") == "success":
-            return True
-        raise Exception(res["retmsg"])
+        if res.get("code") != 0:
+            raise Exception(res["message"])

    def delete(self) -> bool:
        """
@ -60,8 +56,7 @@ class Document(Base):
        :return: The downloaded document content in bytes.
        """
        # Construct the URL for the API request using the document ID and knowledge base ID
-        res = self.get(f"/doc/{self.id}",
-                       {"headers": self.rag.authorization_header, "id": self.id, "name": self.name, "stream": True})
+        res = self.get(f"/dataset/{self.knowledgebase_id}/document/{self.id}")

        # Check the response status code to ensure the request was successful
        if res.status_code == 200:
--- a/sdk/python/ragflow/ragflow.py
+++ b/sdk/python/ragflow/ragflow.py
@ -32,12 +32,12 @@ class RAGFlow:
        self.api_url = f"{base_url}/api/{version}"
        self.authorization_header = {"Authorization": "{} {}".format("Bearer", self.user_key)}

-    def post(self, path, json, stream=False):
-        res = requests.post(url=self.api_url + path, json=json, headers=self.authorization_header, stream=stream)
+    def post(self, path, json=None, stream=False, files=None):
+        res = requests.post(url=self.api_url + path, json=json, headers=self.authorization_header, stream=stream,files=files)
        return res

-    def get(self, path, params=None):
-        res = requests.get(url=self.api_url + path, params=params, headers=self.authorization_header)
+    def get(self, path, params=None, json=None):
+        res = requests.get(url=self.api_url + path, params=params, headers=self.authorization_header,json=json)
        return res

    def delete(self, path, json):
@ -151,31 +151,7 @@ class RAGFlow:
            return result_list
        raise Exception(res["message"])

-    def create_document(self, ds: DataSet, name: str, blob: bytes) -> bool:
-        url = f"/doc/dataset/{ds.id}/documents/upload"
-        files = {
-            'file': (name, blob)
-        }
-        headers = {
-            'Authorization': f"Bearer {ds.rag.user_key}"
-        }

-        response = requests.post(self.api_url + url, files=files,
-                                 headers=headers)
-
-        if response.status_code == 200 and response.json().get('retmsg') == 'success':
-            return True
-        else:
-            raise Exception(f"Upload failed: {response.json().get('retmsg')}")
-
-        return False
-
-    def get_document(self, id: str = None, name: str = None) -> Document:
-        res = self.get("/doc/infos", {"id": id, "name": name})
-        res = res.json()
-        if res.get("retmsg") == "success":
-            return Document(self, res['data'])
-        raise Exception(res["retmsg"])

    def async_parse_documents(self, doc_ids):
        """
--- a/sdk/python/test/t_document.py
+++ b/sdk/python/test/t_document.py
@ -21,22 +21,16 @@ class TestDocument(TestSdk):

        # Step 2: Create a new document
        # The blob is the actual file content or a placeholder in this case
-        name = "TestDocument.txt"
        blob = b"Sample document content for ingestion test."
-
-        res = rag.create_document(ds, name=name, blob=blob)
-
+        blob_2 =  b"test_2."
+        list_1 = []
+        list_1.append({"name":"Test_1.txt",
+                       "blob":blob})
+        list_1.append({"name":"Test_2.txt",
+                       "blob":blob_2})
+        res = ds.upload_documents(list_1)
        # Ensure document ingestion was successful
-        assert res is True, f"Failed to create document, error: {res}"
-
-    def test_get_detail_document_with_success(self):
-        """
-        Test getting a document's detail with success
-        """
-        rag = RAGFlow(API_KEY, HOST_ADDRESS)
-        doc = rag.get_document(name="TestDocument.txt")
-        assert isinstance(doc, Document), f"Failed to get dataset, error: {doc}."
-        assert doc.name == "TestDocument.txt", "Name does not match"
+        assert res is None, f"Failed to create document, error: {res}"

    def test_update_document_with_success(self):
        """
@ -44,12 +38,13 @@ class TestDocument(TestSdk):
        Update name or parser_method are supported
        """
        rag = RAGFlow(API_KEY, HOST_ADDRESS)
-        doc = rag.get_document(name="TestDocument.txt")
+        ds = rag.list_datasets(name="God")
+        ds = ds[0]
+        doc = ds.list_documents()
+        doc = doc[0]
        if isinstance(doc, Document):
-            doc.parser_method = "manual"
-            doc.name = "manual.txt"
-            res = doc.save()
-            assert res is True, f"Failed to update document, error: {res}"
+            res = doc.update({"parser_method":"manual","name":"manual.txt"})
+            assert res is None, f"Failed to update document, error: {res}"
        else:
            assert False, f"Failed to get document, error: {doc}"

@ -61,8 +56,10 @@ class TestDocument(TestSdk):
        rag = RAGFlow(API_KEY, HOST_ADDRESS)

        # Retrieve a document
-        doc = rag.get_document(name="manual.txt")
-
+        ds = rag.list_datasets(name="God")
+        ds = ds[0]
+        doc = ds.list_documents(name="manual.txt")
+        doc = doc[0]
        # Check if the retrieved document is of type Document
        if isinstance(doc, Document):
            # Download the document content and save it to a file
@ -81,7 +78,7 @@ class TestDocument(TestSdk):
            # If the document retrieval fails, assert failure
            assert False, f"Failed to get document, error: {doc}"

-    def test_list_all_documents_in_dataset_with_success(self):
+    def test_list_documents_in_dataset_with_success(self):
        """
        Test list all documents into a dataset with success.
        """
@ -101,12 +98,10 @@ class TestDocument(TestSdk):
        blob1 = b"Sample document content for ingestion test111."
        name2 = "Test Document222.txt"
        blob2 = b"Sample document content for ingestion test222."
-
-        rag.create_document(ds, name=name1, blob=blob1)
-        rag.create_document(ds, name=name2, blob=blob2)
+        list_1 = [{"name":name1,"blob":blob1},{"name":name2,"blob":blob2}]
+        ds.upload_documents(list_1)
        for d in ds.list_docs(keywords="test", offset=0, limit=12):
-            assert isinstance(d, Document)
-            print(d)
+            assert isinstance(d, Document), "Failed to upload documents"

    def test_delete_documents_in_dataset_with_success(self):
        """