update sdk document and chunk (#2421)

### What problem does this PR solve?

Extends the Python SDK test suite with coverage for document parsing: single and bulk
asynchronous parse/cancel, and chunk management (listing, adding, and deleting chunks).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

---------

Co-authored-by: Kevin Hu <kevinhu.sh@gmail.com>
This commit is contained in:
JobSmithManipulation
2024-09-14 13:24:21 +08:00
committed by GitHub
parent 7e75b9d778
commit 99a7c0fb97
12 changed files with 757 additions and 34 deletions

View File

@ -1,4 +1,4 @@
from ragflow import RAGFlow, DataSet, Document
from ragflow import RAGFlow, DataSet, Document, Chunk
from common import API_KEY, HOST_ADDRESS
from test_sdkbase import TestSdk
@ -46,6 +46,7 @@ class TestDocument(TestSdk):
doc = rag.get_document(name="TestDocument.txt")
if isinstance(doc, Document):
doc.parser_method = "manual"
doc.name = "manual.txt"
res = doc.save()
assert res is True, f"Failed to update document, error: {res}"
else:
@ -126,8 +127,8 @@ class TestDocument(TestSdk):
blob1 = b"Sample document content for ingestion test333."
name2 = "Test Document444.txt"
blob2 = b"Sample document content for ingestion test444."
name3='test.txt'
path='test_data/test.txt'
name3 = 'test.txt'
path = 'test_data/test.txt'
rag.create_document(ds, name=name3, blob=open(path, "rb").read())
rag.create_document(ds, name=name1, blob=blob1)
rag.create_document(ds, name=name2, blob=blob2)
@ -138,7 +139,131 @@ class TestDocument(TestSdk):
remaining_docs = ds.list_docs(keywords="rag", offset=0, limit=12)
assert len(remaining_docs) == 0, "Documents were not properly deleted."
def test_parse_and_cancel_document(self):
    """Parse a single PDF asynchronously, check progress updates, then cancel parsing."""
    # Initialize RAGFlow with API key and host address
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    # Create a dataset with a specific name
    ds = rag.create_dataset(name="God4")
    # Define the document name and path
    name3 = 'ai.pdf'
    path = 'test_data/ai.pdf'
    # Use a context manager so the file handle is closed deterministically
    # (the original `open(path, "rb").read()` leaked the handle).
    with open(path, "rb") as f:
        rag.create_document(ds, name=name3, blob=f.read())
    # Retrieve the document by name
    doc = rag.get_document(name="ai.pdf")
    # Initiate asynchronous parsing
    doc.async_parse()
    print("Async parsing initiated")
    # join() yields (progress, message) tuples until parsing finishes or the timeout hits
    for progress, msg in doc.join(interval=5, timeout=10):
        print(progress, msg)
        # Progress must stay within the valid percentage range
        assert 0 <= progress <= 100, f"Invalid progress: {progress}"
        # Every progress update should carry a non-empty status message
        assert msg, "Message should not be empty"
    # Cancel the in-flight parsing operation
    doc.cancel()
    print("Parsing cancelled successfully")
def test_bulk_parse_and_cancel_documents(self):
    """Create several documents, parse them all asynchronously, then cancel in bulk."""
    # Initialize RAGFlow with API key and host address
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    # Create a dataset
    ds = rag.create_dataset(name="God5")
    assert ds is not None, "Dataset creation failed"
    assert ds.name == "God5", "Dataset name does not match"
    # Prepare a list of file names and paths
    documents = [
        {'name': 'ai1.pdf', 'path': 'test_data/ai1.pdf'},
        {'name': 'ai2.pdf', 'path': 'test_data/ai2.pdf'},
        {'name': 'ai3.pdf', 'path': 'test_data/ai3.pdf'},
    ]
    # Create documents in bulk
    for doc_info in documents:
        with open(doc_info['path'], "rb") as file:
            created_doc = rag.create_document(ds, name=doc_info['name'], blob=file.read())
        assert created_doc is not None, f"Failed to create document {doc_info['name']}"
    # Retrieve document objects in bulk
    docs = [rag.get_document(name=doc_info['name']) for doc_info in documents]
    ids = [doc.id for doc in docs]
    assert len(docs) == len(documents), "Mismatch between created documents and fetched documents"
    # Initiate asynchronous parsing for all documents
    rag.async_parse_documents(ids)
    print("Async bulk parsing initiated")
    # Wait for each document's parsing to progress and validate the updates
    for doc in docs:
        for progress, msg in doc.join(interval=5, timeout=10):
            print(f"{doc.name}: Progress: {progress}, Message: {msg}")
            # Progress must stay within the valid percentage range
            assert 0 <= progress <= 100, f"Invalid progress: {progress} for document {doc.name}"
            # Every progress update should carry a non-empty status message
            assert msg, f"Message should not be empty for document {doc.name}"
            # At 100% the status message should indicate completion
            if progress == 100:
                assert "completed" in msg.lower(), f"Document {doc.name} did not complete successfully"
    # Cancel parsing for all documents in bulk.
    # NOTE: `x is None or isinstance(x, type(None))` was a tautology — the two
    # clauses are identical; a plain `is None` expresses the intended check.
    cancel_result = rag.async_cancel_parse_documents(ids)
    assert cancel_result is None, "Failed to cancel document parsing"
    print("Async bulk parsing cancelled")
def test_parse_document_and_chunk_list(self):
    """Parse a text document and verify keyword-filtered chunk listing works."""
    rag = RAGFlow(API_KEY, HOST_ADDRESS)
    ds = rag.create_dataset(name="God7")
    name = 'story.txt'
    path = 'test_data/story.txt'
    # Close the file handle deterministically instead of leaking it
    with open(path, "rb") as f:
        rag.create_document(ds, name=name, blob=f.read())
    doc = rag.get_document(name=name)
    doc.async_parse()
    # Wait for parsing to complete and get progress updates using join
    for progress, msg in doc.join(interval=5, timeout=30):
        print(progress, msg)
        # Assert that progress is within 0 to 100
        assert 0 <= progress <= 100, f"Invalid progress: {progress}"
        # Assert that the message is not empty
        assert msg, "Message should not be empty"
    # Every chunk matching the keyword must actually contain it.
    # BUG FIX: the failure message referenced `c.content`, which is not the
    # attribute the check reads — on failure it would raise AttributeError
    # instead of showing the message. Use the same key as the check.
    for c in doc.list_chunks(keywords="rag", offset=0, limit=12):
        print(c)
        assert c is not None, "Chunk is None"
        assert "rag" in c['content_with_weight'].lower(), \
            f"Keyword 'rag' not found in chunk content: {c['content_with_weight']}"
def test_add_chunk_to_chunk_list(self):
    """Verify that adding a chunk to an existing document yields a Chunk instance."""
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    target_doc = client.get_document(name='story.txt')
    new_chunk = target_doc.add_chunk(content="assss")
    # A successful add returns a real Chunk object, never None
    assert new_chunk is not None, "Chunk is None"
    assert isinstance(new_chunk, Chunk), "Chunk was not added to chunk list"
def test_delete_chunk_of_chunk_list(self):
    """Add a chunk, delete it, and confirm the document's chunk count drops by one."""
    client = RAGFlow(API_KEY, HOST_ADDRESS)
    target_doc = client.get_document(name='story.txt')
    new_chunk = target_doc.add_chunk(content="assss")
    # The chunk must exist before we can meaningfully test deletion
    assert new_chunk is not None, "Chunk is None"
    assert isinstance(new_chunk, Chunk), "Chunk was not added to chunk list"
    # Snapshot the count, delete, and compare.
    # NOTE(review): this assumes `target_doc.chunk_num` reflects the delete
    # without re-fetching the document — confirm against the SDK's behavior.
    count_before_delete = target_doc.chunk_num
    new_chunk.delete()
    assert target_doc.chunk_num == count_before_delete - 1, "Chunk was not deleted"