TEST: Added test cases for Upload Documents HTTP API (#5991)

### What problem does this PR solve?

Covers the document upload endpoints of the HTTP API (see the sketch below).

### Type of change

- [x] Add test cases
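
For context, the endpoint these tests exercise is `POST /api/v1/datasets/{dataset_id}/documents`, which accepts one or more multipart form fields named `file`. A minimal sketch of a direct call (the host, API key, and dataset ID below are placeholders; the Bearer scheme is what `RAGFlowHttpApiAuth` is assumed to apply):

```python
import requests

HOST_ADDRESS = "http://127.0.0.1:9380"  # same default the test suite uses
API_KEY = "YOUR_API_KEY"        # placeholder; issue a real key from your RAGFlow instance
DATASET_ID = "YOUR_DATASET_ID"  # placeholder; returned when creating a dataset

with open("ragflow_test.txt", "rb") as f:
    res = requests.post(
        f"{HOST_ADDRESS}/api/v1/datasets/{DATASET_ID}/documents",
        headers={"Authorization": f"Bearer {API_KEY}"},
        files={"file": ("ragflow_test.txt", f)},  # multipart field named "file", as in the tests
    )
print(res.json())  # {"code": 0, "data": [{"name": "ragflow_test.txt", ...}]} on success
```
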
liu an
2025-03-12 19:38:52 +08:00
committed by GitHub
parent 7cd37c37cd
commit bd5eb47441
12 changed files with 782 additions and 123 deletions


@@ -0,0 +1,101 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
from pathlib import Path

import requests
from requests_toolbelt import MultipartEncoder

HEADERS = {"Content-Type": "application/json"}
HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9380")
DATASETS_API_URL = "/api/v1/datasets"
FILE_API_URL = "/api/v1/datasets/{dataset_id}/documents"
INVALID_API_TOKEN = "invalid_key_123"
DATASET_NAME_LIMIT = 128
DOCUMENT_NAME_LIMIT = 128


# DATASET MANAGEMENT
def create_dataset(auth, payload):
    res = requests.post(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
        headers=HEADERS,
        auth=auth,
        json=payload,
    )
    return res.json()


def list_dataset(auth, params=None):
    res = requests.get(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
        headers=HEADERS,
        auth=auth,
        params=params,
    )
    return res.json()


def update_dataset(auth, dataset_id, payload):
    res = requests.put(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}/{dataset_id}",
        headers=HEADERS,
        auth=auth,
        json=payload,
    )
    return res.json()


def delete_dataset(auth, payload=None):
    res = requests.delete(
        url=f"{HOST_ADDRESS}{DATASETS_API_URL}",
        headers=HEADERS,
        auth=auth,
        json=payload,
    )
    return res.json()


def create_datasets(auth, num):
    ids = []
    for i in range(num):
        res = create_dataset(auth, {"name": f"dataset_{i}"})
        ids.append(res["data"]["id"])
    return ids


# FILE MANAGEMENT WITHIN DATASET
def upload_documents(auth, dataset_id, files_path=None):
    # FILE_API_URL keeps a literal "{dataset_id}" placeholder: the f-string only
    # expands HOST_ADDRESS, then .format() fills in the dataset ID.
    url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=dataset_id)
    if files_path is None:
        files_path = []

    # Each file becomes a repeated multipart "file" field; MultipartEncoder
    # streams the bodies instead of buffering them in memory.
    fields = []
    for fp in files_path:
        p = Path(fp)
        fields.append(("file", (p.name, p.open("rb"))))
    m = MultipartEncoder(fields=fields)

    res = requests.post(
        url=url,
        headers={"Content-Type": m.content_type},
        auth=auth,
        data=m,
    )
    return res.json()


@@ -0,0 +1,73 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
from common import delete_dataset
from libs.utils.file_utils import (
    create_docx_file,
    create_eml_file,
    create_excel_file,
    create_html_file,
    create_image_file,
    create_json_file,
    create_md_file,
    create_pdf_file,
    create_ppt_file,
    create_txt_file,
)


@pytest.fixture(scope="function", autouse=True)
def clear_datasets(get_http_api_auth):
    yield
    delete_dataset(get_http_api_auth)


@pytest.fixture
def generate_test_files(tmp_path):
    files = {}
    files["docx"] = tmp_path / "ragflow_test.docx"
    create_docx_file(files["docx"])
    files["excel"] = tmp_path / "ragflow_test.xlsx"
    create_excel_file(files["excel"])
    files["ppt"] = tmp_path / "ragflow_test.pptx"
    create_ppt_file(files["ppt"])
    files["image"] = tmp_path / "ragflow_test.png"
    create_image_file(files["image"])
    files["pdf"] = tmp_path / "ragflow_test.pdf"
    create_pdf_file(files["pdf"])
    files["txt"] = tmp_path / "ragflow_test.txt"
    create_txt_file(files["txt"])
    files["md"] = tmp_path / "ragflow_test.md"
    create_md_file(files["md"])
    files["json"] = tmp_path / "ragflow_test.json"
    create_json_file(files["json"])
    files["eml"] = tmp_path / "ragflow_test.eml"
    create_eml_file(files["eml"])
    files["html"] = tmp_path / "ragflow_test.html"
    create_html_file(files["html"])
    return files


@@ -1,57 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os

import requests

HOST_ADDRESS = os.getenv("HOST_ADDRESS", "http://127.0.0.1:9380")
API_URL = f"{HOST_ADDRESS}/api/v1/datasets"
HEADERS = {"Content-Type": "application/json"}
INVALID_API_TOKEN = "invalid_key_123"
DATASET_NAME_LIMIT = 128


def create_dataset(auth, payload):
    res = requests.post(url=API_URL, headers=HEADERS, auth=auth, json=payload)
    return res.json()


def list_dataset(auth, params=None):
    res = requests.get(url=API_URL, headers=HEADERS, auth=auth, params=params)
    return res.json()


def update_dataset(auth, dataset_id, payload):
    res = requests.put(
        url=f"{API_URL}/{dataset_id}", headers=HEADERS, auth=auth, json=payload
    )
    return res.json()


def delete_dataset(auth, payload=None):
    res = requests.delete(url=API_URL, headers=HEADERS, auth=auth, json=payload)
    return res.json()


def create_datasets(auth, num):
    ids = []
    for i in range(num):
        res = create_dataset(auth, {"name": f"dataset_{i}"})
        ids.append(res["data"]["id"])
    return ids


@@ -1,24 +0,0 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
from common import delete_dataset


@pytest.fixture(scope="function", autouse=True)
def clear_datasets(get_http_api_auth):
    yield
    delete_dataset(get_http_api_auth)


@@ -13,12 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import base64
-from pathlib import Path
 import pytest
 from common import DATASET_NAME_LIMIT, INVALID_API_TOKEN, create_dataset
 from libs.auth import RAGFlowHttpApiAuth
+from libs.utils import encode_avatar
+from libs.utils.file_utils import create_image_file
 class TestAuthorization:
@@ -75,18 +75,11 @@ class TestDatasetCreation:
 class TestAdvancedConfigurations:
-    def test_avatar(self, get_http_api_auth, request):
-        def encode_avatar(image_path):
-            with Path.open(image_path, "rb") as file:
-                binary_data = file.read()
-            base64_encoded = base64.b64encode(binary_data).decode("utf-8")
-            return base64_encoded
+    def test_avatar(self, get_http_api_auth, tmp_path):
+        fn = create_image_file(tmp_path / "ragflow_test.png")
         payload = {
             "name": "avatar_test",
-            "avatar": encode_avatar(
-                Path(request.config.rootdir) / "test/data/logo.svg"
-            ),
+            "avatar": encode_avatar(fn),
         }
         res = create_dataset(get_http_api_auth, payload)
         assert res["code"] == 0


@@ -13,9 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import base64
 from concurrent.futures import ThreadPoolExecutor
-from pathlib import Path
 import pytest
 from common import (
@@ -26,6 +24,8 @@ from common import (
     update_dataset,
 )
 from libs.auth import RAGFlowHttpApiAuth
+from libs.utils import encode_avatar
+from libs.utils.file_utils import create_image_file
 # TODO: Missing scenario for updating embedding_model with chunk_count != 0
@@ -171,19 +171,10 @@ class TestDatasetUpdate:
         else:
             assert res["message"] == expected_message
-    def test_avatar(self, get_http_api_auth, request):
-        def encode_avatar(image_path):
-            with Path.open(image_path, "rb") as file:
-                binary_data = file.read()
-            base64_encoded = base64.b64encode(binary_data).decode("utf-8")
-            return base64_encoded
+    def test_avatar(self, get_http_api_auth, tmp_path):
         ids = create_datasets(get_http_api_auth, 1)
-        payload = {
-            "avatar": encode_avatar(
-                Path(request.config.rootdir) / "test/data/logo.svg"
-            ),
-        }
+        fn = create_image_file(tmp_path / "ragflow_test.png")
+        payload = {"avatar": encode_avatar(fn)}
         res = update_dataset(get_http_api_auth, ids[0], payload)
         assert res["code"] == 0


@@ -0,0 +1,230 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import string
from concurrent.futures import ThreadPoolExecutor

import pytest
import requests
from common import (
    DOCUMENT_NAME_LIMIT,
    FILE_API_URL,
    HOST_ADDRESS,
    INVALID_API_TOKEN,
    create_datasets,
    list_dataset,
    upload_documents,
)
from libs.auth import RAGFlowHttpApiAuth
from libs.utils.file_utils import create_txt_file
from requests_toolbelt import MultipartEncoder


class TestAuthorization:
    @pytest.mark.parametrize(
        "auth, expected_code, expected_message",
        [
            (None, 0, "`Authorization` can't be empty"),
            (
                RAGFlowHttpApiAuth(INVALID_API_TOKEN),
                109,
                "Authentication error: API key is invalid!",
            ),
        ],
    )
    def test_invalid_auth(
        self, get_http_api_auth, auth, expected_code, expected_message
    ):
        ids = create_datasets(get_http_api_auth, 1)
        res = upload_documents(auth, ids[0])
        assert res["code"] == expected_code
        assert res["message"] == expected_message


class TestUploadDocuments:
    def test_valid_single_upload(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert res["data"][0]["dataset_id"] == ids[0]
        assert res["data"][0]["name"] == fp.name

    @pytest.mark.parametrize(
        "file_type",
        [
            "docx",
            "excel",
            "ppt",
            "image",
            "pdf",
            "txt",
            "md",
            "json",
            "eml",
            "html",
        ],
    )
    def test_file_type_validation(
        self, get_http_api_auth, generate_test_files, file_type
    ):
        ids = create_datasets(get_http_api_auth, 1)
        fp = generate_test_files[file_type]
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert res["data"][0]["dataset_id"] == ids[0]
        assert res["data"][0]["name"] == fp.name

    @pytest.mark.parametrize(
        "file_type",
        ["exe", "unknown"],
    )
    def test_unsupported_file_type(self, get_http_api_auth, tmp_path, file_type):
        ids = create_datasets(get_http_api_auth, 1)
        fp = tmp_path / f"ragflow_test.{file_type}"
        fp.touch()
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 500
        assert (
            res["message"]
            == f"ragflow_test.{file_type}: This type of file has not been supported yet!"
        )

    def test_missing_file(self, get_http_api_auth):
        ids = create_datasets(get_http_api_auth, 1)
        res = upload_documents(get_http_api_auth, ids[0])
        assert res["code"] == 101
        assert res["message"] == "No file part!"

    def test_empty_file(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = tmp_path / "empty.txt"
        fp.touch()
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert res["data"][0]["size"] == 0

    def test_filename_empty(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        url = f"{HOST_ADDRESS}{FILE_API_URL}".format(dataset_id=ids[0])
        fields = (("file", ("", fp.open("rb"))),)
        m = MultipartEncoder(fields=fields)
        res = requests.post(
            url=url,
            headers={"Content-Type": m.content_type},
            auth=get_http_api_auth,
            data=m,
        )
        assert res.json()["code"] == 101
        assert res.json()["message"] == "No file selected!"

    def test_filename_exceeds_max_length(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        # 125 'a' characters + ".txt" -> 129 characters, one over DOCUMENT_NAME_LIMIT (128)
        fp = create_txt_file(tmp_path / f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt")
        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 500
        assert (
            res["message"]
            == f"{'a' * (DOCUMENT_NAME_LIMIT - 3)}.txt: Exceed the maximum length of file name!"
        )

    def test_invalid_dataset_id(self, get_http_api_auth, tmp_path):
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        res = upload_documents(get_http_api_auth, "invalid_dataset_id", [fp])
        assert res["code"] == 100
        assert (
            res["message"]
            == """LookupError("Can't find the dataset with ID invalid_dataset_id!")"""
        )

    def test_duplicate_files(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        res = upload_documents(get_http_api_auth, ids[0], [fp, fp])
        assert res["code"] == 0
        assert len(res["data"]) == 2
        for i in range(len(res["data"])):
            assert res["data"][i]["dataset_id"] == ids[0]
            expected_name = fp.name
            if i != 0:
                expected_name = f"{fp.stem}({i}){fp.suffix}"
            assert res["data"][i]["name"] == expected_name

    def test_same_file_repeat(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        fp = create_txt_file(tmp_path / "ragflow_test.txt")
        for i in range(10):
            res = upload_documents(get_http_api_auth, ids[0], [fp])
            assert res["code"] == 0
            assert len(res["data"]) == 1
            assert res["data"][0]["dataset_id"] == ids[0]
            expected_name = fp.name
            if i != 0:
                expected_name = f"{fp.stem}({i}){fp.suffix}"
            assert res["data"][0]["name"] == expected_name

    def test_filename_special_characters(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        illegal_chars = '<>:"/\\|?*'
        translation_table = str.maketrans({char: "_" for char in illegal_chars})
        safe_filename = string.punctuation.translate(translation_table)
        fp = tmp_path / f"{safe_filename}.txt"
        fp.write_text("Sample text content")

        res = upload_documents(get_http_api_auth, ids[0], [fp])
        assert res["code"] == 0
        assert len(res["data"]) == 1
        assert res["data"][0]["dataset_id"] == ids[0]
        assert res["data"][0]["name"] == fp.name

    def test_multiple_files(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        expected_document_count = 20
        fps = []
        for i in range(expected_document_count):
            fp = create_txt_file(tmp_path / f"ragflow_test_{i}.txt")
            fps.append(fp)
        res = upload_documents(get_http_api_auth, ids[0], fps)
        assert res["code"] == 0

        res = list_dataset(get_http_api_auth, {"id": ids[0]})
        assert res["data"][0]["document_count"] == expected_document_count

    @pytest.mark.xfail
    def test_concurrent_upload(self, get_http_api_auth, tmp_path):
        ids = create_datasets(get_http_api_auth, 1)
        expected_document_count = 20
        fps = []
        for i in range(expected_document_count):
            fp = create_txt_file(tmp_path / f"ragflow_test_{i}.txt")
            fps.append(fp)

        with ThreadPoolExecutor(max_workers=5) as executor:
            futures = [
                executor.submit(
                    upload_documents, get_http_api_auth, ids[0], fps[i : i + 1]
                )
                for i in range(expected_document_count)
            ]
        responses = [f.result() for f in futures]
        assert all(r["code"] == 0 for r in responses)

        res = list_dataset(get_http_api_auth, {"id": ids[0]})
        assert res["data"][0]["document_count"] == expected_document_count