TEST: Added test cases for Create Dataset HTTP API (#5724)

### What problem does this PR solve?

1. add test cases
2. integrate workflows/tests.yml into CI pipeline

### Type of change

- [x] Add test cases
This commit is contained in:
刘安
2025-03-06 20:22:17 +08:00
committed by GitHub
parent 27153dde85
commit 4f9504305a
8 changed files with 414 additions and 1 deletions

View File

@ -0,0 +1,47 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import os
import requests
# Base address of the RAGFlow server under test; overridable for CI environments.
HOST_ADDRESS = os.getenv('HOST_ADDRESS', 'http://127.0.0.1:9380')
# Datasets collection endpoint of the HTTP API v1.
API_URL = f'{HOST_ADDRESS}/api/v1/datasets'
HEADERS = {"Content-Type": "application/json"}
# A token that the server should reject with an authentication error.
INVALID_API_TOKEN = "invalid_key_123"
# Maximum allowed length of a dataset name (server-side limit).
DATASET_NAME_LIMIT = 128
def create_dataset(auth, payload):
    """POST *payload* to the datasets endpoint and return the decoded JSON reply."""
    response = requests.post(API_URL, headers=HEADERS, auth=auth, json=payload)
    return response.json()
def list_dataset(auth, params):
    """GET the datasets endpoint with query *params*; return the decoded JSON reply."""
    response = requests.get(API_URL, headers=HEADERS, auth=auth, params=params)
    return response.json()
def update_dataset(auth, dataset_id, payload):
    """PUT *payload* to a single dataset's endpoint; return the decoded JSON reply."""
    endpoint = f"{API_URL}/{dataset_id}"
    response = requests.put(endpoint, headers=HEADERS, auth=auth, json=payload)
    return response.json()
def delete_dataset(auth, payload=None):
    """DELETE against the datasets endpoint; return the decoded JSON reply.

    With the default ``payload=None`` the server deletes all datasets owned
    by the authenticated user.
    """
    response = requests.delete(API_URL, headers=HEADERS, auth=auth, json=payload)
    return response.json()

View File

@ -0,0 +1,25 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
from common import delete_dataset
# Teardown-only fixture: autouse means it wraps every test function; the bare
# `yield` runs the test first, then all datasets created by the test are
# removed so cases stay independent of one another.
@pytest.fixture(scope="function", autouse=True)
def clear_datasets(get_http_api_auth):
    yield
    delete_dataset(get_http_api_auth)

View File

@ -0,0 +1,256 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import pytest
import base64
from pathlib import Path
from common import create_dataset, INVALID_API_TOKEN, DATASET_NAME_LIMIT
from libs.auth import RAGFlowHttpApiAuth
class TestAuthorization:
    """Verifies that dataset creation is rejected for bad credentials."""

    def test_invalid_auth(self):
        # An unknown API key must yield the authentication error code (109).
        bad_auth = RAGFlowHttpApiAuth(INVALID_API_TOKEN)
        res = create_dataset(bad_auth, {"name": "auth_test"})
        assert res["code"] == 109
        assert res["message"] == 'Authentication error: API key is invalid!'
class TestDatasetCreation:
    """Core create-dataset scenarios: name validation and duplicate handling."""

    @pytest.mark.parametrize("payload, expected_code", [
        ({"name": "valid_name"}, 0),
        ({"name": "a" * (DATASET_NAME_LIMIT + 1)}, 102),
        ({"name": 0}, 100),
        ({"name": ""}, 102),
        ({"name": "duplicated_name"}, 102),
        ({"name": "case_insensitive"}, 102),
    ])
    def test_basic_scenarios(self, get_http_api_auth, payload, expected_code):
        name = payload["name"]
        # Seed a conflicting dataset first for the duplicate-name scenarios;
        # the case-insensitive variant seeds the upper-cased name.
        if name == "duplicated_name":
            create_dataset(get_http_api_auth, payload)
        elif name == "case_insensitive":
            create_dataset(get_http_api_auth, {"name": name.upper()})

        res = create_dataset(get_http_api_auth, payload)

        assert res["code"] == expected_code
        if expected_code == 0:
            assert res["data"]["name"] == name
        if name in ("duplicated_name", "case_insensitive"):
            assert res["message"] == "Duplicated dataset name in creating dataset."

    @pytest.mark.slow
    def test_dataset_10k(self, get_http_api_auth):
        # Stress scenario: the server must accept a large number of datasets.
        for i in range(10000):
            res = create_dataset(get_http_api_auth, {"name": f"dataset_{i}"})
            assert res["code"] == 0, f"Failed to create dataset {i}"
class TestAdvancedConfigurations:
    """Create-dataset scenarios for optional fields: avatar, description,
    permission, chunk_method, embedding_model and parser_config.

    Fixes over the original revision:
    - the non-str delimiter case sent the misspelled key ``delimiterl``, so it
      never exercised the ``delimiter`` field it was meant to test;
    - the ``knowledge_graph`` chunk-method test id was garbled
      (``picknowledge_graphture``);
    - ``upercase`` test ids corrected to ``uppercase``;
    - removed commented-out debug print.
    """

    def test_avatar(self, get_http_api_auth, request):
        # Avatars are uploaded as a base64-encoded string of the image bytes.
        def encode_avatar(image_path):
            with Path.open(image_path, "rb") as file:
                binary_data = file.read()
            return base64.b64encode(binary_data).decode("utf-8")

        payload = {
            "name": "avatar_test",
            "avatar": encode_avatar(Path(request.config.rootdir) / 'test/data/logo.svg')
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == 0

    def test_description(self, get_http_api_auth):
        # A very long (64 KiB) description must be accepted.
        payload = {
            "name": "description_test",
            "description": "a" * 65536
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == 0

    @pytest.mark.parametrize("name, permission, expected_code", [
        ("me", "me", 0),
        ("team", "team", 0),
        pytest.param("empty_permission", "", 0,
                     marks=pytest.mark.xfail(reason='issue#5709')),
        ("me_uppercase", "ME", 102),
        ("team_uppercase", "TEAM", 102),
        ("other_permission", "other_permission", 102)
    ])
    def test_permission(self, get_http_api_auth, name, permission, expected_code):
        payload = {
            "name": name,
            "permission": permission
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        if expected_code == 0 and permission != "":
            assert res["data"]["permission"] == permission
        # An empty permission should fall back to the server default "me".
        if permission == "":
            assert res["data"]["permission"] == "me"

    @pytest.mark.parametrize("name, chunk_method, expected_code", [
        ("naive", "naive", 0),
        ("manual", "manual", 0),
        ("qa", "qa", 0),
        ("table", "table", 0),
        ("paper", "paper", 0),
        ("book", "book", 0),
        ("laws", "laws", 0),
        ("presentation", "presentation", 0),
        ("picture", "picture", 0),
        ("one", "one", 0),
        ("knowledge_graph", "knowledge_graph", 0),
        ("email", "email", 0),
        ("tag", "tag", 0),
        pytest.param("empty_chunk_method", "", 0,
                     marks=pytest.mark.xfail(reason='issue#5709')),
        ("other_chunk_method", "other_chunk_method", 102)
    ])
    def test_chunk_method(self, get_http_api_auth, name, chunk_method, expected_code):
        payload = {
            "name": name,
            "chunk_method": chunk_method
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        if expected_code == 0 and chunk_method != "":
            assert res["data"]["chunk_method"] == chunk_method
        # An empty chunk_method should fall back to the server default "naive".
        if chunk_method == "":
            assert res["data"]["chunk_method"] == "naive"

    @pytest.mark.parametrize("name, embedding_model, expected_code", [
        ("BAAI/bge-large-zh-v1.5",
         "BAAI/bge-large-zh-v1.5", 0),
        ("BAAI/bge-base-en-v1.5",
         "BAAI/bge-base-en-v1.5", 0),
        ("BAAI/bge-large-en-v1.5",
         "BAAI/bge-large-en-v1.5", 0),
        ("BAAI/bge-small-en-v1.5",
         "BAAI/bge-small-en-v1.5", 0),
        ("BAAI/bge-small-zh-v1.5",
         "BAAI/bge-small-zh-v1.5", 0),
        ("jinaai/jina-embeddings-v2-base-en",
         "jinaai/jina-embeddings-v2-base-en", 0),
        ("jinaai/jina-embeddings-v2-small-en",
         "jinaai/jina-embeddings-v2-small-en", 0),
        ("nomic-ai/nomic-embed-text-v1.5",
         "nomic-ai/nomic-embed-text-v1.5", 0),
        ("sentence-transformers/all-MiniLM-L6-v2",
         "sentence-transformers/all-MiniLM-L6-v2", 0),
        ("text-embedding-v2",
         "text-embedding-v2", 0),
        ("text-embedding-v3",
         "text-embedding-v3", 0),
        ("maidalun1020/bce-embedding-base_v1",
         "maidalun1020/bce-embedding-base_v1", 0),
        ("other_embedding_model",
         "other_embedding_model", 102)
    ])
    def test_embedding_model(self, get_http_api_auth, name, embedding_model, expected_code):
        payload = {
            "name": name,
            "embedding_model": embedding_model
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        if expected_code == 0:
            assert res["data"]["embedding_model"] == embedding_model

    @pytest.mark.parametrize("name, chunk_method, parser_config, expected_code", [
        ("naive_default", "naive",
         {"chunk_token_count": 128,
          "layout_recognize": "DeepDOC",
          "html4excel": False,
          "delimiter": "\n!?。;!?",
          "task_page_size": 12,
          "raptor": {"use_raptor": False}
          },
         0),
        ("naive_empty", "naive", {}, 0),
        pytest.param("naive_chunk_token_count_negative", "naive",
                     {"chunk_token_count": -1},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_chunk_token_count_zero", "naive",
                     {"chunk_token_count": 0},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_chunk_token_count_float", "naive",
                     {"chunk_token_count": 3.14},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_chunk_token_count_max", "naive",
                     {"chunk_token_count": 1024*1024*1024},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_chunk_token_count_str", "naive",
                     {"chunk_token_count": '1024'},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        ("naive_layout_recognize_DeepDOC", "naive",
         {"layout_recognize": "DeepDOC"}, 0),
        ("naive_layout_recognize_Naive", "naive",
         {"layout_recognize": "Naive"}, 0),
        ("naive_html4excel_true", "naive", {"html4excel": True}, 0),
        ("naive_html4excel_false", "naive", {"html4excel": False}, 0),
        pytest.param("naive_html4excel_not_bool", "naive", {
            "html4excel": 1}, 102, marks=pytest.mark.xfail(reason='issue#5719')),
        ("naive_delimiter_empty", "naive", {"delimiter": ""}, 0),
        ("naive_delimiter_backticks", "naive", {"delimiter": "`##`"}, 0),
        # Fixed: the key was misspelled "delimiterl", so the server ignored it
        # and this case never tested a non-str delimiter.
        pytest.param("naive_delimiter_not_str", "naive", {
            "delimiter": 1}, 102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_task_page_size_negative", "naive",
                     {"task_page_size": -1},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_task_page_size_zero", "naive",
                     {"task_page_size": 0},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_task_page_size_float", "naive",
                     {"task_page_size": 3.14},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_task_page_size_max", "naive",
                     {"task_page_size": 1024*1024*1024},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        pytest.param("naive_task_page_size_str", "naive",
                     {"task_page_size": '1024'},
                     102, marks=pytest.mark.xfail(reason='issue#5719')),
        ("naive_raptor_true", "naive", {"raptor": {"use_raptor": True}}, 0),
        ("naive_raptor_false", "naive", {"raptor": {"use_raptor": False}}, 0),
        ("knowledge_graph_entity_types_default", "knowledge_graph", {
            "entity_types": ["organization", "person", "location", "event", "time"]}, 0),
        pytest.param("knowledge_graph_entity_types_not_list", "knowledge_graph", {
            "entity_types": "organization,person,location,event,time"}, 102, marks=pytest.mark.xfail(reason='issue#5719'))
    ])
    def test_parser_configs(self, get_http_api_auth, name, chunk_method, parser_config, expected_code):
        payload = {
            "name": name,
            "chunk_method": chunk_method,
            "parser_config": parser_config
        }
        res = create_dataset(get_http_api_auth, payload)
        assert res["code"] == expected_code
        # A non-empty config must be echoed back verbatim; an empty config
        # must be filled in with the server's documented defaults.
        if expected_code == 0 and parser_config != {}:
            for k, v in parser_config.items():
                assert res["data"]["parser_config"][k] == v
        if parser_config == {}:
            assert res["data"]["parser_config"] == {"chunk_token_num": 128,
                                                    "delimiter": "\\n!?;。;!?",
                                                    "html4excel": False,
                                                    "layout_recognize": "DeepDOC",
                                                    "raptor": {"use_raptor": False}}