Fix: Create dataset performance unmatched between HTTP api and web ui (#10960)

### What problem does this PR solve?

Fix: Create dataset performance unmatched between HTTP api and web ui #10925

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-02-01 08:05:07 +08:00 · 2025-11-04 13:45:14 +08:00
parent 1e45137284
commit 19f71a961a
12 changed files with 201 additions and 222 deletions
--- a/test/testcases/configs.py
+++ b/test/testcases/configs.py
@@ -34,3 +34,32 @@ DATASET_NAME_LIMIT = 128
 DOCUMENT_NAME_LIMIT = 255
 CHAT_ASSISTANT_NAME_LIMIT = 255
 SESSION_WITH_CHAT_NAME_LIMIT = 255
+
+DEFAULT_PARSER_CONFIG = {  # parser_config the tests in this change expect back when a request sends parser_config as {}, None, or omits it
+    "layout_recognize": "DeepDOC",
+    "chunk_token_num": 512,
+    "delimiter": "\n",  # an actual newline character; the inline literals this constant replaces used the two-character r"\n"
+    "auto_keywords": 0,
+    "auto_questions": 0,
+    "html4excel": False,
+    "topn_tags": 3,
+    "raptor": {
+        "use_raptor": True,
+        "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
+        "max_token": 256,
+        "threshold": 0.1,
+        "max_cluster": 64,
+        "random_seed": 0,
+    },
+    "graphrag": {
+        "use_graphrag": True,
+        "entity_types": [
+            "organization",
+            "person",
+            "geo",
+            "event",
+            "category",
+        ],
+        "method": "light",
+    },
+}
--- a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
@@ -23,7 +23,7 @@ from libs.auth import RAGFlowHttpApiAuth
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-
+from configs import DEFAULT_PARSER_CONFIG

@pytest.mark.usefixtures("clear_datasets")
 class TestAuthorization:
@@ -637,42 +637,21 @@ class TestDatasetCreate:
        payload = {"name": "parser_config_empty", "parser_config": {}}
        res = create_dataset(HttpApiAuth, payload)
        assert res["code"] == 0, res
-        assert res["data"]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p2
    def test_parser_config_unset(self, HttpApiAuth):
        payload = {"name": "parser_config_unset"}
        res = create_dataset(HttpApiAuth, payload)
        assert res["code"] == 0, res
-        assert res["data"]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p3
    def test_parser_config_none(self, HttpApiAuth):
        payload = {"name": "parser_config_none", "parser_config": None}
        res = create_dataset(HttpApiAuth, payload)
        assert res["code"] == 0, res
-        assert res["data"]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": "\\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p2
    @pytest.mark.parametrize(
--- a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
@@ -25,7 +25,7 @@ from libs.auth import RAGFlowHttpApiAuth
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-
+from configs import DEFAULT_PARSER_CONFIG
 # TODO: Missing scenario for updating embedding_model with chunk_count != 0


@@ -748,14 +748,7 @@ class TestDatasetUpdate:

        res = list_datasets(HttpApiAuth)
        assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"][0]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p3
    def test_parser_config_none(self, HttpApiAuth, add_dataset_func):
@@ -766,14 +759,7 @@ class TestDatasetUpdate:

        res = list_datasets(HttpApiAuth, {"id": dataset_id})
        assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"][0]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p3
    def test_parser_config_empty_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py
@@ -19,7 +19,7 @@ import pytest
 from common import list_documents, update_document
 from configs import DOCUMENT_NAME_LIMIT, INVALID_API_TOKEN
 from libs.auth import RAGFlowHttpApiAuth
-
+from configs import DEFAULT_PARSER_CONFIG

@pytest.mark.p1
 class TestAuthorization:
@@ -308,14 +308,7 @@ class TestUpdateDocumentParserConfig:
            ("naive", {}, 0, ""),
            (
                "naive",
-                {
-                    "chunk_token_num": 512,
-                    "layout_recognize": "DeepDOC",
-                    "html4excel": False,
-                    "delimiter": r"\n",
-                    "task_page_size": 12,
-                    "raptor": {"use_raptor": False},
-                },
+                DEFAULT_PARSER_CONFIG,
                0,
                "",
            ),
@@ -419,7 +412,14 @@ class TestUpdateDocumentParserConfig:
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
-            ("naive", {"raptor": {"use_raptor": True}}, 0, ""),
+            ("naive", {"raptor": {  # RAPTOR enabled: settings sit directly under "raptor", with "use_raptor" as a plain boolean
+                "use_raptor": True,
+                "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
+                "max_token": 256,
+                "threshold": 0.1,
+                "max_cluster": 64,
+                "random_seed": 0,
+            }}, 0, ""),
            ("naive", {"raptor": {"use_raptor": False}}, 0, ""),
            pytest.param(
                "naive",
@@ -534,14 +534,7 @@ class TestUpdateDocumentParserConfig:
        if expected_code == 0:
            res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]})
            if parser_config == {}:
-                assert res["data"]["docs"][0]["parser_config"] == {
-                    "chunk_token_num": 512,
-                    "delimiter": r"\n",
-                    "html4excel": False,
-                    "layout_recognize": "DeepDOC",
-                    "raptor": {"use_raptor": False},
-                    "graphrag": {"use_graphrag": False},
-                }
+                assert res["data"]["docs"][0]["parser_config"] == DEFAULT_PARSER_CONFIG
            else:
                for k, v in parser_config.items():
                    assert res["data"]["docs"][0]["parser_config"][k] == v
--- a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py
@@ -23,7 +23,7 @@ from ragflow_sdk import DataSet, RAGFlow
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-
+from configs import DEFAULT_PARSER_CONFIG

@pytest.mark.usefixtures("clear_datasets")
 class TestAuthorization:
@@ -586,14 +586,7 @@ class TestDatasetCreate:
    def test_parser_config_empty(self, client):
        excepted_value = DataSet.ParserConfig(
            client,
-            {
-                "chunk_token_num": 512,
-                "delimiter": r"\n",
-                "html4excel": False,
-                "layout_recognize": "DeepDOC",
-                "raptor": {"use_raptor": False},
-                "graphrag": {"use_graphrag": False},
-            },
+            DEFAULT_PARSER_CONFIG,
        )
        parser_config_o = DataSet.ParserConfig(client, {})
        payload = {"name": "parser_config_empty", "parser_config": parser_config_o}
@@ -604,14 +597,7 @@ class TestDatasetCreate:
    def test_parser_config_unset(self, client):
        excepted_value = DataSet.ParserConfig(
            client,
-            {
-                "chunk_token_num": 512,
-                "delimiter": r"\n",
-                "html4excel": False,
-                "layout_recognize": "DeepDOC",
-                "raptor": {"use_raptor": False},
-                "graphrag": {"use_graphrag": False},
-            },
+            DEFAULT_PARSER_CONFIG,
        )
        payload = {"name": "parser_config_unset"}
        dataset = client.create_dataset(**payload)
@@ -621,14 +607,7 @@ class TestDatasetCreate:
    def test_parser_config_none(self, client):
        excepted_value = DataSet.ParserConfig(
            client,
-            {
-                "chunk_token_num": 512,
-                "delimiter": r"\n",
-                "html4excel": False,
-                "layout_recognize": "DeepDOC",
-                "raptor": {"use_raptor": False},
-                "graphrag": {"use_graphrag": False},
-            },
+            DEFAULT_PARSER_CONFIG,
        )
        payload = {"name": "parser_config_empty", "parser_config": None}
        dataset = client.create_dataset(**payload)
--- a/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py
+++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py
@@ -24,7 +24,7 @@ from ragflow_sdk import DataSet
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-
+from configs import DEFAULT_PARSER_CONFIG

 class TestRquest:
    @pytest.mark.p2
@@ -634,14 +634,7 @@ class TestDatasetUpdate:
        dataset = add_dataset_func
        expected_config = DataSet.ParserConfig(
            client,
-            {
-                "chunk_token_num": 512,
-                "delimiter": r"\n",
-                "html4excel": False,
-                "layout_recognize": "DeepDOC",
-                "raptor": {"use_raptor": False},
-                "graphrag": {"use_graphrag": False},
-            },
+            DEFAULT_PARSER_CONFIG,
        )
        dataset.update({"parser_config": {}})
        assert str(dataset.parser_config) == str(expected_config), str(dataset)
@@ -654,14 +647,7 @@ class TestDatasetUpdate:
        dataset = add_dataset_func
        expected_config = DataSet.ParserConfig(
            client,
-            {
-                "chunk_token_num": 512,
-                "delimiter": r"\n",
-                "html4excel": False,
-                "layout_recognize": "DeepDOC",
-                "raptor": {"use_raptor": False},
-                "graphrag": {"use_graphrag": False},
-            },
+            DEFAULT_PARSER_CONFIG,
        )
        dataset.update({"parser_config": None})
        assert str(dataset.parser_config) == str(expected_config), str(dataset)
--- a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py
+++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py
@@ -17,7 +17,7 @@
 import pytest
 from configs import DOCUMENT_NAME_LIMIT
 from ragflow_sdk import DataSet
-
+from configs import DEFAULT_PARSER_CONFIG  

 class TestDocumentsUpdated:
    @pytest.mark.p1
@@ -206,14 +206,7 @@ class TestUpdateDocumentParserConfig:
            ("naive", {}, ""),
            (
                "naive",
-                {
-                    "chunk_token_num": 512,
-                    "layout_recognize": "DeepDOC",
-                    "html4excel": False,
-                    "delimiter": r"\n",
-                    "task_page_size": 12,
-                    "raptor": {"use_raptor": False},
-                },
+                DEFAULT_PARSER_CONFIG,
                "",
            ),
            pytest.param(
@@ -294,7 +287,12 @@ class TestUpdateDocumentParserConfig:
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
-            ("naive", {"raptor": {"use_raptor": True}}, ""),
+            ("naive", {"raptor": {"use_raptor": True,                 
+                                "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
+                                "max_token": 256,
+                                "threshold": 0.1,
+                                "max_cluster": 64,
+                                "random_seed": 0,}}, ""),
            ("naive", {"raptor": {"use_raptor": False}}, ""),
            pytest.param(
                "naive",
@@ -400,13 +398,6 @@ class TestUpdateDocumentParserConfig:
            else:
                expected_config = DataSet.ParserConfig(
                    client,
-                    {
-                        "chunk_token_num": 512,
-                        "delimiter": r"\n",
-                        "html4excel": False,
-                        "layout_recognize": "DeepDOC",
-                        "raptor": {"use_raptor": False},
-                        "graphrag": {"use_graphrag": False},
-                    },
+                    DEFAULT_PARSER_CONFIG,
                )
                assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)