Fix: Create dataset performance unmatched between HTTP api and web ui (#10960)

### What problem does this PR solve?

Fix: Create dataset performance unmatched between HTTP api and web ui #10925

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-30 23:26:36 +08:00 · 2025-11-04 13:45:14 +08:00
parent 1e45137284
commit 19f71a961a
12 changed files with 201 additions and 222 deletions
--- a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py
@@ -23,7 +23,7 @@ from libs.auth import RAGFlowHttpApiAuth
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-
+from configs import DEFAULT_PARSER_CONFIG

@pytest.mark.usefixtures("clear_datasets")
 class TestAuthorization:
@@ -637,42 +637,21 @@ class TestDatasetCreate:
        payload = {"name": "parser_config_empty", "parser_config": {}}
        res = create_dataset(HttpApiAuth, payload)
        assert res["code"] == 0, res
-        assert res["data"]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p2
    def test_parser_config_unset(self, HttpApiAuth):
        payload = {"name": "parser_config_unset"}
        res = create_dataset(HttpApiAuth, payload)
        assert res["code"] == 0, res
-        assert res["data"]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p3
    def test_parser_config_none(self, HttpApiAuth):
        payload = {"name": "parser_config_none", "parser_config": None}
        res = create_dataset(HttpApiAuth, payload)
        assert res["code"] == 0, res
-        assert res["data"]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": "\\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p2
    @pytest.mark.parametrize(
--- a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
+++ b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py
@@ -25,7 +25,7 @@ from libs.auth import RAGFlowHttpApiAuth
 from utils import encode_avatar
 from utils.file_utils import create_image_file
 from utils.hypothesis_utils import valid_names
-
+from configs import DEFAULT_PARSER_CONFIG
 # TODO: Missing scenario for updating embedding_model with chunk_count != 0


@@ -748,14 +748,7 @@ class TestDatasetUpdate:

        res = list_datasets(HttpApiAuth)
        assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"][0]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p3
    def test_parser_config_none(self, HttpApiAuth, add_dataset_func):
@@ -766,14 +759,7 @@ class TestDatasetUpdate:

        res = list_datasets(HttpApiAuth, {"id": dataset_id})
        assert res["code"] == 0, res
-        assert res["data"][0]["parser_config"] == {
-            "chunk_token_num": 512,
-            "delimiter": r"\n",
-            "html4excel": False,
-            "layout_recognize": "DeepDOC",
-            "raptor": {"use_raptor": False},
-            "graphrag": {"use_graphrag": False},
-        }, res
+        assert res["data"][0]["parser_config"] == DEFAULT_PARSER_CONFIG, res

    @pytest.mark.p3
    def test_parser_config_empty_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
--- a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py
+++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py
@@ -19,7 +19,7 @@ import pytest
 from common import list_documents, update_document
 from configs import DOCUMENT_NAME_LIMIT, INVALID_API_TOKEN
 from libs.auth import RAGFlowHttpApiAuth
-
+from configs import DEFAULT_PARSER_CONFIG

@pytest.mark.p1
 class TestAuthorization:
@@ -308,14 +308,7 @@ class TestUpdateDocumentParserConfig:
            ("naive", {}, 0, ""),
            (
                "naive",
-                {
-                    "chunk_token_num": 512,
-                    "layout_recognize": "DeepDOC",
-                    "html4excel": False,
-                    "delimiter": r"\n",
-                    "task_page_size": 12,
-                    "raptor": {"use_raptor": False},
-                },
+                DEFAULT_PARSER_CONFIG,
                0,
                "",
            ),
@@ -419,7 +412,14 @@ class TestUpdateDocumentParserConfig:
                "",
                marks=pytest.mark.skip(reason="issues/6098"),
            ),
-            ("naive", {"raptor": {"use_raptor": True}}, 0, ""),
+            ("naive", {"raptor": {"use_raptor": {
+                "use_raptor": True,
+                "prompt": "Please summarize the following paragraphs. Be careful with the numbers, do not make things up. Paragraphs as following:\n      {cluster_content}\nThe above is the content you need to summarize.",
+                "max_token": 256,
+                "threshold": 0.1,
+                "max_cluster": 64,
+                "random_seed": 0,
+            },}}, 0, ""),
            ("naive", {"raptor": {"use_raptor": False}}, 0, ""),
            pytest.param(
                "naive",
@@ -534,14 +534,7 @@ class TestUpdateDocumentParserConfig:
        if expected_code == 0:
            res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]})
            if parser_config == {}:
-                assert res["data"]["docs"][0]["parser_config"] == {
-                    "chunk_token_num": 512,
-                    "delimiter": r"\n",
-                    "html4excel": False,
-                    "layout_recognize": "DeepDOC",
-                    "raptor": {"use_raptor": False},
-                    "graphrag": {"use_graphrag": False},
-                }
+                assert res["data"]["docs"][0]["parser_config"] == DEFAULT_PARSER_CONFIG
            else:
                for k, v in parser_config.items():
                    assert res["data"]["docs"][0]["parser_config"][k] == v