From f8524462b0d32894bdee5c653beb27fecc9ddf6b Mon Sep 17 00:00:00 2001 From: Liu An Date: Thu, 10 Jul 2025 09:34:03 +0800 Subject: [PATCH] Fix: Increase default `chunk_token_num` from 128 to 512 in parser config (#8753) ### What problem does this PR solve? Updated the default `chunk_token_num` value in `api_utils.py` and `validation_utils.py` to 512 to accommodate larger text chunks. Adjusted corresponding test cases in HTTP and SDK API tests to reflect this change. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/utils/api_utils.py | 2 +- api/utils/validation_utils.py | 2 +- .../test_dataset_mangement/test_create_dataset.py | 6 +++--- .../test_dataset_mangement/test_update_dataset.py | 4 ++-- .../test_update_document.py | 4 ++-- .../test_dataset_mangement/test_create_dataset.py | 6 +++--- .../test_dataset_mangement/test_update_dataset.py | 4 ++-- .../test_update_document.py | 4 ++-- 8 files changed, 16 insertions(+), 16 deletions(-) diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index 8368d9ad4..71e354a34 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -348,7 +348,7 @@ def get_parser_config(chunk_method, parser_config): if not chunk_method: chunk_method = "naive" key_mapping = { - "naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, + "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, "qa": {"raptor": {"use_raptor": False}}, "tag": None, "resume": None, diff --git a/api/utils/validation_utils.py b/api/utils/validation_utils.py index d87d8945d..d331c4b7e 100644 --- a/api/utils/validation_utils.py +++ b/api/utils/validation_utils.py @@ -363,7 +363,7 @@ class GraphragConfig(Base): class ParserConfig(Base): auto_keywords: int = Field(default=0, ge=0, le=32) auto_questions: int = Field(default=0, ge=0, le=10) - chunk_token_num: int = Field(default=128, ge=1, le=2048) + chunk_token_num: int = Field(default=512, ge=1, le=2048) delimiter: str = Field(default=r"\n", min_length=1) graphrag: GraphragConfig | None = None html4excel: bool = False diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py index 22772ad68..feae812d1 100644 --- a/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py +++ b/test/testcases/test_http_api/test_dataset_mangement/test_create_dataset.py @@ -639,7 +639,7 @@ class TestDatasetCreate: res = create_dataset(HttpApiAuth, payload) assert res["code"] == 0, res assert res["data"]["parser_config"] == { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", @@ -652,7 +652,7 @@ class TestDatasetCreate: res = create_dataset(HttpApiAuth, payload) assert res["code"] == 0, res assert res["data"]["parser_config"] == { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", @@ -665,7 +665,7 @@ class TestDatasetCreate: res = create_dataset(HttpApiAuth, payload) assert res["code"] == 0, res assert res["data"]["parser_config"] == { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": "\\n", "html4excel": False, "layout_recognize": "DeepDOC", diff --git a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py index 36d55795f..c5892b579 100644 --- a/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py +++ b/test/testcases/test_http_api/test_dataset_mangement/test_update_dataset.py @@ -750,7 +750,7 @@ class TestDatasetUpdate: res = list_datasets(HttpApiAuth) assert res["code"] == 0, res assert res["data"][0]["parser_config"] == { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", @@ -767,7 +767,7 @@ class TestDatasetUpdate: res = list_datasets(HttpApiAuth, {"id": dataset_id}) assert res["code"] == 0, res assert res["data"][0]["parser_config"] == { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", diff --git a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py index e2ab5d031..f75d7de66 100644 --- a/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py +++ b/test/testcases/test_http_api/test_file_management_within_dataset/test_update_document.py @@ -309,7 +309,7 @@ class TestUpdateDocumentParserConfig: ( "naive", { - "chunk_token_num": 128, + "chunk_token_num": 512, "layout_recognize": "DeepDOC", "html4excel": False, "delimiter": r"\n", @@ -535,7 +535,7 @@ class TestUpdateDocumentParserConfig: res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]}) if parser_config == {}: assert res["data"]["docs"][0]["parser_config"] == { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", diff --git a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py index ffeaaf103..985a26978 100644 --- a/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py +++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_create_dataset.py @@ -588,7 +588,7 @@ class TestDatasetCreate: excepted_value = DataSet.ParserConfig( client, { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", @@ -605,7 +605,7 @@ class TestDatasetCreate: excepted_value = DataSet.ParserConfig( client, { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", @@ -621,7 +621,7 @@ class TestDatasetCreate: excepted_value = DataSet.ParserConfig( client, { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", diff --git a/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py b/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py index 94c4ddb37..164e473e8 100644 --- a/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py +++ b/test/testcases/test_sdk_api/test_dataset_mangement/test_update_dataset.py @@ -636,7 +636,7 @@ class TestDatasetUpdate: expected_config = DataSet.ParserConfig( client, { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", @@ -655,7 +655,7 @@ class TestDatasetUpdate: expected_config = DataSet.ParserConfig( client, { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", diff --git a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py index 1f738bea2..16d05aaad 100644 --- a/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py +++ b/test/testcases/test_sdk_api/test_file_management_within_dataset/test_update_document.py @@ -207,7 +207,7 @@ class TestUpdateDocumentParserConfig: ( "naive", { - "chunk_token_num": 128, + "chunk_token_num": 512, "layout_recognize": "DeepDOC", "html4excel": False, "delimiter": r"\n", @@ -401,7 +401,7 @@ class TestUpdateDocumentParserConfig: expected_config = DataSet.ParserConfig( client, { - "chunk_token_num": 128, + "chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC",