Fix: Increase default chunk_token_num from 128 to 512 in parser config (#8753)

### What problem does this PR solve?

Raised the default `chunk_token_num` value in `api_utils.py` and
`validation_utils.py` from 128 to 512 to accommodate larger text chunks.
Updated the expected values in the corresponding HTTP and SDK API test
cases to reflect this change.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Liu An
2025-07-10 09:34:03 +08:00
committed by Zhichang Yu
parent aae9fbb9de
commit f8524462b0
8 changed files with 16 additions and 16 deletions

View File

@ -348,7 +348,7 @@ def get_parser_config(chunk_method, parser_config):
if not chunk_method: if not chunk_method:
chunk_method = "naive" chunk_method = "naive"
key_mapping = { key_mapping = {
"naive": {"chunk_token_num": 128, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}}, "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
"qa": {"raptor": {"use_raptor": False}}, "qa": {"raptor": {"use_raptor": False}},
"tag": None, "tag": None,
"resume": None, "resume": None,

View File

@ -363,7 +363,7 @@ class GraphragConfig(Base):
class ParserConfig(Base): class ParserConfig(Base):
auto_keywords: int = Field(default=0, ge=0, le=32) auto_keywords: int = Field(default=0, ge=0, le=32)
auto_questions: int = Field(default=0, ge=0, le=10) auto_questions: int = Field(default=0, ge=0, le=10)
chunk_token_num: int = Field(default=128, ge=1, le=2048) chunk_token_num: int = Field(default=512, ge=1, le=2048)
delimiter: str = Field(default=r"\n", min_length=1) delimiter: str = Field(default=r"\n", min_length=1)
graphrag: GraphragConfig | None = None graphrag: GraphragConfig | None = None
html4excel: bool = False html4excel: bool = False

View File

@ -639,7 +639,7 @@ class TestDatasetCreate:
res = create_dataset(HttpApiAuth, payload) res = create_dataset(HttpApiAuth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["parser_config"] == { assert res["data"]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -652,7 +652,7 @@ class TestDatasetCreate:
res = create_dataset(HttpApiAuth, payload) res = create_dataset(HttpApiAuth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["parser_config"] == { assert res["data"]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -665,7 +665,7 @@ class TestDatasetCreate:
res = create_dataset(HttpApiAuth, payload) res = create_dataset(HttpApiAuth, payload)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"]["parser_config"] == { assert res["data"]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": "\\n", "delimiter": "\\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",

View File

@ -750,7 +750,7 @@ class TestDatasetUpdate:
res = list_datasets(HttpApiAuth) res = list_datasets(HttpApiAuth)
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == { assert res["data"][0]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -767,7 +767,7 @@ class TestDatasetUpdate:
res = list_datasets(HttpApiAuth, {"id": dataset_id}) res = list_datasets(HttpApiAuth, {"id": dataset_id})
assert res["code"] == 0, res assert res["code"] == 0, res
assert res["data"][0]["parser_config"] == { assert res["data"][0]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",

View File

@ -309,7 +309,7 @@ class TestUpdateDocumentParserConfig:
( (
"naive", "naive",
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"html4excel": False, "html4excel": False,
"delimiter": r"\n", "delimiter": r"\n",
@ -535,7 +535,7 @@ class TestUpdateDocumentParserConfig:
res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]}) res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]})
if parser_config == {}: if parser_config == {}:
assert res["data"]["docs"][0]["parser_config"] == { assert res["data"]["docs"][0]["parser_config"] == {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",

View File

@ -588,7 +588,7 @@ class TestDatasetCreate:
excepted_value = DataSet.ParserConfig( excepted_value = DataSet.ParserConfig(
client, client,
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -605,7 +605,7 @@ class TestDatasetCreate:
excepted_value = DataSet.ParserConfig( excepted_value = DataSet.ParserConfig(
client, client,
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -621,7 +621,7 @@ class TestDatasetCreate:
excepted_value = DataSet.ParserConfig( excepted_value = DataSet.ParserConfig(
client, client,
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",

View File

@ -636,7 +636,7 @@ class TestDatasetUpdate:
expected_config = DataSet.ParserConfig( expected_config = DataSet.ParserConfig(
client, client,
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
@ -655,7 +655,7 @@ class TestDatasetUpdate:
expected_config = DataSet.ParserConfig( expected_config = DataSet.ParserConfig(
client, client,
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",

View File

@ -207,7 +207,7 @@ class TestUpdateDocumentParserConfig:
( (
"naive", "naive",
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",
"html4excel": False, "html4excel": False,
"delimiter": r"\n", "delimiter": r"\n",
@ -401,7 +401,7 @@ class TestUpdateDocumentParserConfig:
expected_config = DataSet.ParserConfig( expected_config = DataSet.ParserConfig(
client, client,
{ {
"chunk_token_num": 128, "chunk_token_num": 512,
"delimiter": r"\n", "delimiter": r"\n",
"html4excel": False, "html4excel": False,
"layout_recognize": "DeepDOC", "layout_recognize": "DeepDOC",