Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 12:32:30 +08:00)
Fix: Increase default chunk_token_num from 128 to 512 in parser config (#8753)
### What problem does this PR solve?

Updated the default `chunk_token_num` value in `api_utils.py` and `validation_utils.py` from 128 to 512 to accommodate larger text chunks. Adjusted the corresponding test cases in the HTTP and SDK API tests to reflect this change.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -639,7 +639,7 @@ class TestDatasetCreate:
|
||||
res = create_dataset(HttpApiAuth, payload)
|
||||
assert res["code"] == 0, res
|
||||
assert res["data"]["parser_config"] == {
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
@ -652,7 +652,7 @@ class TestDatasetCreate:
|
||||
res = create_dataset(HttpApiAuth, payload)
|
||||
assert res["code"] == 0, res
|
||||
assert res["data"]["parser_config"] == {
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
@ -665,7 +665,7 @@ class TestDatasetCreate:
|
||||
res = create_dataset(HttpApiAuth, payload)
|
||||
assert res["code"] == 0, res
|
||||
assert res["data"]["parser_config"] == {
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": "\\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
|
||||
@ -750,7 +750,7 @@ class TestDatasetUpdate:
|
||||
res = list_datasets(HttpApiAuth)
|
||||
assert res["code"] == 0, res
|
||||
assert res["data"][0]["parser_config"] == {
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
@ -767,7 +767,7 @@ class TestDatasetUpdate:
|
||||
res = list_datasets(HttpApiAuth, {"id": dataset_id})
|
||||
assert res["code"] == 0, res
|
||||
assert res["data"][0]["parser_config"] == {
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
|
||||
@ -309,7 +309,7 @@ class TestUpdateDocumentParserConfig:
|
||||
(
|
||||
"naive",
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"layout_recognize": "DeepDOC",
|
||||
"html4excel": False,
|
||||
"delimiter": r"\n",
|
||||
@ -535,7 +535,7 @@ class TestUpdateDocumentParserConfig:
|
||||
res = list_documents(HttpApiAuth, dataset_id, {"id": document_ids[0]})
|
||||
if parser_config == {}:
|
||||
assert res["data"]["docs"][0]["parser_config"] == {
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
|
||||
@ -588,7 +588,7 @@ class TestDatasetCreate:
|
||||
excepted_value = DataSet.ParserConfig(
|
||||
client,
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
@ -605,7 +605,7 @@ class TestDatasetCreate:
|
||||
excepted_value = DataSet.ParserConfig(
|
||||
client,
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
@ -621,7 +621,7 @@ class TestDatasetCreate:
|
||||
excepted_value = DataSet.ParserConfig(
|
||||
client,
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
|
||||
@ -636,7 +636,7 @@ class TestDatasetUpdate:
|
||||
expected_config = DataSet.ParserConfig(
|
||||
client,
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
@ -655,7 +655,7 @@ class TestDatasetUpdate:
|
||||
expected_config = DataSet.ParserConfig(
|
||||
client,
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
|
||||
@ -207,7 +207,7 @@ class TestUpdateDocumentParserConfig:
|
||||
(
|
||||
"naive",
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"layout_recognize": "DeepDOC",
|
||||
"html4excel": False,
|
||||
"delimiter": r"\n",
|
||||
@ -401,7 +401,7 @@ class TestUpdateDocumentParserConfig:
|
||||
expected_config = DataSet.ParserConfig(
|
||||
client,
|
||||
{
|
||||
- "chunk_token_num": 128,
|
||||
+ "chunk_token_num": 512,
|
||||
"delimiter": r"\n",
|
||||
"html4excel": False,
|
||||
"layout_recognize": "DeepDOC",
|
||||
|
||||
Reference in New Issue
Block a user