Fix: Refactor parser config handling and add GraphRAG defaults (#8778)

### What problem does this PR solve?

- Update `get_parser_config` to merge the provided config with the defaults
- Add GraphRAG configuration defaults for all chunk methods
- Make the `raptor` and `graphrag` fields non-nullable in the `ParserConfig` schema
- Update the related test cases to reflect the config changes
- Ensure backward compatibility while adding the new GraphRAG support
- Related issue: #8396

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Liu An
2025-07-23 09:29:37 +08:00
committed by GitHub
parent c3b8d8b4ba
commit 0020c50000
8 changed files with 179 additions and 28 deletions

View File

@ -593,6 +593,7 @@ class TestDatasetCreate:
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
parser_config_o = DataSet.ParserConfig(client, {})
@ -610,6 +611,7 @@ class TestDatasetCreate:
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
payload = {"name": "parser_config_unset"}
@ -626,6 +628,7 @@ class TestDatasetCreate:
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
payload = {"name": "parser_config_empty", "parser_config": None}
@ -655,3 +658,64 @@ class TestDatasetCreate:
with pytest.raises(Exception) as excinfo:
client.create_dataset(**payload)
assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
@pytest.mark.usefixtures("clear_datasets")
class TestParserConfigBugFix:
    """Checks the parser config returned by ``create_dataset`` for partial inputs.

    Each test creates a dataset from a ``DataSet.ParserConfig`` that omits
    ``raptor``, ``graphrag``, or both, and asserts on the ``parser_config``
    of the dataset that comes back.
    """

    @pytest.mark.p1
    def test_parser_config_missing_raptor_and_graphrag(self, client):
        """Only ``chunk_token_num`` is supplied; both sections are expected disabled."""
        supplied = DataSet.ParserConfig(client, {"chunk_token_num": 1024})
        created = client.create_dataset(
            name="test_parser_config_missing_fields_sdk",
            parser_config=supplied,
        )
        result = created.parser_config

        assert hasattr(result, "raptor"), "raptor field should be present"
        assert hasattr(result, "graphrag"), "graphrag field should be present"
        assert result.raptor.use_raptor is False, "raptor.use_raptor should default to False"
        assert result.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
        assert result.chunk_token_num == 1024, "User-provided chunk_token_num should be preserved"

    @pytest.mark.p1
    def test_parser_config_with_only_raptor(self, client):
        """``raptor`` is supplied enabled; ``graphrag`` is expected present and disabled."""
        supplied = DataSet.ParserConfig(
            client,
            {"chunk_token_num": 1024, "raptor": {"use_raptor": True}},
        )
        created = client.create_dataset(
            name="test_parser_config_only_raptor_sdk",
            parser_config=supplied,
        )
        result = created.parser_config

        assert result.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
        assert hasattr(result, "graphrag"), "graphrag field should be present"
        assert result.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"

    @pytest.mark.p1
    def test_parser_config_with_only_graphrag(self, client):
        """``graphrag`` is supplied enabled; ``raptor`` is expected present and disabled."""
        supplied = DataSet.ParserConfig(
            client,
            {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}},
        )
        created = client.create_dataset(
            name="test_parser_config_only_graphrag_sdk",
            parser_config=supplied,
        )
        result = created.parser_config

        assert hasattr(result, "raptor"), "raptor field should be present"
        assert result.raptor.use_raptor is False, "raptor.use_raptor should default to False"
        assert result.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"

    @pytest.mark.p1
    def test_parser_config_with_both_fields(self, client):
        """Both sections are supplied enabled; both are expected to stay enabled."""
        supplied = DataSet.ParserConfig(
            client,
            {
                "chunk_token_num": 1024,
                "raptor": {"use_raptor": True},
                "graphrag": {"use_graphrag": True},
            },
        )
        created = client.create_dataset(
            name="test_parser_config_both_fields_sdk",
            parser_config=supplied,
        )
        result = created.parser_config

        assert result.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
        assert result.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"

    @pytest.mark.p2
    @pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
    def test_parser_config_different_chunk_methods(self, client, chunk_method):
        """Repeats the both-sections-omitted check once per listed chunk method."""
        supplied = DataSet.ParserConfig(client, {"chunk_token_num": 512})
        created = client.create_dataset(
            name=f"test_parser_config_{chunk_method}_sdk",
            chunk_method=chunk_method,
            parser_config=supplied,
        )
        result = created.parser_config

        assert hasattr(result, "raptor"), f"raptor field should be present for {chunk_method}"
        assert hasattr(result, "graphrag"), f"graphrag field should be present for {chunk_method}"
        assert result.raptor.use_raptor is False, f"raptor.use_raptor should default to False for {chunk_method}"
        assert result.graphrag.use_graphrag is False, f"graphrag.use_graphrag should default to False for {chunk_method}"

View File

@ -641,6 +641,7 @@ class TestDatasetUpdate:
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
dataset.update({"parser_config": {}})
@ -660,6 +661,7 @@ class TestDatasetUpdate:
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
dataset.update({"parser_config": None})
@ -675,6 +677,7 @@ class TestDatasetUpdate:
client,
{
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
dataset.update({"chunk_method": "qa", "parser_config": {}})

View File

@ -406,6 +406,7 @@ class TestUpdateDocumentParserConfig:
"html4excel": False,
"layout_recognize": "DeepDOC",
"raptor": {"use_raptor": False},
"graphrag": {"use_graphrag": False},
},
)
assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)