mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: Refactor parser config handling and add GraphRAG defaults (#8778)
### What problem does this PR solve?

- Update `get_parser_config` to merge provided configs with defaults
- Add GraphRAG configuration defaults for all chunk methods
- Make raptor and graphrag fields non-nullable in ParserConfig schema
- Update related test cases to reflect config changes
- Ensure backward compatibility while adding new GraphRAG support
- #8396

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -351,28 +351,47 @@ def generate_confirmation_token(tenant_id):
|
|||||||
|
|
||||||
|
|
||||||
def get_parser_config(chunk_method, parser_config):
|
def get_parser_config(chunk_method, parser_config):
|
||||||
if parser_config:
|
|
||||||
return parser_config
|
|
||||||
if not chunk_method:
|
if not chunk_method:
|
||||||
chunk_method = "naive"
|
chunk_method = "naive"
|
||||||
|
|
||||||
|
# Define default configurations for each chunk method
|
||||||
key_mapping = {
|
key_mapping = {
|
||||||
"naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}},
|
"naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"qa": {"raptor": {"use_raptor": False}},
|
"qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"tag": None,
|
"tag": None,
|
||||||
"resume": None,
|
"resume": None,
|
||||||
"manual": {"raptor": {"use_raptor": False}},
|
"manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"table": None,
|
"table": None,
|
||||||
"paper": {"raptor": {"use_raptor": False}},
|
"paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"book": {"raptor": {"use_raptor": False}},
|
"book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"laws": {"raptor": {"use_raptor": False}},
|
"laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"presentation": {"raptor": {"use_raptor": False}},
|
"presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
|
||||||
"one": None,
|
"one": None,
|
||||||
"knowledge_graph": {"chunk_token_num": 8192, "delimiter": r"\n", "entity_types": ["organization", "person", "location", "event", "time"]},
|
"knowledge_graph": {
|
||||||
|
"chunk_token_num": 8192,
|
||||||
|
"delimiter": r"\n",
|
||||||
|
"entity_types": ["organization", "person", "location", "event", "time"],
|
||||||
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
|
},
|
||||||
"email": None,
|
"email": None,
|
||||||
"picture": None,
|
"picture": None,
|
||||||
}
|
}
|
||||||
parser_config = key_mapping[chunk_method]
|
|
||||||
return parser_config
|
default_config = key_mapping[chunk_method]
|
||||||
|
|
||||||
|
# If no parser_config provided, return default
|
||||||
|
if not parser_config:
|
||||||
|
return default_config
|
||||||
|
|
||||||
|
# If parser_config is provided, merge with defaults to ensure required fields exist
|
||||||
|
if default_config is None:
|
||||||
|
return parser_config
|
||||||
|
|
||||||
|
# Ensure raptor and graphrag fields have default values if not provided
|
||||||
|
merged_config = deep_merge(default_config, parser_config)
|
||||||
|
|
||||||
|
return merged_config
|
||||||
|
|
||||||
|
|
||||||
def get_data_openai(
|
def get_data_openai(
|
||||||
@ -602,17 +621,14 @@ def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, s
|
|||||||
|
|
||||||
TimeoutException = Union[Type[BaseException], BaseException]
|
TimeoutException = Union[Type[BaseException], BaseException]
|
||||||
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
|
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
|
||||||
def timeout(
|
|
||||||
seconds: float |int = None,
|
|
||||||
attempts: int = 2,
|
def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
|
||||||
*,
|
|
||||||
exception: Optional[TimeoutException] = None,
|
|
||||||
on_timeout: Optional[OnTimeoutCallback] = None
|
|
||||||
):
|
|
||||||
def decorator(func):
|
def decorator(func):
|
||||||
@wraps(func)
|
@wraps(func)
|
||||||
def wrapper(*args, **kwargs):
|
def wrapper(*args, **kwargs):
|
||||||
result_queue = queue.Queue(maxsize=1)
|
result_queue = queue.Queue(maxsize=1)
|
||||||
|
|
||||||
def target():
|
def target():
|
||||||
try:
|
try:
|
||||||
result = func(*args, **kwargs)
|
result = func(*args, **kwargs)
|
||||||
@ -644,7 +660,7 @@ def timeout(
|
|||||||
with trio.fail_after(seconds):
|
with trio.fail_after(seconds):
|
||||||
return await func(*args, **kwargs)
|
return await func(*args, **kwargs)
|
||||||
except trio.TooSlowError:
|
except trio.TooSlowError:
|
||||||
if a < attempts -1:
|
if a < attempts - 1:
|
||||||
continue
|
continue
|
||||||
if on_timeout is not None:
|
if on_timeout is not None:
|
||||||
if callable(on_timeout):
|
if callable(on_timeout):
|
||||||
@ -668,11 +684,11 @@ def timeout(
|
|||||||
if asyncio.iscoroutinefunction(func):
|
if asyncio.iscoroutinefunction(func):
|
||||||
return async_wrapper
|
return async_wrapper
|
||||||
return wrapper
|
return wrapper
|
||||||
|
|
||||||
return decorator
|
return decorator
|
||||||
|
|
||||||
|
|
||||||
async def is_strong_enough(chat_model, embedding_model):
|
async def is_strong_enough(chat_model, embedding_model):
|
||||||
|
|
||||||
@timeout(30, 2)
|
@timeout(30, 2)
|
||||||
async def _is_strong_enough():
|
async def _is_strong_enough():
|
||||||
nonlocal chat_model, embedding_model
|
nonlocal chat_model, embedding_model
|
||||||
@ -681,11 +697,11 @@ async def is_strong_enough(chat_model, embedding_model):
|
|||||||
_ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
|
_ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
|
||||||
if chat_model:
|
if chat_model:
|
||||||
with trio.fail_after(30):
|
with trio.fail_after(30):
|
||||||
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role":"user", "content": "Are you strong enough!?"}], {}))
|
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
|
||||||
if res.find("**ERROR**") >= 0:
|
if res.find("**ERROR**") >= 0:
|
||||||
raise Exception(res)
|
raise Exception(res)
|
||||||
|
|
||||||
# Pressure test for GraphRAG task
|
# Pressure test for GraphRAG task
|
||||||
async with trio.open_nursery() as nursery:
|
async with trio.open_nursery() as nursery:
|
||||||
for _ in range(32):
|
for _ in range(32):
|
||||||
nursery.start_soon(_is_strong_enough)
|
nursery.start_soon(_is_strong_enough)
|
||||||
|
|||||||
@ -365,10 +365,10 @@ class ParserConfig(Base):
|
|||||||
auto_questions: int = Field(default=0, ge=0, le=10)
|
auto_questions: int = Field(default=0, ge=0, le=10)
|
||||||
chunk_token_num: int = Field(default=512, ge=1, le=2048)
|
chunk_token_num: int = Field(default=512, ge=1, le=2048)
|
||||||
delimiter: str = Field(default=r"\n", min_length=1)
|
delimiter: str = Field(default=r"\n", min_length=1)
|
||||||
graphrag: GraphragConfig | None = None
|
graphrag: GraphragConfig = Field(default_factory=lambda: GraphragConfig(use_graphrag=False))
|
||||||
html4excel: bool = False
|
html4excel: bool = False
|
||||||
layout_recognize: str = "DeepDOC"
|
layout_recognize: str = "DeepDOC"
|
||||||
raptor: RaptorConfig | None = None
|
raptor: RaptorConfig = Field(default_factory=lambda: RaptorConfig(use_raptor=False))
|
||||||
tag_kb_ids: list[str] = Field(default_factory=list)
|
tag_kb_ids: list[str] = Field(default_factory=list)
|
||||||
topn_tags: int = Field(default=1, ge=1, le=10)
|
topn_tags: int = Field(default=1, ge=1, le=10)
|
||||||
filename_embd_weight: float | None = Field(default=0.1, ge=0.0, le=1.0)
|
filename_embd_weight: float | None = Field(default=0.1, ge=0.0, le=1.0)
|
||||||
|
|||||||
@ -644,6 +644,7 @@ class TestDatasetCreate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
}, res
|
}, res
|
||||||
|
|
||||||
@pytest.mark.p2
|
@pytest.mark.p2
|
||||||
@ -657,6 +658,7 @@ class TestDatasetCreate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
}, res
|
}, res
|
||||||
|
|
||||||
@pytest.mark.p3
|
@pytest.mark.p3
|
||||||
@ -670,6 +672,7 @@ class TestDatasetCreate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
}, res
|
}, res
|
||||||
|
|
||||||
@pytest.mark.p2
|
@pytest.mark.p2
|
||||||
@ -695,3 +698,64 @@ class TestDatasetCreate:
|
|||||||
res = create_dataset(HttpApiAuth, payload)
|
res = create_dataset(HttpApiAuth, payload)
|
||||||
assert res["code"] == 101, res
|
assert res["code"] == 101, res
|
||||||
assert "Extra inputs are not permitted" in res["message"], res
|
assert "Extra inputs are not permitted" in res["message"], res
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("clear_datasets")
|
||||||
|
class TestParserConfigBugFix:
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_missing_raptor_and_graphrag(self, HttpApiAuth):
|
||||||
|
payload = {"name": "test_parser_config_missing_fields", "parser_config": {"chunk_token_num": 1024}}
|
||||||
|
res = create_dataset(HttpApiAuth, payload)
|
||||||
|
assert res["code"] == 0, res
|
||||||
|
|
||||||
|
parser_config = res["data"]["parser_config"]
|
||||||
|
assert "raptor" in parser_config, "raptor field should be present"
|
||||||
|
assert "graphrag" in parser_config, "graphrag field should be present"
|
||||||
|
assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
|
||||||
|
assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
|
||||||
|
assert parser_config["chunk_token_num"] == 1024, "User-provided chunk_token_num should be preserved"
|
||||||
|
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_with_only_raptor(self, HttpApiAuth):
|
||||||
|
payload = {"name": "test_parser_config_only_raptor", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}}}
|
||||||
|
res = create_dataset(HttpApiAuth, payload)
|
||||||
|
assert res["code"] == 0, res
|
||||||
|
|
||||||
|
parser_config = res["data"]["parser_config"]
|
||||||
|
assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
|
||||||
|
assert "graphrag" in parser_config, "graphrag field should be present"
|
||||||
|
assert parser_config["graphrag"]["use_graphrag"] is False, "graphrag.use_graphrag should default to False"
|
||||||
|
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_with_only_graphrag(self, HttpApiAuth):
|
||||||
|
payload = {"name": "test_parser_config_only_graphrag", "parser_config": {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}}}
|
||||||
|
res = create_dataset(HttpApiAuth, payload)
|
||||||
|
assert res["code"] == 0, res
|
||||||
|
|
||||||
|
parser_config = res["data"]["parser_config"]
|
||||||
|
assert "raptor" in parser_config, "raptor field should be present"
|
||||||
|
assert parser_config["raptor"]["use_raptor"] is False, "raptor.use_raptor should default to False"
|
||||||
|
assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
|
||||||
|
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_with_both_fields(self, HttpApiAuth):
|
||||||
|
payload = {"name": "test_parser_config_both_fields", "parser_config": {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}}}
|
||||||
|
res = create_dataset(HttpApiAuth, payload)
|
||||||
|
assert res["code"] == 0, res
|
||||||
|
|
||||||
|
parser_config = res["data"]["parser_config"]
|
||||||
|
assert parser_config["raptor"]["use_raptor"] is True, "User-provided raptor.use_raptor should be preserved"
|
||||||
|
assert parser_config["graphrag"]["use_graphrag"] is True, "User-provided graphrag.use_graphrag should be preserved"
|
||||||
|
|
||||||
|
@pytest.mark.p2
|
||||||
|
@pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
|
||||||
|
def test_parser_config_different_chunk_methods(self, HttpApiAuth, chunk_method):
|
||||||
|
payload = {"name": f"test_parser_config_{chunk_method}", "chunk_method": chunk_method, "parser_config": {"chunk_token_num": 512}}
|
||||||
|
res = create_dataset(HttpApiAuth, payload)
|
||||||
|
assert res["code"] == 0, res
|
||||||
|
|
||||||
|
parser_config = res["data"]["parser_config"]
|
||||||
|
assert "raptor" in parser_config, f"raptor field should be present for {chunk_method}"
|
||||||
|
assert "graphrag" in parser_config, f"graphrag field should be present for {chunk_method}"
|
||||||
|
assert parser_config["raptor"]["use_raptor"] is False, f"raptor.use_raptor should default to False for {chunk_method}"
|
||||||
|
assert parser_config["graphrag"]["use_graphrag"] is False, f"graphrag.use_graphrag should default to False for {chunk_method}"
|
||||||
|
|||||||
@ -755,6 +755,7 @@ class TestDatasetUpdate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
}, res
|
}, res
|
||||||
|
|
||||||
@pytest.mark.p3
|
@pytest.mark.p3
|
||||||
@ -772,6 +773,7 @@ class TestDatasetUpdate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
}, res
|
}, res
|
||||||
|
|
||||||
@pytest.mark.p3
|
@pytest.mark.p3
|
||||||
@ -783,7 +785,7 @@ class TestDatasetUpdate:
|
|||||||
|
|
||||||
res = list_datasets(HttpApiAuth)
|
res = list_datasets(HttpApiAuth)
|
||||||
assert res["code"] == 0, res
|
assert res["code"] == 0, res
|
||||||
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
|
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
|
||||||
|
|
||||||
@pytest.mark.p3
|
@pytest.mark.p3
|
||||||
def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
|
def test_parser_config_unset_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
|
||||||
@ -794,7 +796,7 @@ class TestDatasetUpdate:
|
|||||||
|
|
||||||
res = list_datasets(HttpApiAuth)
|
res = list_datasets(HttpApiAuth)
|
||||||
assert res["code"] == 0, res
|
assert res["code"] == 0, res
|
||||||
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
|
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
|
||||||
|
|
||||||
@pytest.mark.p3
|
@pytest.mark.p3
|
||||||
def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
|
def test_parser_config_none_with_chunk_method_change(self, HttpApiAuth, add_dataset_func):
|
||||||
@ -805,7 +807,7 @@ class TestDatasetUpdate:
|
|||||||
|
|
||||||
res = list_datasets(HttpApiAuth, {"id": dataset_id})
|
res = list_datasets(HttpApiAuth, {"id": dataset_id})
|
||||||
assert res["code"] == 0, res
|
assert res["code"] == 0, res
|
||||||
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}}, res
|
assert res["data"][0]["parser_config"] == {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}}, res
|
||||||
|
|
||||||
@pytest.mark.p2
|
@pytest.mark.p2
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
|||||||
@ -540,6 +540,7 @@ class TestUpdateDocumentParserConfig:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
for k, v in parser_config.items():
|
for k, v in parser_config.items():
|
||||||
|
|||||||
@ -593,6 +593,7 @@ class TestDatasetCreate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
parser_config_o = DataSet.ParserConfig(client, {})
|
parser_config_o = DataSet.ParserConfig(client, {})
|
||||||
@ -610,6 +611,7 @@ class TestDatasetCreate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
payload = {"name": "parser_config_unset"}
|
payload = {"name": "parser_config_unset"}
|
||||||
@ -626,6 +628,7 @@ class TestDatasetCreate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
payload = {"name": "parser_config_empty", "parser_config": None}
|
payload = {"name": "parser_config_empty", "parser_config": None}
|
||||||
@ -655,3 +658,64 @@ class TestDatasetCreate:
|
|||||||
with pytest.raises(Exception) as excinfo:
|
with pytest.raises(Exception) as excinfo:
|
||||||
client.create_dataset(**payload)
|
client.create_dataset(**payload)
|
||||||
assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
|
assert "got an unexpected keyword argument" in str(excinfo.value), str(excinfo.value)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.usefixtures("clear_datasets")
|
||||||
|
class TestParserConfigBugFix:
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_missing_raptor_and_graphrag(self, client):
|
||||||
|
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024})
|
||||||
|
payload = {"name": "test_parser_config_missing_fields_sdk", "parser_config": parser_config}
|
||||||
|
dataset = client.create_dataset(**payload)
|
||||||
|
|
||||||
|
config = dataset.parser_config
|
||||||
|
assert hasattr(config, "raptor"), "raptor field should be present"
|
||||||
|
assert hasattr(config, "graphrag"), "graphrag field should be present"
|
||||||
|
assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
|
||||||
|
assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
|
||||||
|
assert config.chunk_token_num == 1024, "User-provided chunk_token_num should be preserved"
|
||||||
|
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_with_only_raptor(self, client):
|
||||||
|
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}})
|
||||||
|
payload = {"name": "test_parser_config_only_raptor_sdk", "parser_config": parser_config}
|
||||||
|
dataset = client.create_dataset(**payload)
|
||||||
|
|
||||||
|
config = dataset.parser_config
|
||||||
|
assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
|
||||||
|
assert hasattr(config, "graphrag"), "graphrag field should be present"
|
||||||
|
assert config.graphrag.use_graphrag is False, "graphrag.use_graphrag should default to False"
|
||||||
|
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_with_only_graphrag(self, client):
|
||||||
|
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "graphrag": {"use_graphrag": True}})
|
||||||
|
payload = {"name": "test_parser_config_only_graphrag_sdk", "parser_config": parser_config}
|
||||||
|
dataset = client.create_dataset(**payload)
|
||||||
|
|
||||||
|
config = dataset.parser_config
|
||||||
|
assert hasattr(config, "raptor"), "raptor field should be present"
|
||||||
|
assert config.raptor.use_raptor is False, "raptor.use_raptor should default to False"
|
||||||
|
assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
|
||||||
|
|
||||||
|
@pytest.mark.p1
|
||||||
|
def test_parser_config_with_both_fields(self, client):
|
||||||
|
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 1024, "raptor": {"use_raptor": True}, "graphrag": {"use_graphrag": True}})
|
||||||
|
payload = {"name": "test_parser_config_both_fields_sdk", "parser_config": parser_config}
|
||||||
|
dataset = client.create_dataset(**payload)
|
||||||
|
|
||||||
|
config = dataset.parser_config
|
||||||
|
assert config.raptor.use_raptor is True, "User-provided raptor.use_raptor should be preserved"
|
||||||
|
assert config.graphrag.use_graphrag is True, "User-provided graphrag.use_graphrag should be preserved"
|
||||||
|
|
||||||
|
@pytest.mark.p2
|
||||||
|
@pytest.mark.parametrize("chunk_method", ["qa", "manual", "paper", "book", "laws", "presentation"])
|
||||||
|
def test_parser_config_different_chunk_methods(self, client, chunk_method):
|
||||||
|
parser_config = DataSet.ParserConfig(client, {"chunk_token_num": 512})
|
||||||
|
payload = {"name": f"test_parser_config_{chunk_method}_sdk", "chunk_method": chunk_method, "parser_config": parser_config}
|
||||||
|
dataset = client.create_dataset(**payload)
|
||||||
|
|
||||||
|
config = dataset.parser_config
|
||||||
|
assert hasattr(config, "raptor"), f"raptor field should be present for {chunk_method}"
|
||||||
|
assert hasattr(config, "graphrag"), f"graphrag field should be present for {chunk_method}"
|
||||||
|
assert config.raptor.use_raptor is False, f"raptor.use_raptor should default to False for {chunk_method}"
|
||||||
|
assert config.graphrag.use_graphrag is False, f"graphrag.use_graphrag should default to False for {chunk_method}"
|
||||||
|
|||||||
@ -641,6 +641,7 @@ class TestDatasetUpdate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
dataset.update({"parser_config": {}})
|
dataset.update({"parser_config": {}})
|
||||||
@ -660,6 +661,7 @@ class TestDatasetUpdate:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
dataset.update({"parser_config": None})
|
dataset.update({"parser_config": None})
|
||||||
@ -675,6 +677,7 @@ class TestDatasetUpdate:
|
|||||||
client,
|
client,
|
||||||
{
|
{
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
dataset.update({"chunk_method": "qa", "parser_config": {}})
|
dataset.update({"chunk_method": "qa", "parser_config": {}})
|
||||||
|
|||||||
@ -406,6 +406,7 @@ class TestUpdateDocumentParserConfig:
|
|||||||
"html4excel": False,
|
"html4excel": False,
|
||||||
"layout_recognize": "DeepDOC",
|
"layout_recognize": "DeepDOC",
|
||||||
"raptor": {"use_raptor": False},
|
"raptor": {"use_raptor": False},
|
||||||
|
"graphrag": {"use_graphrag": False},
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)
|
assert str(updated_doc.parser_config) == str(expected_config), str(updated_doc)
|
||||||
|
|||||||
Reference in New Issue
Block a user