mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Fix: Refactor parser config handling and add GraphRAG defaults (#8778)
### What problem does this PR solve? - Update `get_parser_config` to merge provided configs with defaults - Add GraphRAG configuration defaults for all chunk methods - Make raptor and graphrag fields non-nullable in ParserConfig schema - Update related test cases to reflect config changes - Ensure backward compatibility while adding new GraphRAG support - #8396 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
@ -351,28 +351,47 @@ def generate_confirmation_token(tenant_id):
|
||||
|
||||
|
||||
def get_parser_config(chunk_method, parser_config):
    """Return the effective parser configuration for a chunk method.

    Merges a caller-supplied ``parser_config`` with the built-in defaults for
    ``chunk_method`` so that required fields (notably ``raptor`` and
    ``graphrag``) always exist in the result.

    Args:
        chunk_method: Name of the chunking strategy (e.g. ``"naive"``,
            ``"qa"``, ``"knowledge_graph"``). Falsy values fall back to
            ``"naive"``.
        parser_config: Optional user-provided config dict; values here
            override the defaults.

    Returns:
        A config dict (or ``None`` for chunk methods that take no config and
        had no ``parser_config`` supplied).

    Raises:
        KeyError: If ``chunk_method`` is not a known chunk method.
    """
    if not chunk_method:
        chunk_method = "naive"

    # Default configurations for each chunk method. ``None`` means the
    # method takes no parser configuration at all.
    key_mapping = {
        "naive": {"chunk_token_num": 512, "delimiter": r"\n", "html4excel": False, "layout_recognize": "DeepDOC", "raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "qa": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "tag": None,
        "resume": None,
        "manual": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "table": None,
        "paper": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "book": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "laws": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "presentation": {"raptor": {"use_raptor": False}, "graphrag": {"use_graphrag": False}},
        "one": None,
        "knowledge_graph": {
            "chunk_token_num": 8192,
            "delimiter": r"\n",
            "entity_types": ["organization", "person", "location", "event", "time"],
            "raptor": {"use_raptor": False},
            "graphrag": {"use_graphrag": False},
        },
        "email": None,
        "picture": None,
    }

    default_config = key_mapping[chunk_method]

    # No user config: the defaults are the answer.
    if not parser_config:
        return default_config

    # Method takes no config, but the caller supplied one: pass it through
    # unchanged for backward compatibility.
    if default_config is None:
        return parser_config

    # Merge user config over defaults so raptor/graphrag (and any other
    # required fields) are always present. ``deep_merge`` is a project
    # helper defined elsewhere in this module.
    merged_config = deep_merge(default_config, parser_config)

    return merged_config
|
||||
|
||||
|
||||
def get_data_openai(
|
||||
@ -602,17 +621,14 @@ def get_mcp_tools(mcp_servers: list, timeout: float | int = 10) -> tuple[dict, s
|
||||
|
||||
TimeoutException = Union[Type[BaseException], BaseException]
|
||||
OnTimeoutCallback = Union[Callable[..., Any], Coroutine[Any, Any, Any]]
|
||||
def timeout(
|
||||
seconds: float |int = None,
|
||||
attempts: int = 2,
|
||||
*,
|
||||
exception: Optional[TimeoutException] = None,
|
||||
on_timeout: Optional[OnTimeoutCallback] = None
|
||||
):
|
||||
|
||||
|
||||
def timeout(seconds: float | int = None, attempts: int = 2, *, exception: Optional[TimeoutException] = None, on_timeout: Optional[OnTimeoutCallback] = None):
|
||||
def decorator(func):
|
||||
@wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
result_queue = queue.Queue(maxsize=1)
|
||||
|
||||
def target():
|
||||
try:
|
||||
result = func(*args, **kwargs)
|
||||
@ -644,7 +660,7 @@ def timeout(
|
||||
with trio.fail_after(seconds):
|
||||
return await func(*args, **kwargs)
|
||||
except trio.TooSlowError:
|
||||
if a < attempts -1:
|
||||
if a < attempts - 1:
|
||||
continue
|
||||
if on_timeout is not None:
|
||||
if callable(on_timeout):
|
||||
@ -668,11 +684,11 @@ def timeout(
|
||||
if asyncio.iscoroutinefunction(func):
|
||||
return async_wrapper
|
||||
return wrapper
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
async def is_strong_enough(chat_model, embedding_model):
|
||||
|
||||
@timeout(30, 2)
|
||||
async def _is_strong_enough():
|
||||
nonlocal chat_model, embedding_model
|
||||
@ -681,11 +697,11 @@ async def is_strong_enough(chat_model, embedding_model):
|
||||
_ = await trio.to_thread.run_sync(lambda: embedding_model.encode(["Are you strong enough!?"]))
|
||||
if chat_model:
|
||||
with trio.fail_after(30):
|
||||
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role":"user", "content": "Are you strong enough!?"}], {}))
|
||||
res = await trio.to_thread.run_sync(lambda: chat_model.chat("Nothing special.", [{"role": "user", "content": "Are you strong enough!?"}], {}))
|
||||
if res.find("**ERROR**") >= 0:
|
||||
raise Exception(res)
|
||||
|
||||
# Pressure test for GraphRAG task
|
||||
async with trio.open_nursery() as nursery:
|
||||
for _ in range(32):
|
||||
nursery.start_soon(_is_strong_enough)
|
||||
nursery.start_soon(_is_strong_enough)
|
||||
|
||||
@ -365,10 +365,10 @@ class ParserConfig(Base):
|
||||
auto_questions: int = Field(default=0, ge=0, le=10)
|
||||
chunk_token_num: int = Field(default=512, ge=1, le=2048)
|
||||
delimiter: str = Field(default=r"\n", min_length=1)
|
||||
graphrag: GraphragConfig | None = None
|
||||
graphrag: GraphragConfig = Field(default_factory=lambda: GraphragConfig(use_graphrag=False))
|
||||
html4excel: bool = False
|
||||
layout_recognize: str = "DeepDOC"
|
||||
raptor: RaptorConfig | None = None
|
||||
raptor: RaptorConfig = Field(default_factory=lambda: RaptorConfig(use_raptor=False))
|
||||
tag_kb_ids: list[str] = Field(default_factory=list)
|
||||
topn_tags: int = Field(default=1, ge=1, le=10)
|
||||
filename_embd_weight: float | None = Field(default=0.1, ge=0.0, le=1.0)
|
||||
|
||||
Reference in New Issue
Block a user