From 42f9ac997ff47e76991c75119f639882cf1de309 Mon Sep 17 00:00:00 2001 From: Jin Hai Date: Mon, 22 Dec 2025 12:59:37 +0800 Subject: [PATCH 1/6] Remove Chinese comments and fix function arguments errors (#12052) ### What problem does this PR solve? As title ### Type of change - [x] Refactoring Signed-off-by: Jin Hai --- api/utils/api_utils.py | 3 ++- common/data_source/confluence_connector.py | 4 ++-- common/data_source/models.py | 1 + common/data_source/slack_connector.py | 5 ----- 4 files changed, 5 insertions(+), 8 deletions(-) diff --git a/api/utils/api_utils.py b/api/utils/api_utils.py index abc69973b..afb4ff772 100644 --- a/api/utils/api_utils.py +++ b/api/utils/api_utils.py @@ -163,6 +163,7 @@ def validate_request(*args, **kwargs): if error_arguments: error_string += "required argument values: {}".format(",".join(["{}={}".format(a[0], a[1]) for a in error_arguments])) return error_string + return None def wrapper(func): @wraps(func) @@ -409,7 +410,7 @@ def get_parser_config(chunk_method, parser_config): if default_config is None: return deep_merge(base_defaults, parser_config) - # Ensure raptor and graphrag fields have default values if not provided + # Ensure raptor and graph_rag fields have default values if not provided merged_config = deep_merge(base_defaults, default_config) merged_config = deep_merge(merged_config, parser_config) diff --git a/common/data_source/confluence_connector.py b/common/data_source/confluence_connector.py index 643bf8e36..d2494c3de 100644 --- a/common/data_source/confluence_connector.py +++ b/common/data_source/confluence_connector.py @@ -186,7 +186,7 @@ class OnyxConfluence: # between the db and redis everywhere the credentials might be updated new_credential_str = json.dumps(new_credentials) self.redis_client.set( - self.credential_key, new_credential_str, nx=True, ex=self.CREDENTIAL_TTL + self.credential_key, new_credential_str, exp=self.CREDENTIAL_TTL ) self._credentials_provider.set_credentials(new_credentials) @@ -1599,8 +1599,8 @@ class ConfluenceConnector( semantic_identifier=semantic_identifier, extension=".html", # Confluence pages are HTML blob=page_content.encode("utf-8"), # Encode page content as bytes - size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes doc_updated_at=datetime_from_string(page["version"]["when"]), + size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes primary_owners=primary_owners if primary_owners else None, metadata=metadata if metadata else None, ) diff --git a/common/data_source/models.py b/common/data_source/models.py index a5ca05859..ad825eafa 100644 --- a/common/data_source/models.py +++ b/common/data_source/models.py @@ -94,6 +94,7 @@ class Document(BaseModel): blob: bytes doc_updated_at: datetime size_bytes: int + primary_owners: list metadata: Optional[dict[str, Any]] = None diff --git a/common/data_source/slack_connector.py b/common/data_source/slack_connector.py index b6aa80bdd..5fabc3d00 100644 --- a/common/data_source/slack_connector.py +++ b/common/data_source/slack_connector.py @@ -167,7 +167,6 @@ def get_latest_message_time(thread: ThreadType) -> datetime: def _build_doc_id(channel_id: str, thread_ts: str) -> str: - """构建文档ID""" return f"{channel_id}__{thread_ts}" @@ -179,7 +178,6 @@ def thread_to_doc( user_cache: dict[str, BasicExpertInfo | None], channel_access: Any | None, ) -> Document: - """将线程转换为文档""" channel_id = channel["id"] initial_sender_expert_info = expert_info_from_slack_id( @@ -237,7 +235,6 @@ def filter_channels( channels_to_connect: list[str] | None, 
regex_enabled: bool, ) -> list[ChannelType]: - """过滤频道""" if not channels_to_connect: return all_channels @@ -381,7 +378,6 @@ def _process_message( [MessageType], SlackMessageFilterReason | None ] = default_msg_filter, ) -> ProcessedSlackMessage: - """处理消息""" thread_ts = message.get("thread_ts") thread_or_message_ts = thread_ts or message["ts"] try: @@ -536,7 +532,6 @@ class SlackConnector( end: SecondsSinceUnixEpoch | None = None, callback: Any = None, ) -> GenerateSlimDocumentOutput: - """获取所有简化文档(带权限同步)""" if self.client is None: raise ConnectorMissingCredentialError("Slack") From f911aa2997f606612ed460a553ca42b6163fdd55 Mon Sep 17 00:00:00 2001 From: Yongteng Lei Date: Mon, 22 Dec 2025 13:08:44 +0800 Subject: [PATCH 2/6] Fix: list MCP tools may block (#12067) ### What problem does this PR solve? List MCP tools may block. #12043 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- api/apps/mcp_server_app.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/api/apps/mcp_server_app.py b/api/apps/mcp_server_app.py index 863aac963..2c656df18 100644 --- a/api/apps/mcp_server_app.py +++ b/api/apps/mcp_server_app.py @@ -13,6 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. # +import asyncio + from quart import Response, request from api.apps import current_user, login_required @@ -106,7 +108,7 @@ async def create() -> Response: return get_data_error_result(message="Tenant not found.") mcp_server = MCPServer(id=server_name, name=server_name, url=url, server_type=server_type, variables=variables, headers=headers) - server_tools, err_message = get_mcp_tools([mcp_server], timeout) + server_tools, err_message = await asyncio.to_thread(get_mcp_tools, [mcp_server], timeout) if err_message: return get_data_error_result(err_message) @@ -158,7 +160,7 @@ async def update() -> Response: req["id"] = mcp_id mcp_server = MCPServer(id=server_name, name=server_name, url=url, server_type=server_type, variables=variables, headers=headers) - server_tools, err_message = get_mcp_tools([mcp_server], timeout) + server_tools, err_message = await asyncio.to_thread(get_mcp_tools, [mcp_server], timeout) if err_message: return get_data_error_result(err_message) @@ -242,7 +244,7 @@ async def import_multiple() -> Response: headers = {"authorization_token": config["authorization_token"]} if "authorization_token" in config else {} variables = {k: v for k, v in config.items() if k not in {"type", "url", "headers"}} mcp_server = MCPServer(id=new_name, name=new_name, url=config["url"], server_type=config["type"], variables=variables, headers=headers) - server_tools, err_message = get_mcp_tools([mcp_server], timeout) + server_tools, err_message = await asyncio.to_thread(get_mcp_tools, [mcp_server], timeout) if err_message: results.append({"server": base_name, "success": False, "message": err_message}) continue @@ -322,7 +324,7 @@ async def list_tools() -> Response: tool_call_sessions.append(tool_call_session) try: - tools = tool_call_session.get_tools(timeout) + tools = await asyncio.to_thread(tool_call_session.get_tools, timeout) except Exception as e: tools = [] return get_data_error_result(message=f"MCP list tools error: {e}") @@ -340,7 +342,7 @@ async def list_tools() -> Response: return server_error_response(e) finally: # PERF: blocking call to close sessions — consider moving to background thread or task queue - close_multiple_mcp_toolcall_sessions(tool_call_sessions) + await 
asyncio.to_thread(close_multiple_mcp_toolcall_sessions, tool_call_sessions) @manager.route("/test_tool", methods=["POST"]) # noqa: F821 @@ -367,10 +369,10 @@ async def test_tool() -> Response: tool_call_session = MCPToolCallSession(mcp_server, mcp_server.variables) tool_call_sessions.append(tool_call_session) - result = tool_call_session.tool_call(tool_name, arguments, timeout) + result = await asyncio.to_thread(tool_call_session.tool_call, tool_name, arguments, timeout) # PERF: blocking call to close sessions — consider moving to background thread or task queue - close_multiple_mcp_toolcall_sessions(tool_call_sessions) + await asyncio.to_thread(close_multiple_mcp_toolcall_sessions, tool_call_sessions) return get_json_result(data=result) except Exception as e: return server_error_response(e) @@ -424,13 +426,13 @@ async def test_mcp() -> Response: tool_call_session = MCPToolCallSession(mcp_server, mcp_server.variables) try: - tools = tool_call_session.get_tools(timeout) + tools = await asyncio.to_thread(tool_call_session.get_tools, timeout) except Exception as e: tools = [] return get_data_error_result(message=f"Test MCP error: {e}") finally: # PERF: blocking call to close sessions — consider moving to background thread or task queue - close_multiple_mcp_toolcall_sessions([tool_call_session]) + await asyncio.to_thread(close_multiple_mcp_toolcall_sessions, [tool_call_session]) for tool in tools: tool_dict = tool.model_dump() From ba7e087aef37b4bfb83a0b42514ae618ed42a9a4 Mon Sep 17 00:00:00 2001 From: Stephen Hu <812791840@qq.com> Date: Mon, 22 Dec 2025 13:09:42 +0800 Subject: [PATCH 3/6] Refactor:remove useless try catch for ppt parser (#12063) ### What problem does this PR solve? remove useless try catch for ppt parser ### Type of change - [x] Refactoring --- deepdoc/parser/ppt_parser.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/deepdoc/parser/ppt_parser.py b/deepdoc/parser/ppt_parser.py index 39eab03a6..1b04b4d7c 100644 --- a/deepdoc/parser/ppt_parser.py +++ b/deepdoc/parser/ppt_parser.py @@ -88,12 +88,9 @@ class RAGFlowPptParser: texts = [] for shape in sorted( slide.shapes, key=lambda x: ((x.top if x.top is not None else 0) // 10, x.left if x.left is not None else 0)): - try: - txt = self.__extract(shape) - if txt: - texts.append(txt) - except Exception as e: - logging.exception(e) + txt = self.__extract(shape) + if txt: + texts.append(txt) txts.append("\n".join(texts)) return txts From 74adf3d59c9715ad693d7ec6e333849485dd4f7f Mon Sep 17 00:00:00 2001 From: Yingfeng Date: Mon, 22 Dec 2025 13:31:03 +0800 Subject: [PATCH 4/6] Potential fix for code scanning alert no. 57: Clear-text logging of sensitive information (#12071) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57) In general, the safest fix is to ensure that any logging of request URLs from `async_request` (and similar helpers) cannot include secrets. This can be done by (a) suppressing logging entirely for URLs considered sensitive, or (b) logging only a non-sensitive subset (e.g., scheme + host + path) and never query strings or credentials. The minimal, backward-compatible change here is to strengthen `_redact_sensitive_url_params` and `_is_sensitive_url` / the logging call so that we never log query parameters at all. 
Instead of logging the full URL (with redacted query), we can log only `scheme://netloc/path` and optionally strip userinfo. This retains useful observability (which endpoint, which method, response code, timing) while guaranteeing that no secrets in query strings or path segments appear in logs. Concretely: - Update `_redact_sensitive_url_params` to *not* include the query string in the returned value, and to drop any embedded userinfo (`username:password@host`). - Continue to wrap logging in a “sensitive URL” guard, but now the redaction routine itself ensures no secrets from query are present. - Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged, since they only pass URLs and do not control the logging behavior directly. All changes are confined to `common/http_client.py` inside the provided snippet. No new imports are necessary. _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ --------- Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- common/http_client.py | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) diff --git a/common/http_client.py b/common/http_client.py index d98db0e82..28c988ef6 100644 --- a/common/http_client.py +++ b/common/http_client.py @@ -16,7 +16,7 @@ import logging import os import time from typing import Any, Dict, Optional -from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse +from urllib.parse import urlparse, urlunparse from common import settings import httpx @@ -58,21 +58,34 @@ def _get_delay(backoff_factor: float, attempt: int) -> float: _SENSITIVE_QUERY_KEYS = {"client_secret", "secret", "code", "access_token", "refresh_token", "password", "token", "app_secret"} def _redact_sensitive_url_params(url: str) -> str: + """ + Return a version of the URL that is safe to log. + + We intentionally drop query parameters and userinfo to avoid leaking + credentials or tokens via logs. Only scheme, host, port and path + are preserved. + """ try: parsed = urlparse(url) - if not parsed.query: - return url - clean_query = [] - for k, v in parse_qsl(parsed.query, keep_blank_values=True): - if k.lower() in _SENSITIVE_QUERY_KEYS: - clean_query.append((k, "***REDACTED***")) - else: - clean_query.append((k, v)) - new_query = urlencode(clean_query, doseq=True) - redacted_url = urlunparse(parsed._replace(query=new_query)) - return redacted_url + # Remove any potential userinfo (username:password@) + netloc = parsed.hostname or "" + if parsed.port: + netloc = f"{netloc}:{parsed.port}" + # Reconstruct URL without query, params, fragment, or userinfo. + safe_url = urlunparse( + ( + parsed.scheme, + netloc, + parsed.path, + "", # params + "", # query + "", # fragment + ) + ) + return safe_url except Exception: - return url + # If parsing fails, fall back to omitting the URL entirely. + return "" def _is_sensitive_url(url: str) -> bool: """Return True if URL is one of the configured OAuth endpoints.""" From bfef96d56e52f96ac520abdc55b771b803daf6fb Mon Sep 17 00:00:00 2001 From: Yingfeng Date: Mon, 22 Dec 2025 13:31:25 +0800 Subject: [PATCH 5/6] Potential fix for code scanning alert no. 
58: Clear-text logging of sensitive information (#12070) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/58](https://github.com/infiniflow/ragflow/security/code-scanning/58) General approach: avoid logging potentially sensitive URLs (especially at warning level) or ensure they are fully and robustly redacted before logging. Since this client is shared and used with OAuth endpoints, the safest minimal-change fix is to stop including the URL in warning logs (retries exhausted and retry attempts) and only log the HTTP method and a generic message. Debug logs can continue using the existing redaction helper for non-sensitive URLs if desired. Best concrete fix without changing functionality: in `common/http_client.py`, in `async_request`, change the retry-exhausted and retry-attempt warning log statements so that they no longer interpolate `log_url` (and thus the tainted `url`). We can still compute `log_url` if needed elsewhere, but the log string itself should not contain `log_url`. This directly removes the tainted data from the sink while preserving information about errors and retry behavior. No changes are required in `common/settings.py` or `api/apps/user_app.py`, and we do not need new imports or helpers. Specifically: - In `common/http_client.py`, around line 152–163, replace the two warning logs: - `logger.warning(f"async_request exhausted retries for {method} {log_url}")` - `logger.warning(f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}; retrying in {delay:.2f}s")` with versions that omit `{log_url}`, such as: - `logger.warning(f"async_request exhausted retries for {method}")` - `logger.warning(f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s")` This ensures no URL-derived data flows into these warning logs, addressing all variants of the alert, since they all trace to the same sink. --- _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- common/http_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/http_client.py b/common/http_client.py index 28c988ef6..b74676da6 100644 --- a/common/http_client.py +++ b/common/http_client.py @@ -166,13 +166,13 @@ async def async_request( if attempt >= retries: if not _is_sensitive_url(url): log_url = _redact_sensitive_url_params(url) - logger.warning(f"async_request exhausted retries for {method} {log_url}") + logger.warning(f"async_request exhausted retries for {method}") raise delay = _get_delay(backoff_factor, attempt) if not _is_sensitive_url(url): log_url = _redact_sensitive_url_params(url) logger.warning( - f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}; retrying in {delay:.2f}s" + f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s" ) await asyncio.sleep(delay) raise last_exc # pragma: no cover From 6c9afd1ffb7f8fb0e13f65d87465f26303fdde9a Mon Sep 17 00:00:00 2001 From: Yingfeng Date: Mon, 22 Dec 2025 13:31:39 +0800 Subject: [PATCH 6/6] Potential fix for code scanning alert no. 
60: Clear-text logging of sensitive information (#12068) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/60](https://github.com/infiniflow/ragflow/security/code-scanning/60) In general, the correct fix is to ensure that no sensitive data (passwords, API keys, full connection strings with embedded credentials, etc.) is ever written to logs. This can be done by (1) whitelisting only clearly non-sensitive fields for logging, and/or (2) explicitly scrubbing or masking any value that might contain credentials before logging, and (3) not relying on later deletion from the dictionary to protect against logging, since the log call already happened. For this function, the best minimal fix is: - Keep the idea of a safe key whitelist, but strengthen it so we are absolutely sure we never log `password` or `connection_string`, even indirectly. - Avoid building the logged dict from the same potentially-tainted `kwargs` object before we have removed sensitive keys, or relying solely on key names that might change. - Construct a separate, small log context that is obviously safe: scheme, host, port, database, table, and possibly a boolean like `has_password` instead of the password itself. - Optionally, add a small helper to derive this safe log context, but given the scope we can keep it inline. Concretely in `rag/utils/opendal_conn.py`: - Replace the current `SAFE_LOG_KEYS` / `loggable_kwargs` / `logging.info(...)` block so that: - We do not pass through arbitrary `kwargs` values by key filtering alone. - We instead build a new dict with explicitly chosen, non-sensitive fields, e.g.: ```python safe_log_info = { "scheme": kwargs.get("scheme"), "host": kwargs.get("host"), "port": kwargs.get("port"), "database": kwargs.get("database"), "table": kwargs.get("table"), "has_password": "password" in kwargs or "connection_string" in kwargs, } logging.info("Loaded OpenDAL configuration (non sensitive fields only): %s", safe_log_info) ``` - This makes sure that neither the password nor a connection string containing it is ever logged, while still retaining useful diagnostic information. - Keep the existing deletion of `password` and `connection_string` from `kwargs` after logging, as an additional safety measure for any later use of `kwargs`. No new imports or external libraries are required; we only modify lines 45–56 of the shown snippet. _Suggested fixes powered by Copilot Autofix. Review carefully before merging._ Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com> --- rag/utils/opendal_conn.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/rag/utils/opendal_conn.py b/rag/utils/opendal_conn.py index c261665f7..b188346c7 100644 --- a/rag/utils/opendal_conn.py +++ b/rag/utils/opendal_conn.py @@ -45,9 +45,16 @@ def get_opendal_config(): # Only include non-sensitive keys in logs. Do NOT # add 'password' or any key containing embedded credentials # (like 'connection_string'). 
- SAFE_LOG_KEYS = ['scheme', 'host', 'port', 'database', 'table'] # explicitly non-sensitive - loggable_kwargs = {k: v for k, v in kwargs.items() if k in SAFE_LOG_KEYS} - logging.info("Loaded OpenDAL configuration (non sensitive fields only): %s", loggable_kwargs) + safe_log_info = { + "scheme": kwargs.get("scheme"), + "host": kwargs.get("host"), + "port": kwargs.get("port"), + "database": kwargs.get("database"), + "table": kwargs.get("table"), + # indicate presence of credentials without logging them + "has_credentials": any(k in kwargs for k in ("password", "connection_string")), + } + logging.info("Loaded OpenDAL configuration (non sensitive fields only): %s", safe_log_info) # For safety, explicitly remove sensitive keys from kwargs after use if "password" in kwargs:
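
The redaction approach running through patches 4 and 5 is easy to verify in isolation. Below is a minimal standalone sketch of the behavior patch 4 introduces in `_redact_sensitive_url_params`: keep only scheme, host, port, and path, and drop userinfo, query, and fragment. Only the standard library is used; the function name `redact_url_for_logging` and the demo URL are illustrative, not identifiers from the RAGFlow tree.

```python
# Standalone sketch of the log-redaction behavior introduced in patch 4.
# The function name and demo URL are illustrative, not RAGFlow identifiers.
from urllib.parse import urlparse, urlunparse


def redact_url_for_logging(url: str) -> str:
    """Keep only scheme, host, port and path; drop userinfo, query, fragment."""
    try:
        parsed = urlparse(url)
        netloc = parsed.hostname or ""  # hostname excludes any user:pass@ prefix
        if parsed.port:
            netloc = f"{netloc}:{parsed.port}"
        return urlunparse((parsed.scheme, netloc, parsed.path, "", "", ""))
    except Exception:
        # As in the patch: if parsing fails, omit the URL entirely.
        return ""


if __name__ == "__main__":
    # Both the userinfo and the token-bearing query string are dropped.
    url = "https://alice:hunter2@auth.example.com:8443/oauth/token?code=abc&client_secret=xyz"
    print(redact_url_for_logging(url))
    # Prints: https://auth.example.com:8443/oauth/token
```

Patch 2 applies one pattern throughout `mcp_server_app.py`: every synchronous MCP call that can stall on the network is moved off the event loop with `asyncio.to_thread`. A self-contained sketch of that pattern follows; `get_tools_sync` is a made-up stand-in for `MCPToolCallSession.get_tools`, with `time.sleep` simulating the blocking network wait.

```python
# Sketch of the offload pattern from patch 2: run a blocking helper on a
# worker thread so the event loop stays responsive while it waits.
import asyncio
import time


def get_tools_sync(timeout: float) -> list[str]:
    time.sleep(timeout)  # simulates a blocking MCP network round-trip
    return ["search", "fetch"]


async def main() -> None:
    # Awaiting to_thread yields control to the loop; calling get_tools_sync
    # directly here would block every coroutine served by the same loop.
    tools = await asyncio.to_thread(get_tools_sync, 0.1)
    print(tools)


asyncio.run(main())
```

Note that `asyncio.to_thread` runs the callable in the loop's default thread pool, so it relieves blocking I/O like these MCP calls but does not help CPU-bound work, which still holds the GIL.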