From 74adf3d59c9715ad693d7ec6e333849485dd4f7f Mon Sep 17 00:00:00 2001
From: Yingfeng
Date: Mon, 22 Dec 2025 13:31:03 +0800
Subject: [PATCH] Potential fix for code scanning alert no. 57: Clear-text
 logging of sensitive information (#12071)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Potential fix for
[https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57)

In general, the safest fix is to ensure that any logging of request URLs
from `async_request` (and similar helpers) cannot include secrets. This
can be done by (a) suppressing logging entirely for URLs considered
sensitive, or (b) logging only a non-sensitive subset (e.g., scheme +
host + path) and never query strings or credentials.

The minimal, backward-compatible change here is to strengthen
`_redact_sensitive_url_params` and `_is_sensitive_url` / the logging
call so that we never log query parameters at all. Instead of logging
the full URL (with redacted query), we can log only
`scheme://netloc/path` and optionally strip userinfo. This retains
useful observability (which endpoint, which method, response code,
timing) while guaranteeing that no secrets in query strings or path
segments appear in logs.

Concretely:

- Update `_redact_sensitive_url_params` to *not* include the query
  string in the returned value, and to drop any embedded userinfo
  (`username:password@host`).
- Continue to wrap logging in a “sensitive URL” guard, but now the
  redaction routine itself ensures no secrets from the query are
  present.
- Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged,
  since they only pass URLs and do not control the logging behavior
  directly.

All changes are confined to `common/http_client.py` inside the provided
snippet. No new imports are necessary.

_Suggested fixes powered by Copilot Autofix. Review carefully before
merging._

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
---
 common/http_client.py | 39 ++++++++++++++++++++++++++-------------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/common/http_client.py b/common/http_client.py
index d98db0e82..28c988ef6 100644
--- a/common/http_client.py
+++ b/common/http_client.py
@@ -16,7 +16,7 @@
 import logging
 import os
 import time
 from typing import Any, Dict, Optional
-from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
+from urllib.parse import urlparse, urlunparse
 from common import settings
 import httpx
@@ -58,21 +58,34 @@ def _get_delay(backoff_factor: float, attempt: int) -> float:
 _SENSITIVE_QUERY_KEYS = {"client_secret", "secret", "code", "access_token", "refresh_token", "password", "token", "app_secret"}
 
 def _redact_sensitive_url_params(url: str) -> str:
+    """
+    Return a version of the URL that is safe to log.
+
+    We intentionally drop query parameters and userinfo to avoid leaking
+    credentials or tokens via logs. Only scheme, host, port and path
+    are preserved.
+    """
     try:
         parsed = urlparse(url)
-        if not parsed.query:
-            return url
-        clean_query = []
-        for k, v in parse_qsl(parsed.query, keep_blank_values=True):
-            if k.lower() in _SENSITIVE_QUERY_KEYS:
-                clean_query.append((k, "***REDACTED***"))
-            else:
-                clean_query.append((k, v))
-        new_query = urlencode(clean_query, doseq=True)
-        redacted_url = urlunparse(parsed._replace(query=new_query))
-        return redacted_url
+        # Remove any potential userinfo (username:password@)
+        netloc = parsed.hostname or ""
+        if parsed.port:
+            netloc = f"{netloc}:{parsed.port}"
+        # Reconstruct URL without query, params, fragment, or userinfo.
+        safe_url = urlunparse(
+            (
+                parsed.scheme,
+                netloc,
+                parsed.path,
+                "",  # params
+                "",  # query
+                "",  # fragment
+            )
+        )
+        return safe_url
     except Exception:
-        return url
+        # If parsing fails, fall back to omitting the URL entirely.
+        return ""
 
 def _is_sensitive_url(url: str) -> bool:
     """Return True if URL is one of the configured OAuth endpoints."""