Potential fix for [https://github.com/infiniflow/ragflow/security/code-scanning/57](https://github.com/infiniflow/ragflow/security/code-scanning/57)

In general, the safest fix is to ensure that any logging of request URLs from `async_request` (and similar helpers) cannot include secrets. This can be done by (a) suppressing logging entirely for URLs considered sensitive, or (b) logging only a non-sensitive subset (e.g., scheme + host + path) and never query strings or credentials.

The minimal, backward-compatible change here is to strengthen `_redact_sensitive_url_params` and `_is_sensitive_url` / the logging call so that we never log query parameters at all. Instead of logging the full URL (with redacted query), we log only `scheme://netloc/path` and strip any embedded userinfo. This retains useful observability (which endpoint, which method, response code, timing) while guaranteeing that no secrets in query strings or embedded credentials appear in logs. (The path itself is preserved, so secrets must not be placed in path segments.)

Concretely:

- Update `_redact_sensitive_url_params` to *not* include the query string in the returned value, and to drop any embedded userinfo (`username:password@host`).
- Continue to wrap logging in a "sensitive URL" guard, but now the redaction routine itself ensures no secrets from the query are present.
- Leave callers (e.g., `github_callback`, `feishu_callback`) unchanged, since they only pass URLs and do not control the logging behavior directly.

All changes are confined to `common/http_client.py`. No new imports are necessary.

_Suggested fixes powered by Copilot Autofix. Review carefully before merging._

---------

Co-authored-by: Copilot Autofix powered by AI <62310815+github-advanced-security[bot]@users.noreply.github.com>
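For illustration, here is a minimal, self-contained sketch of the intended redaction behavior (the URL below is made up, not from the codebase):

```python
from urllib.parse import urlparse, urlunparse

def redact(url: str) -> str:
    # Keep only scheme, host[:port], and path; drop userinfo, query, fragment.
    p = urlparse(url)
    netloc = p.hostname or ""
    if p.port:
        netloc = f"{netloc}:{p.port}"
    return urlunparse((p.scheme, netloc, p.path, "", "", ""))

assert redact("https://u:pw@idp.example.com/oauth/token?code=s3cret") == \
    "https://idp.example.com/oauth/token"
```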
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import asyncio
import logging
import os
import time
from typing import Any, Dict, Optional
from urllib.parse import urlparse, urlunparse

import httpx

from common import settings

logger = logging.getLogger(__name__)

# Default knobs; keep conservative to avoid unexpected behavioural changes.
DEFAULT_TIMEOUT = float(os.environ.get("HTTP_CLIENT_TIMEOUT", "15"))
# Align with requests default: follow redirects with a max of 30 unless overridden.
DEFAULT_FOLLOW_REDIRECTS = bool(
    int(os.environ.get("HTTP_CLIENT_FOLLOW_REDIRECTS", "1"))
)
DEFAULT_MAX_REDIRECTS = int(os.environ.get("HTTP_CLIENT_MAX_REDIRECTS", "30"))
DEFAULT_MAX_RETRIES = int(os.environ.get("HTTP_CLIENT_MAX_RETRIES", "2"))
DEFAULT_BACKOFF_FACTOR = float(os.environ.get("HTTP_CLIENT_BACKOFF_FACTOR", "0.5"))
DEFAULT_PROXY = os.environ.get("HTTP_CLIENT_PROXY")
DEFAULT_USER_AGENT = os.environ.get("HTTP_CLIENT_USER_AGENT", "ragflow-http-client")
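
# Example (illustrative; values are arbitrary): these knobs are read once at
# import time, so they must be set in the environment before this module is
# first imported, e.g.
#
#   HTTP_CLIENT_TIMEOUT=30 HTTP_CLIENT_MAX_RETRIES=5 python -m your_entrypoint
#
# where "your_entrypoint" stands in for whatever imports this module.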


def _clean_headers(
    headers: Optional[Dict[str, str]], auth_token: Optional[str] = None
) -> Optional[Dict[str, str]]:
    merged_headers: Dict[str, str] = {}
    if DEFAULT_USER_AGENT:
        merged_headers["User-Agent"] = DEFAULT_USER_AGENT
    if auth_token:
        merged_headers["Authorization"] = auth_token
    if headers is None:
        return merged_headers or None
    merged_headers.update({str(k): str(v) for k, v in headers.items() if v is not None})
    return merged_headers or None


def _get_delay(backoff_factor: float, attempt: int) -> float:
    return backoff_factor * (2**attempt)
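
# Worked example: with the default backoff_factor of 0.5, successive retry
# delays are 0.5 * 2**0 = 0.5s, 0.5 * 2**1 = 1.0s, 0.5 * 2**2 = 2.0s, ...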


# Known-sensitive query parameter names. Note: the redaction below now drops
# the *entire* query string and does not consult this set; it is kept as
# documentation of the keys that motivated the policy.
_SENSITIVE_QUERY_KEYS = {"client_secret", "secret", "code", "access_token", "refresh_token", "password", "token", "app_secret"}


def _redact_sensitive_url_params(url: str) -> str:
    """
    Return a version of the URL that is safe to log.

    We intentionally drop query parameters and userinfo to avoid leaking
    credentials or tokens via logs. Only scheme, host, port and path
    are preserved.
    """
    try:
        parsed = urlparse(url)
        # Remove any potential userinfo (username:password@)
        netloc = parsed.hostname or ""
        if parsed.port:
            netloc = f"{netloc}:{parsed.port}"
        # Reconstruct URL without query, params, fragment, or userinfo.
        safe_url = urlunparse(
            (
                parsed.scheme,
                netloc,
                parsed.path,
                "",  # params
                "",  # query
                "",  # fragment
            )
        )
        return safe_url
    except Exception:
        # If parsing fails, fall back to omitting the URL entirely.
        return "<redacted-url>"


def _is_sensitive_url(url: str) -> bool:
    """Return True if URL is one of the configured OAuth endpoints."""
    # Collect known sensitive endpoint URLs from settings
    oauth_urls = set()
    # GitHub OAuth endpoints
    try:
        if settings.GITHUB_OAUTH is not None:
            url_val = settings.GITHUB_OAUTH.get("url")
            if url_val:
                oauth_urls.add(url_val)
    except Exception:
        pass
    # Feishu OAuth endpoints
    try:
        if settings.FEISHU_OAUTH is not None:
            for k in ("app_access_token_url", "user_access_token_url"):
                url_val = settings.FEISHU_OAUTH.get(k)
                if url_val:
                    oauth_urls.add(url_val)
    except Exception:
        pass
    # Defensive normalization: compare only scheme+netloc+path
    url_obj = urlparse(url)
    for sensitive_url in oauth_urls:
        sensitive_obj = urlparse(sensitive_url)
        if (url_obj.scheme, url_obj.netloc, url_obj.path) == (sensitive_obj.scheme, sensitive_obj.netloc, sensitive_obj.path):
            return True
    return False
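
# Illustrative match, assuming settings.GITHUB_OAUTH["url"] were
# "https://github.com/login/oauth/access_token": a request to
# "https://github.com/login/oauth/access_token?code=abc" compares equal on
# (scheme, netloc, path) and is therefore treated as sensitive.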


async def async_request(
    method: str,
    url: str,
    *,
    request_timeout: float | httpx.Timeout | None = None,
    follow_redirects: bool | None = None,
    max_redirects: Optional[int] = None,
    headers: Optional[Dict[str, str]] = None,
    auth_token: Optional[str] = None,
    retries: Optional[int] = None,
    backoff_factor: Optional[float] = None,
    proxy: Any = None,
    **kwargs: Any,
) -> httpx.Response:
    """Lightweight async HTTP wrapper using httpx.AsyncClient with safe defaults."""
    timeout = request_timeout if request_timeout is not None else DEFAULT_TIMEOUT
    follow_redirects = (
        DEFAULT_FOLLOW_REDIRECTS if follow_redirects is None else follow_redirects
    )
    max_redirects = DEFAULT_MAX_REDIRECTS if max_redirects is None else max_redirects
    retries = DEFAULT_MAX_RETRIES if retries is None else max(retries, 0)
    backoff_factor = (
        DEFAULT_BACKOFF_FACTOR if backoff_factor is None else backoff_factor
    )
    headers = _clean_headers(headers, auth_token=auth_token)
    proxy = DEFAULT_PROXY if proxy is None else proxy

    async with httpx.AsyncClient(
        timeout=timeout,
        follow_redirects=follow_redirects,
        max_redirects=max_redirects,
        proxy=proxy,
    ) as client:
        last_exc: Exception | None = None
        for attempt in range(retries + 1):
            try:
                start = time.monotonic()
                response = await client.request(
                    method=method, url=url, headers=headers, **kwargs
                )
                duration = time.monotonic() - start
                if not _is_sensitive_url(url):
                    log_url = _redact_sensitive_url_params(url)
                    logger.debug(f"async_request {method} {log_url} -> {response.status_code} in {duration:.3f}s")
                return response
            except httpx.RequestError as exc:
                last_exc = exc
                if attempt >= retries:
                    if not _is_sensitive_url(url):
                        log_url = _redact_sensitive_url_params(url)
                        logger.warning(f"async_request exhausted retries for {method} {log_url}")
                    raise
                delay = _get_delay(backoff_factor, attempt)
                if not _is_sensitive_url(url):
                    log_url = _redact_sensitive_url_params(url)
                    logger.warning(
                        f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}; retrying in {delay:.2f}s"
                    )
                await asyncio.sleep(delay)
    raise last_exc  # pragma: no cover
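
# Usage sketch (illustrative; the endpoint is a placeholder):
#
#   resp = await async_request(
#       "GET", "https://api.example.com/v1/health", request_timeout=5.0
#   )
#   resp.raise_for_status()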


def sync_request(
    method: str,
    url: str,
    *,
    timeout: float | httpx.Timeout | None = None,
    follow_redirects: bool | None = None,
    max_redirects: Optional[int] = None,
    headers: Optional[Dict[str, str]] = None,
    auth_token: Optional[str] = None,
    retries: Optional[int] = None,
    backoff_factor: Optional[float] = None,
    proxy: Any = None,
    **kwargs: Any,
) -> httpx.Response:
    """Synchronous counterpart to async_request, for CLI/tests or sync contexts."""
    timeout = timeout if timeout is not None else DEFAULT_TIMEOUT
    follow_redirects = (
        DEFAULT_FOLLOW_REDIRECTS if follow_redirects is None else follow_redirects
    )
    max_redirects = DEFAULT_MAX_REDIRECTS if max_redirects is None else max_redirects
    retries = DEFAULT_MAX_RETRIES if retries is None else max(retries, 0)
    backoff_factor = (
        DEFAULT_BACKOFF_FACTOR if backoff_factor is None else backoff_factor
    )
    headers = _clean_headers(headers, auth_token=auth_token)
    proxy = DEFAULT_PROXY if proxy is None else proxy

    with httpx.Client(
        timeout=timeout,
        follow_redirects=follow_redirects,
        max_redirects=max_redirects,
        proxy=proxy,
    ) as client:
        last_exc: Exception | None = None
        for attempt in range(retries + 1):
            try:
                start = time.monotonic()
                response = client.request(
                    method=method, url=url, headers=headers, **kwargs
                )
                duration = time.monotonic() - start
                # Apply the same sensitive-URL guard and redaction as
                # async_request, so the sync path cannot leak query-string
                # secrets either.
                if not _is_sensitive_url(url):
                    log_url = _redact_sensitive_url_params(url)
                    logger.debug(
                        f"sync_request {method} {log_url} -> {response.status_code} in {duration:.3f}s"
                    )
                return response
            except httpx.RequestError as exc:
                last_exc = exc
                if attempt >= retries:
                    if not _is_sensitive_url(url):
                        log_url = _redact_sensitive_url_params(url)
                        logger.warning(
                            f"sync_request exhausted retries for {method} {log_url}: {exc}"
                        )
                    raise
                delay = _get_delay(backoff_factor, attempt)
                if not _is_sensitive_url(url):
                    log_url = _redact_sensitive_url_params(url)
                    logger.warning(
                        f"sync_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}: {exc}; retrying in {delay:.2f}s"
                    )
                time.sleep(delay)
    raise last_exc  # pragma: no cover
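
# Synchronous usage sketch, e.g. from a CLI or test (illustrative endpoint):
#
#   resp = sync_request("POST", "https://api.example.com/v1/echo", json={"ping": 1})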


__all__ = [
    "async_request",
    "sync_request",
    "DEFAULT_TIMEOUT",
    "DEFAULT_FOLLOW_REDIRECTS",
    "DEFAULT_MAX_REDIRECTS",
    "DEFAULT_MAX_RETRIES",
    "DEFAULT_BACKOFF_FACTOR",
    "DEFAULT_PROXY",
    "DEFAULT_USER_AGENT",
]