Merge branch 'main' into alert-autofix-59

Kevin Hu
2025-12-22 13:33:47 +08:00
committed by GitHub
8 changed files with 63 additions and 41 deletions

View File

@@ -186,7 +186,7 @@ class OnyxConfluence:
# between the db and redis everywhere the credentials might be updated
new_credential_str = json.dumps(new_credentials)
self.redis_client.set(
self.credential_key, new_credential_str, nx=True, ex=self.CREDENTIAL_TTL
self.credential_key, new_credential_str, ex=self.CREDENTIAL_TTL
)
self._credentials_provider.set_credentials(new_credentials)
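
For context on the change above: redis-py's set() takes ex= for a TTL in seconds and nx=True for "only set if the key is absent". Dropping nx means a freshly rotated credential always overwrites the cached value, so Redis never lags behind the database. A minimal sketch, assuming a plain redis.Redis client and hypothetical key and TTL values:

import json

import redis  # any redis-py compatible client

r = redis.Redis()
CREDENTIAL_TTL = 3600  # hypothetical TTL in seconds
new_credentials = {"token": "example"}  # placeholder payload

# Overwrite unconditionally and let the key expire after CREDENTIAL_TTL seconds.
r.set("confluence_credential_key", json.dumps(new_credentials), ex=CREDENTIAL_TTL)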
@@ -1599,8 +1599,8 @@ class ConfluenceConnector(
semantic_identifier=semantic_identifier,
extension=".html", # Confluence pages are HTML
blob=page_content.encode("utf-8"), # Encode page content as bytes
size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes
doc_updated_at=datetime_from_string(page["version"]["when"]),
size_bytes=len(page_content.encode("utf-8")), # Calculate size in bytes
primary_owners=primary_owners if primary_owners else None,
metadata=metadata if metadata else None,
)
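
One small observation about the construction above: page_content is encoded twice, once for blob and once for size_bytes. A tiny sketch of an equivalent single-encode variant (page_content here is a placeholder; in the connector it is the HTML built from the Confluence page):

page_content = "<p>example</p>"  # placeholder page HTML
encoded_page = page_content.encode("utf-8")  # encode once
blob = encoded_page                          # bytes stored on the Document
size_bytes = len(encoded_page)               # size of exactly those bytes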

View File

@@ -94,6 +94,7 @@ class Document(BaseModel):
blob: bytes
doc_updated_at: datetime
size_bytes: int
primary_owners: Optional[list[Any]] = None
metadata: Optional[dict[str, Any]] = None
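
A minimal, self-contained sketch of the model after this hunk, with primary_owners allowed to be None since callers pass primary_owners if primary_owners else None; only the fields visible above are included, and the list element type is an assumption:

from datetime import datetime, timezone
from typing import Any, Optional

from pydantic import BaseModel

class Document(BaseModel):
    blob: bytes
    doc_updated_at: datetime
    size_bytes: int
    primary_owners: Optional[list[Any]] = None  # None when no owners are known
    metadata: Optional[dict[str, Any]] = None

doc = Document(
    blob=b"<p>example</p>",
    doc_updated_at=datetime.now(timezone.utc),
    size_bytes=len(b"<p>example</p>"),
)  # primary_owners and metadata fall back to None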

View File

@@ -167,7 +167,6 @@ def get_latest_message_time(thread: ThreadType) -> datetime:
def _build_doc_id(channel_id: str, thread_ts: str) -> str:
"""构建文档ID"""
return f"{channel_id}__{thread_ts}"
@@ -179,7 +178,6 @@ def thread_to_doc(
user_cache: dict[str, BasicExpertInfo | None],
channel_access: Any | None,
) -> Document:
"""将线程转换为文档"""
channel_id = channel["id"]
initial_sender_expert_info = expert_info_from_slack_id(
@@ -237,7 +235,6 @@ def filter_channels(
channels_to_connect: list[str] | None,
regex_enabled: bool,
) -> list[ChannelType]:
"""过滤频道"""
if not channels_to_connect:
return all_channels
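
The rest of filter_channels is not shown beyond the empty-filter shortcut, so the following is only a hedged sketch of what name-based filtering with regex_enabled could look like; apart from the parameters in the signature, every name below is an assumption:

import re
from typing import Any

def filter_channels_sketch(
    all_channels: list[dict[str, Any]],
    channels_to_connect: list[str] | None,
    regex_enabled: bool,
) -> list[dict[str, Any]]:
    if not channels_to_connect:
        return all_channels
    if regex_enabled:
        # Treat each configured entry as a regular expression against the channel name.
        patterns = [re.compile(p) for p in channels_to_connect]
        return [
            channel
            for channel in all_channels
            if any(p.match(channel["name"]) for p in patterns)
        ]
    # Otherwise match channel names exactly.
    wanted = set(channels_to_connect)
    return [channel for channel in all_channels if channel["name"] in wanted]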
@@ -381,7 +378,6 @@ def _process_message(
[MessageType], SlackMessageFilterReason | None
] = default_msg_filter,
) -> ProcessedSlackMessage:
"""处理消息"""
thread_ts = message.get("thread_ts")
thread_or_message_ts = thread_ts or message["ts"]
try:
@@ -536,7 +532,6 @@ class SlackConnector(
end: SecondsSinceUnixEpoch | None = None,
callback: Any = None,
) -> GenerateSlimDocumentOutput:
"""获取所有简化文档(带权限同步)"""
if self.client is None:
raise ConnectorMissingCredentialError("Slack")

View File

@@ -16,7 +16,7 @@ import logging
import os
import time
from typing import Any, Dict, Optional
from urllib.parse import parse_qsl, urlencode, urlparse, urlunparse
from urllib.parse import urlparse, urlunparse
from common import settings
import httpx
@@ -58,21 +58,34 @@ def _get_delay(backoff_factor: float, attempt: int) -> float:
_SENSITIVE_QUERY_KEYS = {"client_secret", "secret", "code", "access_token", "refresh_token", "password", "token", "app_secret"}
def _redact_sensitive_url_params(url: str) -> str:
"""
Return a version of the URL that is safe to log.
We intentionally drop query parameters and userinfo to avoid leaking
credentials or tokens via logs. Only scheme, host, port and path
are preserved.
"""
try:
parsed = urlparse(url)
if not parsed.query:
return url
clean_query = []
for k, v in parse_qsl(parsed.query, keep_blank_values=True):
if k.lower() in _SENSITIVE_QUERY_KEYS:
clean_query.append((k, "***REDACTED***"))
else:
clean_query.append((k, v))
new_query = urlencode(clean_query, doseq=True)
redacted_url = urlunparse(parsed._replace(query=new_query))
return redacted_url
# Remove any potential userinfo (username:password@)
netloc = parsed.hostname or ""
if parsed.port:
netloc = f"{netloc}:{parsed.port}"
# Reconstruct URL without query, params, fragment, or userinfo.
safe_url = urlunparse(
(
parsed.scheme,
netloc,
parsed.path,
"", # params
"", # query
"", # fragment
)
)
return safe_url
except Exception:
return url
# If parsing fails, fall back to omitting the URL entirely.
return "<redacted-url>"
def _is_sensitive_url(url: str) -> bool:
"""Return True if URL is one of the configured OAuth endpoints."""
@@ -151,9 +164,15 @@ async def async_request(
except httpx.RequestError as exc:
last_exc = exc
if attempt >= retries:
# Do not log the full URL here to avoid leaking sensitive data.
if not _is_sensitive_url(url):
log_url = _redact_sensitive_url_params(url)
logger.warning(f"async_request exhausted retries for {method}")
raise
delay = _get_delay(backoff_factor, attempt)
if not _is_sensitive_url(url):
log_url = _redact_sensitive_url_params(url)
logger.warning(
f"async_request exhausted retries for {method}; last error: {exc}"
f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s"
)
raise
delay = _get_delay(backoff_factor, attempt)
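
Putting the hunk above together, a minimal self-contained sketch of the retry loop pattern: exponential backoff between attempts, a warning per failed attempt, and a final re-raise once retries are exhausted; the URL is deliberately kept out of the log messages, matching the redaction intent earlier in the file. _get_delay's exact formula is not shown, so the backoff below is an assumption:

import asyncio
import logging

import httpx

logger = logging.getLogger(__name__)

async def request_with_retries(
    method: str,
    url: str,
    retries: int = 3,
    backoff_factor: float = 0.5,
) -> httpx.Response:
    async with httpx.AsyncClient() as client:
        for attempt in range(retries + 1):
            try:
                return await client.request(method, url)
            except httpx.RequestError:
                if attempt >= retries:
                    logger.warning(f"request exhausted retries for {method}")
                    raise
                delay = backoff_factor * (2 ** attempt)  # assumed exponential backoff
                logger.warning(
                    f"attempt {attempt + 1}/{retries + 1} failed for {method}; "
                    f"retrying in {delay:.2f}s"
                )
                await asyncio.sleep(delay)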