Feat: Support multiple data sources synchronizations (#10954)

### What problem does this PR solve? #10953 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-30 23:26:36 +08:00 · 2025-11-03 19:59:18 +08:00
parent 9a486e0f51
commit 3e5a39482e
33 changed files with 11444 additions and 3645 deletions
--- a/common/data_source/config.py
+++ b/common/data_source/config.py
@ -0,0 +1,252 @@
+"""Configuration constants and enum definitions"""
+import json
+import os
+from datetime import datetime, timezone
+from enum import Enum
+from typing import cast
+
+
+def get_current_tz_offset() -> int:
+    """Return the local timezone's offset from UTC, rounded to whole hours."""
+    # Read the naive local wall clock first, then the UTC clock with its
+    # tzinfo stripped, so both operands are naive and can be subtracted.
+    local_naive = datetime.now()
+    utc_naive = datetime.now(timezone.utc).replace(tzinfo=None)
+    offset_seconds = (local_naive - utc_naive).total_seconds()
+    # round() also absorbs the tiny gap between the two clock reads.
+    return round(offset_seconds / 3600)
+
+
+# Durations; the values match the number of seconds per hour / per day.
+ONE_HOUR = 3600
+ONE_DAY = ONE_HOUR * 24
+
+# Slack API limits
+# NOTE: _SLACK_LIMIT is assigned again, with the same value, in the
+# "Slack related constants" section further down this module.
+_SLACK_LIMIT = 900
+
+# Redis lock configuration
+# NOTE(review): units are not stated here; presumably seconds -- confirm.
+ONYX_SLACK_LOCK_TTL = 1800
+ONYX_SLACK_LOCK_BLOCKING_TIMEOUT = 60
+ONYX_SLACK_LOCK_TOTAL_BLOCKING_TIMEOUT = 3600
+
+
+class BlobType(str, Enum):
+    """Supported storage types.
+
+    Mixes in ``str``, so each member compares equal to its string value.
+    """
+    S3 = "s3"
+    R2 = "r2"
+    GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
+    OCI_STORAGE = "oci_storage"
+
+
+class DocumentSource(str, Enum):
+    """Document sources.
+
+    Mixes in ``str``, so each member compares equal to its string value.
+    The S3, R2, GOOGLE_CLOUD_STORAGE and OCI_STORAGE values are identical to
+    the values of the same-named ``BlobType`` members.
+    """
+    S3 = "s3"
+    NOTION = "notion"
+    R2 = "r2"
+    GOOGLE_CLOUD_STORAGE = "google_cloud_storage"
+    OCI_STORAGE = "oci_storage"
+    SLACK = "slack"
+    CONFLUENCE = "confluence"
+
+
+class FileOrigin(str, Enum):
+    """File origins.
+
+    Mixes in ``str``; ``CONNECTOR`` is the only origin defined here.
+    """
+    CONNECTOR = "connector"
+
+
+# Standard image MIME types supported by most vision LLMs
+# NOTE: both "image/jpeg" and "image/jpg" are listed.
+IMAGE_MIME_TYPES = [
+    "image/png",
+    "image/jpeg",
+    "image/jpg",
+    "image/webp",
+]
+
+# Image types that should be excluded from processing
+# None of these entries overlap with IMAGE_MIME_TYPES.
+EXCLUDED_IMAGE_TYPES = [
+    "image/bmp",
+    "image/tiff",
+    "image/gif",
+    "image/svg+xml",
+    "image/avif",
+]
+
+
+# NOTE(review): presumably the fields requested through the Confluence REST
+# API `expand` parameter when fetching pages -- confirm against the connector.
+# "body.storage.value" is the same string as _PROBLEMATIC_EXPANSIONS, which is
+# defined later in this module.
+_PAGE_EXPANSION_FIELDS = [
+    "body.storage.value",
+    "version",
+    "space",
+    "metadata.labels",
+    "history.lastUpdated",
+]
+
+
+# Configuration constants
+BLOB_STORAGE_SIZE_THRESHOLD = 20 * 1024 * 1024  # 20MB
+INDEX_BATCH_SIZE = 2
+SLACK_NUM_THREADS = 4
+ENABLE_EXPENSIVE_EXPERT_CALLS = False
+
+# Slack related constants
+# NOTE: this re-assigns _SLACK_LIMIT, which is already set to the same value
+# in the "Slack API limits" section earlier in this module.
+_SLACK_LIMIT = 900
+FAST_TIMEOUT = 1
+MAX_RETRIES = 7
+MAX_CHANNELS_TO_LOG = 50
+BOT_CHANNEL_MIN_BATCH_SIZE = 256
+BOT_CHANNEL_PERCENTAGE_THRESHOLD = 0.95
+
+# Download configuration
+DOWNLOAD_CHUNK_SIZE = 1024 * 1024  # 1MB
+SIZE_THRESHOLD_BUFFER = 64
+
+# True only when the environment variable equals "true" (case-insensitive);
+# unset or any other value yields False.
+NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP = (
+    os.environ.get("NOTION_CONNECTOR_DISABLE_RECURSIVE_PAGE_LOOKUP", "").lower()
+    == "true"
+)
+
+# Key for the OAuth token
+# NOTE(review): going by the constant names, these three are presumably keys
+# into a stored credentials dict -- confirm against the callers.
+DB_CREDENTIALS_DICT_TOKEN_KEY = "google_tokens"
+# Key for the service account key
+DB_CREDENTIALS_DICT_SERVICE_ACCOUNT_KEY = "google_service_account_key"
+# Key for the email saved for both auth types
+DB_CREDENTIALS_PRIMARY_ADMIN_KEY = "google_primary_admin"
+
+# NOTE(review): looks like a Google API `fields` selector (paging token plus
+# each user's primary email) -- confirm against the caller.
+USER_FIELDS = "nextPageToken, users(primaryEmail)"
+
+# Error message substrings
+MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requested"
+
+# Message text about Google Auth scopes that were not updated after an upgrade.
+SCOPE_INSTRUCTIONS = (
+    "You have upgraded RAGFlow without updating the Google Auth scopes. "
+)
+
+SLIM_BATCH_SIZE = 100
+
+# Notion API constants
+_NOTION_PAGE_SIZE = 100
+_NOTION_CALL_TIMEOUT = 30  # 30 seconds
+
+_ITERATION_LIMIT = 100_000
+
+#####
+# Indexing Configs
+#####
+# NOTE: At present only the Confluence and Google Drive connectors support
+# this flag, and only for some failures (Confluence: API call failures;
+# Google Drive: failures while pulling or parsing files).
+# Enabled by any value other than "false" or the empty string
+# (compared case-insensitively); unset therefore means disabled.
+CONTINUE_ON_CONNECTOR_FAILURE = (
+    os.environ.get("CONTINUE_ON_CONNECTOR_FAILURE", "").lower()
+    not in ("", "false")
+)
+
+
+#####
+# Confluence Connector Configs
+#####
+
+# Comma-separated labels taken from the environment; empty entries are dropped.
+CONFLUENCE_CONNECTOR_LABELS_TO_SKIP = list(
+    filter(
+        None,
+        os.environ.get("CONFLUENCE_CONNECTOR_LABELS_TO_SKIP", "").split(","),
+    )
+)
+
+# Whether archived pages are indexed; off unless the variable is "true"
+# (compared case-insensitively).
+CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES = (
+    os.environ.get("CONFLUENCE_CONNECTOR_INDEX_ARCHIVED_PAGES", "").lower()
+    == "true"
+)
+
+# Attachments larger than this many bytes will not be retrieved (default 10MB).
+CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD = int(
+    os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_SIZE_THRESHOLD", 10 * 1024**2)
+)
+# Attachments with more characters than this will not be indexed, so that
+# extremely large files cannot freeze indexing. 200,000 is ~100 google doc pages.
+CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD = int(
+    os.environ.get("CONFLUENCE_CONNECTOR_ATTACHMENT_CHAR_COUNT_THRESHOLD", 200000)
+)
+
+# Raw JSON text from the environment; an empty string means "no override".
+_RAW_CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE = os.environ.get(
+    "CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE", ""
+)
+# Parsed override, or None when the variable is unset or empty. cast() only
+# informs type checkers; the decoded JSON shape is not validated at runtime.
+CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE = cast(
+    list[dict[str, str]] | None,
+    (
+        None
+        if not _RAW_CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE
+        else json.loads(_RAW_CONFLUENCE_CONNECTOR_USER_PROFILES_OVERRIDE)
+    ),
+)
+
+# Enter as a floating point offset from UTC in hours (-24 < val < 24).
+# This will be applied globally, so it probably makes sense to transition this
+# to per connector at some point.
+# For the default value, we assume that the user's local timezone is more likely to be
+# correct (i.e. the configured user's timezone or the default server one) than UTC.
+# https://developer.atlassian.com/cloud/confluence/cql-fields/#created
+# NOTE: get_current_tz_offset() is evaluated eagerly as the default argument,
+# so it runs even when the environment variable is set.
+CONFLUENCE_TIMEZONE_OFFSET = float(
+    os.environ.get("CONFLUENCE_TIMEZONE_OFFSET", get_current_tz_offset())
+)
+
+# OAuth client IDs and secrets, read from the environment; each one falls back
+# to an empty string when its variable is unset.
+OAUTH_SLACK_CLIENT_ID = os.environ.get("OAUTH_SLACK_CLIENT_ID", "")
+OAUTH_SLACK_CLIENT_SECRET = os.environ.get("OAUTH_SLACK_CLIENT_SECRET", "")
+OAUTH_CONFLUENCE_CLOUD_CLIENT_ID = os.environ.get(
+    "OAUTH_CONFLUENCE_CLOUD_CLIENT_ID", ""
+)
+
+OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET = os.environ.get(
+    "OAUTH_CONFLUENCE_CLOUD_CLIENT_SECRET", ""
+)
+
+OAUTH_JIRA_CLOUD_CLIENT_ID = os.environ.get("OAUTH_JIRA_CLOUD_CLIENT_ID", "")
+OAUTH_JIRA_CLOUD_CLIENT_SECRET = os.environ.get("OAUTH_JIRA_CLOUD_CLIENT_SECRET", "")
+OAUTH_GOOGLE_DRIVE_CLIENT_ID = os.environ.get("OAUTH_GOOGLE_DRIVE_CLIENT_ID", "")
+OAUTH_GOOGLE_DRIVE_CLIENT_SECRET = os.environ.get(
+    "OAUTH_GOOGLE_DRIVE_CLIENT_SECRET", ""
+)
+
+CONFLUENCE_OAUTH_TOKEN_URL = "https://auth.atlassian.com/oauth/token"
+# Stored already lower-cased (the value is "rate limit exceeded").
+# NOTE(review): presumably matched against lower-cased error text -- confirm.
+RATE_LIMIT_MESSAGE_LOWERCASE = "Rate limit exceeded".lower()
+
+_DEFAULT_PAGINATION_LIMIT = 1000
+
+# NOTE(review): the names suggest "body.storage.value" is swapped for
+# "body.view.value" when the former causes trouble -- confirm against the
+# code that uses these.
+_PROBLEMATIC_EXPANSIONS = "body.storage.value"
+_REPLACEMENT_EXPANSIONS = "body.view.value"
+
+class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
+    """Strategies for transforming links found in HTML content.
+
+    Mixes in ``str``, so each member compares equal to its string value.
+    """
+    # remove links entirely
+    STRIP = "strip"
+    # turn HTML links into markdown links
+    MARKDOWN = "markdown"
+
+
+# NOTE: when the variable is unset this is the enum member STRIP; when it is
+# set, this is the raw environment string, which is not validated against
+# HtmlBasedConnectorTransformLinksStrategy here.
+HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY = os.environ.get(
+    "HTML_BASED_CONNECTOR_TRANSFORM_LINKS_STRATEGY",
+    HtmlBasedConnectorTransformLinksStrategy.STRIP,
+)
+
+# True only when the variable equals "true" (compared case-insensitively).
+PARSE_WITH_TRAFILATURA = os.environ.get("PARSE_WITH_TRAFILATURA", "").lower() == "true"
+
+# Comma-separated lists; entries are split on "," only, so surrounding
+# whitespace and empty entries in the environment value are kept as-is.
+WEB_CONNECTOR_IGNORED_CLASSES = os.environ.get(
+    "WEB_CONNECTOR_IGNORED_CLASSES", "sidebar,footer"
+).split(",")
+WEB_CONNECTOR_IGNORED_ELEMENTS = os.environ.get(
+    "WEB_CONNECTOR_IGNORED_ELEMENTS", "nav,footer,meta,script,style,symbol,aside"
+).split(",")
+
+# NOTE(review): presumably the placeholder used when a Confluence user cannot
+# be resolved -- confirm against the connector.
+_USER_NOT_FOUND = "Unknown Confluence User"
+
+# NOTE(review): the lists below look like Confluence REST API `expand` field
+# sets for comments, attachments and restrictions respectively -- confirm
+# against the code that uses them.
+_COMMENT_EXPANSION_FIELDS = ["body.storage.value"]
+
+_ATTACHMENT_EXPANSION_FIELDS = [
+    "version",
+    "space",
+    "metadata.labels",
+]
+
+# Covers read restrictions for users and groups, both on the item itself and
+# on its ancestors.
+_RESTRICTIONS_EXPANSION_FIELDS = [
+    "space",
+    "restrictions.read.restrictions.user",
+    "restrictions.read.restrictions.group",
+    "ancestors.restrictions.read.restrictions.user",
+    "ancestors.restrictions.read.restrictions.group",
+]
+
+
+_SLIM_DOC_BATCH_SIZE = 5000