Feat: add datasource Dropbox (#11488)

### What problem does this PR solve?

Add Dropbox as a data source.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-11-25 09:40:03 +08:00
committed by GitHub
parent d5f8548200
commit d1744aaaf3
9 changed files with 197 additions and 37 deletions

View File

@ -119,6 +119,7 @@ class FileSource(StrEnum):
SLACK = "slack" SLACK = "slack"
TEAMS = "teams" TEAMS = "teams"
MOODLE = "moodle" MOODLE = "moodle"
DROPBOX = "dropbox"
class PipelineTaskType(StrEnum): class PipelineTaskType(StrEnum):

View File

@ -50,6 +50,7 @@ class DocumentSource(str, Enum):
DISCORD = "discord" DISCORD = "discord"
MOODLE = "moodle" MOODLE = "moodle"
S3_COMPATIBLE = "s3_compatible" S3_COMPATIBLE = "s3_compatible"
DROPBOX = "dropbox"
class FileOrigin(str, Enum): class FileOrigin(str, Enum):

View File

@ -1,13 +1,24 @@
"""Dropbox connector""" """Dropbox connector"""
import logging
from datetime import timezone
from typing import Any from typing import Any
from dropbox import Dropbox from dropbox import Dropbox
from dropbox.exceptions import ApiError, AuthError from dropbox.exceptions import ApiError, AuthError
from dropbox.files import FileMetadata, FolderMetadata
from common.data_source.config import INDEX_BATCH_SIZE from common.data_source.config import INDEX_BATCH_SIZE, DocumentSource
from common.data_source.exceptions import ConnectorValidationError, InsufficientPermissionsError, ConnectorMissingCredentialError from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
InsufficientPermissionsError,
)
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
from common.data_source.models import Document, GenerateDocumentsOutput
from common.data_source.utils import get_file_ext
logger = logging.getLogger(__name__)
class DropboxConnector(LoadConnector, PollConnector): class DropboxConnector(LoadConnector, PollConnector):
@ -19,29 +30,29 @@ class DropboxConnector(LoadConnector, PollConnector):
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
    """Create the Dropbox client from the supplied credentials.

    Args:
        credentials: Mapping that must carry a non-empty
            ``"dropbox_access_token"`` entry.

    Returns:
        Always ``None``; the client is stored on ``self.dropbox_client``.

    Raises:
        ConnectorMissingCredentialError: If the access token is missing or empty.
    """
    token = credentials.get("dropbox_access_token")
    if token:
        self.dropbox_client = Dropbox(token)
        return None

    raise ConnectorMissingCredentialError("Dropbox access token is required")
def validate_connector_settings(self) -> None:
    """Check that the configured Dropbox client can list the root folder.

    Issues a single ``files_list_folder`` call limited to one entry and
    translates any failure into a connector-level exception. The original
    exception is chained as the cause so the Dropbox error stays visible in
    tracebacks.

    Raises:
        ConnectorMissingCredentialError: If no Dropbox client has been set.
        InsufficientPermissionsError: If the API error mentions
            ``insufficient_permissions``.
        ConnectorValidationError: For an authentication error, any other API
            error, or any unexpected exception.
    """
    if self.dropbox_client is None:
        raise ConnectorMissingCredentialError("Dropbox")

    try:
        self.dropbox_client.files_list_folder(path="", limit=1)
    except AuthError as e:
        logger.exception("[Dropbox]: Failed to validate Dropbox credentials")
        raise ConnectorValidationError(f"Dropbox credential is invalid: {e}") from e
    except ApiError as e:
        if e.error is not None and "insufficient_permissions" in str(e.error).lower():
            raise InsufficientPermissionsError(
                "Your Dropbox token does not have sufficient permissions."
            ) from e
        raise ConnectorValidationError(
            f"Unexpected Dropbox error during validation: {e.user_message_text or e}"
        ) from e
    except Exception as e:
        raise ConnectorValidationError(
            f"Unexpected error during Dropbox settings validation: {e}"
        ) from e
def _download_file(self, path: str) -> bytes: def _download_file(self, path: str) -> bytes:
"""Download a single file from Dropbox.""" """Download a single file from Dropbox."""
@ -56,24 +67,103 @@ class DropboxConnector(LoadConnector, PollConnector):
raise ConnectorMissingCredentialError("Dropbox") raise ConnectorMissingCredentialError("Dropbox")
try: try:
# Try to get existing shared links first
shared_links = self.dropbox_client.sharing_list_shared_links(path=path) shared_links = self.dropbox_client.sharing_list_shared_links(path=path)
if shared_links.links: if shared_links.links:
return shared_links.links[0].url return shared_links.links[0].url
# Create a new shared link link_metadata = self.dropbox_client.sharing_create_shared_link_with_settings(path)
link_settings = self.dropbox_client.sharing_create_shared_link_with_settings(path) return link_metadata.url
return link_settings.url except ApiError as err:
except Exception: logger.exception(f"[Dropbox]: Failed to create a shared link for {path}: {err}")
# Fallback to basic link format return ""
return f"https://www.dropbox.com/home{path}"
def _yield_files_recursive(
    self,
    path: str,
    start: SecondsSinceUnixEpoch | None,
    end: SecondsSinceUnixEpoch | None,
) -> GenerateDocumentsOutput:
    """Yield batches of documents for the files under a Dropbox folder.

    The folder is listed page by page (``self.batch_size`` entries per
    request). Each file whose modification time lies in ``(start, end]`` is
    downloaded and wrapped in a ``Document``. Sub-folders are walked
    recursively; their batches are yielded as soon as the folder entry is
    reached, before the batch of the page currently being processed.

    Args:
        path: Dropbox folder path to list; ``""`` is the root.
        start: Exclusive lower bound in epoch seconds, or ``None`` for no
            lower bound.
        end: Inclusive upper bound in epoch seconds, or ``None`` for no
            upper bound.

    Yields:
        Non-empty lists of ``Document`` objects.

    Raises:
        ConnectorMissingCredentialError: If no Dropbox client has been set.
    """
    if self.dropbox_client is None:
        raise ConnectorMissingCredentialError("Dropbox")

    result = self.dropbox_client.files_list_folder(
        path,
        limit=self.batch_size,
        recursive=False,
        include_non_downloadable_files=False,
    )

    while True:
        batch: list[Document] = []

        for entry in result.entries:
            if isinstance(entry, FolderMetadata):
                yield from self._yield_files_recursive(entry.path_lower, start, end)
                continue
            if not isinstance(entry, FileMetadata):
                continue

            # Use the server-side timestamp for change detection:
            # client_modified is whatever the uploading client reported and
            # is not verified by Dropbox, so a freshly uploaded file can
            # carry an arbitrarily old value and fall outside the window.
            # Fall back to client_modified only if server_modified is absent.
            modified_time = getattr(entry, "server_modified", None) or entry.client_modified
            if modified_time.tzinfo is None:
                # Naive timestamps are treated as UTC.
                modified_time = modified_time.replace(tzinfo=timezone.utc)
            else:
                modified_time = modified_time.astimezone(timezone.utc)

            time_as_seconds = modified_time.timestamp()
            if start is not None and time_as_seconds <= start:
                continue
            if end is not None and time_as_seconds > end:
                continue

            try:
                downloaded_file = self._download_file(entry.path_display)
            except Exception:
                # Best effort: one failed download must not abort the sync.
                logger.exception("[Dropbox]: Error downloading file %s", entry.path_display)
                continue

            reported_size = getattr(entry, "size", None)
            batch.append(
                Document(
                    id=f"dropbox:{entry.id}",
                    blob=downloaded_file,
                    source=DocumentSource.DROPBOX,
                    semantic_identifier=entry.name,
                    extension=get_file_ext(entry.name),
                    doc_updated_at=modified_time,
                    size_bytes=reported_size if reported_size is not None else len(downloaded_file),
                )
            )

        if batch:
            yield batch

        if not result.has_more:
            break

        result = self.dropbox_client.files_list_folder_continue(result.cursor)
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
    """Yield document batches for files modified within ``start``..``end``.

    Walks the whole Dropbox tree from the root and forwards every batch
    produced by ``_yield_files_recursive``. Because this is a generator, the
    credential check runs on first iteration, not at call time.

    Raises:
        ConnectorMissingCredentialError: If no Dropbox client has been set.
    """
    if self.dropbox_client is None:
        raise ConnectorMissingCredentialError("Dropbox")

    yield from self._yield_files_recursive("", start, end)
def load_from_state(self) -> GenerateDocumentsOutput:
    """Return a generator over every file in Dropbox, with no time filter."""
    all_documents = self._yield_files_recursive(path="", start=None, end=None)
    return all_documents
if __name__ == "__main__":
    # Manual smoke test: connect with the token from DROPBOX_ACCESS_TOKEN,
    # validate it, and print a summary of the first batch of documents.
    import os

    logging.basicConfig(level=logging.DEBUG)

    access_token = os.environ.get("DROPBOX_ACCESS_TOKEN")
    connector = DropboxConnector()
    connector.load_credentials({"dropbox_access_token": access_token})
    connector.validate_connector_settings()

    # The connector only yields non-empty batches, so None can safely mark
    # an exhausted generator.
    first_batch = next(connector.load_from_state(), None)
    if first_batch is None:
        print("No documents available in Dropbox.")
    else:
        print(f"Loaded {len(first_batch)} documents in first batch.")
        for doc in first_batch:
            print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")

View File

@ -37,7 +37,7 @@ from api.db.services.connector_service import ConnectorService, SyncLogsService
from api.db.services.knowledgebase_service import KnowledgebaseService from api.db.services.knowledgebase_service import KnowledgebaseService
from common import settings from common import settings
from common.config_utils import show_configs from common.config_utils import show_configs
from common.data_source import BlobStorageConnector, NotionConnector, DiscordConnector, GoogleDriveConnector, MoodleConnector, JiraConnector from common.data_source import BlobStorageConnector, NotionConnector, DiscordConnector, GoogleDriveConnector, MoodleConnector, JiraConnector, DropboxConnector
from common.constants import FileSource, TaskStatus from common.constants import FileSource, TaskStatus
from common.data_source.config import INDEX_BATCH_SIZE from common.data_source.config import INDEX_BATCH_SIZE
from common.data_source.confluence_connector import ConfluenceConnector from common.data_source.confluence_connector import ConfluenceConnector
@ -211,6 +211,27 @@ class Gmail(SyncBase):
pass pass
class Dropbox(SyncBase):
    """Sync task that pulls documents from Dropbox through DropboxConnector."""

    SOURCE_NAME: str = FileSource.DROPBOX

    async def _generate(self, task: dict):
        """Build the connector and return its document-batch generator.

        A full load is used when the task requests a reindex
        (``task["reindex"] == "1"``) or has no ``poll_range_start``.
        Otherwise only changes between ``poll_range_start`` and the current
        UTC time are polled.
        """
        batch_size = self.conf.get("batch_size", INDEX_BATCH_SIZE)
        self.connector = DropboxConnector(batch_size=batch_size)
        self.connector.load_credentials(self.conf["credentials"])

        full_sync = task["reindex"] == "1" or not task["poll_range_start"]
        if full_sync:
            document_generator = self.connector.load_from_state()
            begin_info = "totally"
        else:
            # poll_range_start is presumably a datetime (it must expose
            # .timestamp()) — confirm against the task producer.
            since = task["poll_range_start"]
            since_ts = since.timestamp()
            until_ts = datetime.now(timezone.utc).timestamp()
            document_generator = self.connector.poll_source(since_ts, until_ts)
            begin_info = f"from {since}"

        logging.info(f"[Dropbox] Connect to Dropbox {begin_info}")
        return document_generator
class GoogleDrive(SyncBase): class GoogleDrive(SyncBase):
SOURCE_NAME: str = FileSource.GOOGLE_DRIVE SOURCE_NAME: str = FileSource.GOOGLE_DRIVE
@ -454,7 +475,8 @@ func_factory = {
FileSource.SHAREPOINT: SharePoint, FileSource.SHAREPOINT: SharePoint,
FileSource.SLACK: Slack, FileSource.SLACK: Slack,
FileSource.TEAMS: Teams, FileSource.TEAMS: Teams,
FileSource.MOODLE: Moodle FileSource.MOODLE: Moodle,
FileSource.DROPBOX: Dropbox,
} }

View File

@ -0,0 +1 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="89.9 347.3 32 32" width="64" height="64" fill="#007ee5"><path d="M99.337 348.42L89.9 354.5l6.533 5.263 9.467-5.837m-16 11l9.437 6.2 6.563-5.505-9.467-5.868m9.467 5.868l6.594 5.505 9.406-6.14-6.503-5.233m6.503-5.203l-9.406-6.14-6.594 5.505 9.497 5.837m-9.467 7.047l-6.594 5.474-2.843-1.845v2.087l9.437 5.656 9.437-5.656v-2.087l-2.843 1.845"/></svg>

After

Width:  |  Height:  |  Size: 396 B

View File

@ -742,6 +742,10 @@ Example: https://fsn1.your-objectstorage.com`,
'Comma-separated emails whose "My Drive" contents should be indexed (include the primary admin).', 'Comma-separated emails whose "My Drive" contents should be indexed (include the primary admin).',
google_driveSharedFoldersTip: google_driveSharedFoldersTip:
'Comma-separated Google Drive folder links to crawl.', 'Comma-separated Google Drive folder links to crawl.',
dropboxDescription:
'Connect your Dropbox to sync files and folders from a chosen account.',
dropboxAccessTokenTip:
'Generate a long-lived access token in the Dropbox App Console with files.metadata.read, files.content.read, and sharing.read scopes.',
moodleDescription: moodleDescription:
'Connect to your Moodle LMS to sync course content, forums, and resources.', 'Connect to your Moodle LMS to sync course content, forums, and resources.',
moodleUrlTip: moodleUrlTip:

View File

@ -722,6 +722,9 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
'需要索引其 “我的云端硬盘” 的邮箱,多个邮箱用逗号分隔(建议包含管理员)。', '需要索引其 “我的云端硬盘” 的邮箱,多个邮箱用逗号分隔(建议包含管理员)。',
google_driveSharedFoldersTip: google_driveSharedFoldersTip:
'需要同步的 Google Drive 文件夹链接,多个链接用逗号分隔。', '需要同步的 Google Drive 文件夹链接,多个链接用逗号分隔。',
dropboxDescription: '连接 Dropbox同步指定账号下的文件与文件夹。',
dropboxAccessTokenTip:
'请在 Dropbox App Console 生成 Access Token并勾选 files.metadata.read、files.content.read、sharing.read 等必要权限。',
jiraDescription: '接入 Jira 工作区持续同步Issues、评论与附件。', jiraDescription: '接入 Jira 工作区持续同步Issues、评论与附件。',
jiraBaseUrlTip: jiraBaseUrlTip:
'Jira 的 Base URL例如https://your-domain.atlassian.net。', 'Jira 的 Base URL例如https://your-domain.atlassian.net。',

View File

@ -12,6 +12,7 @@ export enum DataSourceKey {
MOODLE = 'moodle', MOODLE = 'moodle',
// GMAIL = 'gmail', // GMAIL = 'gmail',
JIRA = 'jira', JIRA = 'jira',
DROPBOX = 'dropbox',
// SHAREPOINT = 'sharepoint', // SHAREPOINT = 'sharepoint',
// SLACK = 'slack', // SLACK = 'slack',
// TEAMS = 'teams', // TEAMS = 'teams',
@ -53,6 +54,11 @@ export const DataSourceInfo = {
description: t(`setting.${DataSourceKey.JIRA}Description`), description: t(`setting.${DataSourceKey.JIRA}Description`),
icon: <SvgIcon name={'data-source/jira'} width={38} />, icon: <SvgIcon name={'data-source/jira'} width={38} />,
}, },
[DataSourceKey.DROPBOX]: {
name: 'Dropbox',
description: t(`setting.${DataSourceKey.DROPBOX}Description`),
icon: <SvgIcon name={'data-source/dropbox'} width={38} />,
},
}; };
export const DataSourceFormBaseFields = [ export const DataSourceFormBaseFields = [
@ -408,6 +414,22 @@ export const DataSourceFormFields = {
tooltip: t('setting.jiraPasswordTip'), tooltip: t('setting.jiraPasswordTip'),
}, },
], ],
[DataSourceKey.DROPBOX]: [
{
label: 'Access Token',
name: 'config.credentials.dropbox_access_token',
type: FormFieldType.Password,
required: true,
tooltip: t('setting.dropboxAccessTokenTip'),
},
{
label: 'Batch Size',
name: 'config.batch_size',
type: FormFieldType.Number,
required: false,
placeholder: 'Defaults to 2',
},
],
}; };
export const DataSourceFormDefaultValues = { export const DataSourceFormDefaultValues = {
@ -508,4 +530,14 @@ export const DataSourceFormDefaultValues = {
}, },
}, },
}, },
[DataSourceKey.DROPBOX]: {
name: '',
source: DataSourceKey.DROPBOX,
config: {
batch_size: 2,
credentials: {
dropbox_access_token: '',
},
},
},
}; };

View File

@ -56,6 +56,12 @@ const dataSourceTemplates = [
description: DataSourceInfo[DataSourceKey.JIRA].description, description: DataSourceInfo[DataSourceKey.JIRA].description,
icon: DataSourceInfo[DataSourceKey.JIRA].icon, icon: DataSourceInfo[DataSourceKey.JIRA].icon,
}, },
{
id: DataSourceKey.DROPBOX,
name: DataSourceInfo[DataSourceKey.DROPBOX].name,
description: DataSourceInfo[DataSourceKey.DROPBOX].description,
icon: DataSourceInfo[DataSourceKey.DROPBOX].icon,
},
]; ];
const DataSource = () => { const DataSource = () => {
const { t } = useTranslation(); const { t } = useTranslation();