mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: add datasource Dropbox (#11488)
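### What problem does this PR solve?

Adds Dropbox as a data source: a backend connector that recursively lists and downloads files from a Dropbox account, a sync task that registers it with the data-source sync service, and the frontend form, icon, and translations needed to configure it.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)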
This commit is contained in:
@ -119,6 +119,7 @@ class FileSource(StrEnum):
|
|||||||
SLACK = "slack"
|
SLACK = "slack"
|
||||||
TEAMS = "teams"
|
TEAMS = "teams"
|
||||||
MOODLE = "moodle"
|
MOODLE = "moodle"
|
||||||
|
DROPBOX = "dropbox"
|
||||||
|
|
||||||
|
|
||||||
class PipelineTaskType(StrEnum):
|
class PipelineTaskType(StrEnum):
|
||||||
|
|||||||
@ -50,6 +50,7 @@ class DocumentSource(str, Enum):
|
|||||||
DISCORD = "discord"
|
DISCORD = "discord"
|
||||||
MOODLE = "moodle"
|
MOODLE = "moodle"
|
||||||
S3_COMPATIBLE = "s3_compatible"
|
S3_COMPATIBLE = "s3_compatible"
|
||||||
|
DROPBOX = "dropbox"
|
||||||
|
|
||||||
|
|
||||||
class FileOrigin(str, Enum):
|
class FileOrigin(str, Enum):
|
||||||
|
|||||||
@ -1,13 +1,24 @@
|
|||||||
"""Dropbox connector"""
|
"""Dropbox connector"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import timezone
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
from dropbox import Dropbox
|
from dropbox import Dropbox
|
||||||
from dropbox.exceptions import ApiError, AuthError
|
from dropbox.exceptions import ApiError, AuthError
|
||||||
|
from dropbox.files import FileMetadata, FolderMetadata
|
||||||
|
|
||||||
from common.data_source.config import INDEX_BATCH_SIZE
|
from common.data_source.config import INDEX_BATCH_SIZE, DocumentSource
|
||||||
from common.data_source.exceptions import ConnectorValidationError, InsufficientPermissionsError, ConnectorMissingCredentialError
|
from common.data_source.exceptions import (
|
||||||
|
ConnectorMissingCredentialError,
|
||||||
|
ConnectorValidationError,
|
||||||
|
InsufficientPermissionsError,
|
||||||
|
)
|
||||||
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
|
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
|
||||||
|
from common.data_source.models import Document, GenerateDocumentsOutput
|
||||||
|
from common.data_source.utils import get_file_ext
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class DropboxConnector(LoadConnector, PollConnector):
|
class DropboxConnector(LoadConnector, PollConnector):
|
||||||
@ -19,29 +30,29 @@ class DropboxConnector(LoadConnector, PollConnector):
|
|||||||
|
|
||||||
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
|
||||||
"""Load Dropbox credentials"""
|
"""Load Dropbox credentials"""
|
||||||
try:
|
access_token = credentials.get("dropbox_access_token")
|
||||||
access_token = credentials.get("dropbox_access_token")
|
if not access_token:
|
||||||
if not access_token:
|
raise ConnectorMissingCredentialError("Dropbox access token is required")
|
||||||
raise ConnectorMissingCredentialError("Dropbox access token is required")
|
|
||||||
|
|
||||||
self.dropbox_client = Dropbox(access_token)
|
self.dropbox_client = Dropbox(access_token)
|
||||||
return None
|
return None
|
||||||
except Exception as e:
|
|
||||||
raise ConnectorMissingCredentialError(f"Dropbox: {e}")
|
|
||||||
|
|
||||||
def validate_connector_settings(self) -> None:
|
def validate_connector_settings(self) -> None:
|
||||||
"""Validate Dropbox connector settings"""
|
"""Validate Dropbox connector settings"""
|
||||||
if not self.dropbox_client:
|
if self.dropbox_client is None:
|
||||||
raise ConnectorMissingCredentialError("Dropbox")
|
raise ConnectorMissingCredentialError("Dropbox")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Test connection by getting current account info
|
self.dropbox_client.files_list_folder(path="", limit=1)
|
||||||
self.dropbox_client.users_get_current_account()
|
except AuthError as e:
|
||||||
except (AuthError, ApiError) as e:
|
logger.exception("[Dropbox]: Failed to validate Dropbox credentials")
|
||||||
if "invalid_access_token" in str(e).lower():
|
raise ConnectorValidationError(f"Dropbox credential is invalid: {e}")
|
||||||
raise InsufficientPermissionsError("Invalid Dropbox access token")
|
except ApiError as e:
|
||||||
else:
|
if e.error is not None and "insufficient_permissions" in str(e.error).lower():
|
||||||
raise ConnectorValidationError(f"Dropbox validation error: {e}")
|
raise InsufficientPermissionsError("Your Dropbox token does not have sufficient permissions.")
|
||||||
|
raise ConnectorValidationError(f"Unexpected Dropbox error during validation: {e.user_message_text or e}")
|
||||||
|
except Exception as e:
|
||||||
|
raise ConnectorValidationError(f"Unexpected error during Dropbox settings validation: {e}")
|
||||||
|
|
||||||
def _download_file(self, path: str) -> bytes:
|
def _download_file(self, path: str) -> bytes:
|
||||||
"""Download a single file from Dropbox."""
|
"""Download a single file from Dropbox."""
|
||||||
@ -56,24 +67,103 @@ class DropboxConnector(LoadConnector, PollConnector):
|
|||||||
raise ConnectorMissingCredentialError("Dropbox")
|
raise ConnectorMissingCredentialError("Dropbox")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Try to get existing shared links first
|
|
||||||
shared_links = self.dropbox_client.sharing_list_shared_links(path=path)
|
shared_links = self.dropbox_client.sharing_list_shared_links(path=path)
|
||||||
if shared_links.links:
|
if shared_links.links:
|
||||||
return shared_links.links[0].url
|
return shared_links.links[0].url
|
||||||
|
|
||||||
# Create a new shared link
|
link_metadata = self.dropbox_client.sharing_create_shared_link_with_settings(path)
|
||||||
link_settings = self.dropbox_client.sharing_create_shared_link_with_settings(path)
|
return link_metadata.url
|
||||||
return link_settings.url
|
except ApiError as err:
|
||||||
except Exception:
|
logger.exception(f"[Dropbox]: Failed to create a shared link for {path}: {err}")
|
||||||
# Fallback to basic link format
|
return ""
|
||||||
return f"https://www.dropbox.com/home{path}"
|
|
||||||
|
|
||||||
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> Any:
|
def _yield_files_recursive(
|
||||||
|
self,
|
||||||
|
path: str,
|
||||||
|
start: SecondsSinceUnixEpoch | None,
|
||||||
|
end: SecondsSinceUnixEpoch | None,
|
||||||
|
) -> GenerateDocumentsOutput:
|
||||||
|
"""Yield files in batches from a specified Dropbox folder, including subfolders."""
|
||||||
|
if self.dropbox_client is None:
|
||||||
|
raise ConnectorMissingCredentialError("Dropbox")
|
||||||
|
|
||||||
|
result = self.dropbox_client.files_list_folder(
|
||||||
|
path,
|
||||||
|
limit=self.batch_size,
|
||||||
|
recursive=False,
|
||||||
|
include_non_downloadable_files=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
batch: list[Document] = []
|
||||||
|
for entry in result.entries:
|
||||||
|
if isinstance(entry, FileMetadata):
|
||||||
|
modified_time = entry.client_modified
|
||||||
|
if modified_time.tzinfo is None:
|
||||||
|
modified_time = modified_time.replace(tzinfo=timezone.utc)
|
||||||
|
else:
|
||||||
|
modified_time = modified_time.astimezone(timezone.utc)
|
||||||
|
|
||||||
|
time_as_seconds = modified_time.timestamp()
|
||||||
|
if start is not None and time_as_seconds <= start:
|
||||||
|
continue
|
||||||
|
if end is not None and time_as_seconds > end:
|
||||||
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
|
downloaded_file = self._download_file(entry.path_display)
|
||||||
|
except Exception:
|
||||||
|
logger.exception(f"[Dropbox]: Error downloading file {entry.path_display}")
|
||||||
|
continue
|
||||||
|
|
||||||
|
batch.append(
|
||||||
|
Document(
|
||||||
|
id=f"dropbox:{entry.id}",
|
||||||
|
blob=downloaded_file,
|
||||||
|
source=DocumentSource.DROPBOX,
|
||||||
|
semantic_identifier=entry.name,
|
||||||
|
extension=get_file_ext(entry.name),
|
||||||
|
doc_updated_at=modified_time,
|
||||||
|
size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file),
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
elif isinstance(entry, FolderMetadata):
|
||||||
|
yield from self._yield_files_recursive(entry.path_lower, start, end)
|
||||||
|
|
||||||
|
if batch:
|
||||||
|
yield batch
|
||||||
|
|
||||||
|
if not result.has_more:
|
||||||
|
break
|
||||||
|
|
||||||
|
result = self.dropbox_client.files_list_folder_continue(result.cursor)
|
||||||
|
|
||||||
|
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
|
||||||
"""Poll Dropbox for recent file changes"""
|
"""Poll Dropbox for recent file changes"""
|
||||||
# Simplified implementation - in production this would handle actual polling
|
if self.dropbox_client is None:
|
||||||
return []
|
raise ConnectorMissingCredentialError("Dropbox")
|
||||||
|
|
||||||
def load_from_state(self) -> Any:
|
for batch in self._yield_files_recursive("", start, end):
|
||||||
|
yield batch
|
||||||
|
|
||||||
|
def load_from_state(self) -> GenerateDocumentsOutput:
|
||||||
"""Load files from Dropbox state"""
|
"""Load files from Dropbox state"""
|
||||||
# Simplified implementation
|
return self._yield_files_recursive("", None, None)
|
||||||
return []
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import os
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG)
|
||||||
|
connector = DropboxConnector()
|
||||||
|
connector.load_credentials({"dropbox_access_token": os.environ.get("DROPBOX_ACCESS_TOKEN")})
|
||||||
|
connector.validate_connector_settings()
|
||||||
|
document_batches = connector.load_from_state()
|
||||||
|
try:
|
||||||
|
first_batch = next(document_batches)
|
||||||
|
print(f"Loaded {len(first_batch)} documents in first batch.")
|
||||||
|
for doc in first_batch:
|
||||||
|
print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")
|
||||||
|
except StopIteration:
|
||||||
|
print("No documents available in Dropbox.")
|
||||||
|
|||||||
@ -37,7 +37,7 @@ from api.db.services.connector_service import ConnectorService, SyncLogsService
|
|||||||
from api.db.services.knowledgebase_service import KnowledgebaseService
|
from api.db.services.knowledgebase_service import KnowledgebaseService
|
||||||
from common import settings
|
from common import settings
|
||||||
from common.config_utils import show_configs
|
from common.config_utils import show_configs
|
||||||
from common.data_source import BlobStorageConnector, NotionConnector, DiscordConnector, GoogleDriveConnector, MoodleConnector, JiraConnector
|
from common.data_source import BlobStorageConnector, NotionConnector, DiscordConnector, GoogleDriveConnector, MoodleConnector, JiraConnector, DropboxConnector
|
||||||
from common.constants import FileSource, TaskStatus
|
from common.constants import FileSource, TaskStatus
|
||||||
from common.data_source.config import INDEX_BATCH_SIZE
|
from common.data_source.config import INDEX_BATCH_SIZE
|
||||||
from common.data_source.confluence_connector import ConfluenceConnector
|
from common.data_source.confluence_connector import ConfluenceConnector
|
||||||
@ -211,6 +211,27 @@ class Gmail(SyncBase):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
class Dropbox(SyncBase):
|
||||||
|
SOURCE_NAME: str = FileSource.DROPBOX
|
||||||
|
|
||||||
|
async def _generate(self, task: dict):
|
||||||
|
self.connector = DropboxConnector(batch_size=self.conf.get("batch_size", INDEX_BATCH_SIZE))
|
||||||
|
self.connector.load_credentials(self.conf["credentials"])
|
||||||
|
|
||||||
|
if task["reindex"] == "1" or not task["poll_range_start"]:
|
||||||
|
document_generator = self.connector.load_from_state()
|
||||||
|
begin_info = "totally"
|
||||||
|
else:
|
||||||
|
poll_start = task["poll_range_start"]
|
||||||
|
document_generator = self.connector.poll_source(
|
||||||
|
poll_start.timestamp(), datetime.now(timezone.utc).timestamp()
|
||||||
|
)
|
||||||
|
begin_info = f"from {poll_start}"
|
||||||
|
|
||||||
|
logging.info(f"[Dropbox] Connect to Dropbox {begin_info}")
|
||||||
|
return document_generator
|
||||||
|
|
||||||
|
|
||||||
class GoogleDrive(SyncBase):
|
class GoogleDrive(SyncBase):
|
||||||
SOURCE_NAME: str = FileSource.GOOGLE_DRIVE
|
SOURCE_NAME: str = FileSource.GOOGLE_DRIVE
|
||||||
|
|
||||||
@ -454,7 +475,8 @@ func_factory = {
|
|||||||
FileSource.SHAREPOINT: SharePoint,
|
FileSource.SHAREPOINT: SharePoint,
|
||||||
FileSource.SLACK: Slack,
|
FileSource.SLACK: Slack,
|
||||||
FileSource.TEAMS: Teams,
|
FileSource.TEAMS: Teams,
|
||||||
FileSource.MOODLE: Moodle
|
FileSource.MOODLE: Moodle,
|
||||||
|
FileSource.DROPBOX: Dropbox,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
1
web/src/assets/svg/data-source/dropbox.svg
Normal file
1
web/src/assets/svg/data-source/dropbox.svg
Normal file
@ -0,0 +1 @@
|
|||||||
|
<svg xmlns="http://www.w3.org/2000/svg" viewBox="89.9 347.3 32 32" width="64" height="64" fill="#007ee5"><path d="M99.337 348.42L89.9 354.5l6.533 5.263 9.467-5.837m-16 11l9.437 6.2 6.563-5.505-9.467-5.868m9.467 5.868l6.594 5.505 9.406-6.14-6.503-5.233m6.503-5.203l-9.406-6.14-6.594 5.505 9.497 5.837m-9.467 7.047l-6.594 5.474-2.843-1.845v2.087l9.437 5.656 9.437-5.656v-2.087l-2.843 1.845"/></svg>
|
||||||
|
After Width: | Height: | Size: 396 B |
@ -742,6 +742,10 @@ Example: https://fsn1.your-objectstorage.com`,
|
|||||||
'Comma-separated emails whose "My Drive" contents should be indexed (include the primary admin).',
|
'Comma-separated emails whose "My Drive" contents should be indexed (include the primary admin).',
|
||||||
google_driveSharedFoldersTip:
|
google_driveSharedFoldersTip:
|
||||||
'Comma-separated Google Drive folder links to crawl.',
|
'Comma-separated Google Drive folder links to crawl.',
|
||||||
|
dropboxDescription:
|
||||||
|
'Connect your Dropbox to sync files and folders from a chosen account.',
|
||||||
|
dropboxAccessTokenTip:
|
||||||
|
'Generate a long-lived access token in the Dropbox App Console with files.metadata.read, files.content.read, and sharing.read scopes.',
|
||||||
moodleDescription:
|
moodleDescription:
|
||||||
'Connect to your Moodle LMS to sync course content, forums, and resources.',
|
'Connect to your Moodle LMS to sync course content, forums, and resources.',
|
||||||
moodleUrlTip:
|
moodleUrlTip:
|
||||||
|
|||||||
@ -722,6 +722,9 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
|||||||
'需要索引其 “我的云端硬盘” 的邮箱,多个邮箱用逗号分隔(建议包含管理员)。',
|
'需要索引其 “我的云端硬盘” 的邮箱,多个邮箱用逗号分隔(建议包含管理员)。',
|
||||||
google_driveSharedFoldersTip:
|
google_driveSharedFoldersTip:
|
||||||
'需要同步的 Google Drive 文件夹链接,多个链接用逗号分隔。',
|
'需要同步的 Google Drive 文件夹链接,多个链接用逗号分隔。',
|
||||||
|
dropboxDescription: '连接 Dropbox,同步指定账号下的文件与文件夹。',
|
||||||
|
dropboxAccessTokenTip:
|
||||||
|
'请在 Dropbox App Console 生成 Access Token,并勾选 files.metadata.read、files.content.read、sharing.read 等必要权限。',
|
||||||
jiraDescription: '接入 Jira 工作区,持续同步Issues、评论与附件。',
|
jiraDescription: '接入 Jira 工作区,持续同步Issues、评论与附件。',
|
||||||
jiraBaseUrlTip:
|
jiraBaseUrlTip:
|
||||||
'Jira 的 Base URL,例如:https://your-domain.atlassian.net。',
|
'Jira 的 Base URL,例如:https://your-domain.atlassian.net。',
|
||||||
|
|||||||
@ -12,6 +12,7 @@ export enum DataSourceKey {
|
|||||||
MOODLE = 'moodle',
|
MOODLE = 'moodle',
|
||||||
// GMAIL = 'gmail',
|
// GMAIL = 'gmail',
|
||||||
JIRA = 'jira',
|
JIRA = 'jira',
|
||||||
|
DROPBOX = 'dropbox',
|
||||||
// SHAREPOINT = 'sharepoint',
|
// SHAREPOINT = 'sharepoint',
|
||||||
// SLACK = 'slack',
|
// SLACK = 'slack',
|
||||||
// TEAMS = 'teams',
|
// TEAMS = 'teams',
|
||||||
@ -53,6 +54,11 @@ export const DataSourceInfo = {
|
|||||||
description: t(`setting.${DataSourceKey.JIRA}Description`),
|
description: t(`setting.${DataSourceKey.JIRA}Description`),
|
||||||
icon: <SvgIcon name={'data-source/jira'} width={38} />,
|
icon: <SvgIcon name={'data-source/jira'} width={38} />,
|
||||||
},
|
},
|
||||||
|
[DataSourceKey.DROPBOX]: {
|
||||||
|
name: 'Dropbox',
|
||||||
|
description: t(`setting.${DataSourceKey.DROPBOX}Description`),
|
||||||
|
icon: <SvgIcon name={'data-source/dropbox'} width={38} />,
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
export const DataSourceFormBaseFields = [
|
export const DataSourceFormBaseFields = [
|
||||||
@ -408,6 +414,22 @@ export const DataSourceFormFields = {
|
|||||||
tooltip: t('setting.jiraPasswordTip'),
|
tooltip: t('setting.jiraPasswordTip'),
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
|
[DataSourceKey.DROPBOX]: [
|
||||||
|
{
|
||||||
|
label: 'Access Token',
|
||||||
|
name: 'config.credentials.dropbox_access_token',
|
||||||
|
type: FormFieldType.Password,
|
||||||
|
required: true,
|
||||||
|
tooltip: t('setting.dropboxAccessTokenTip'),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
label: 'Batch Size',
|
||||||
|
name: 'config.batch_size',
|
||||||
|
type: FormFieldType.Number,
|
||||||
|
required: false,
|
||||||
|
placeholder: 'Defaults to 2',
|
||||||
|
},
|
||||||
|
],
|
||||||
};
|
};
|
||||||
|
|
||||||
export const DataSourceFormDefaultValues = {
|
export const DataSourceFormDefaultValues = {
|
||||||
@ -508,4 +530,14 @@ export const DataSourceFormDefaultValues = {
|
|||||||
},
|
},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
[DataSourceKey.DROPBOX]: {
|
||||||
|
name: '',
|
||||||
|
source: DataSourceKey.DROPBOX,
|
||||||
|
config: {
|
||||||
|
batch_size: 2,
|
||||||
|
credentials: {
|
||||||
|
dropbox_access_token: '',
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
};
|
};
|
||||||
|
|||||||
@ -56,6 +56,12 @@ const dataSourceTemplates = [
|
|||||||
description: DataSourceInfo[DataSourceKey.JIRA].description,
|
description: DataSourceInfo[DataSourceKey.JIRA].description,
|
||||||
icon: DataSourceInfo[DataSourceKey.JIRA].icon,
|
icon: DataSourceInfo[DataSourceKey.JIRA].icon,
|
||||||
},
|
},
|
||||||
|
{
|
||||||
|
id: DataSourceKey.DROPBOX,
|
||||||
|
name: DataSourceInfo[DataSourceKey.DROPBOX].name,
|
||||||
|
description: DataSourceInfo[DataSourceKey.DROPBOX].description,
|
||||||
|
icon: DataSourceInfo[DataSourceKey.DROPBOX].icon,
|
||||||
|
},
|
||||||
];
|
];
|
||||||
const DataSource = () => {
|
const DataSource = () => {
|
||||||
const { t } = useTranslation();
|
const { t } = useTranslation();
|
||||||
|
|||||||
Reference in New Issue
Block a user