Files
ragflow/common/data_source/dropbox_connector.py
Yongteng Lei d1744aaaf3 Feat: add datasource Dropbox (#11488)
### What problem does this PR solve?

Add datasource Dropbox.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-11-25 09:40:03 +08:00

170 lines
6.8 KiB
Python

"""Dropbox connector"""
import logging
from datetime import timezone
from typing import Any
from dropbox import Dropbox
from dropbox.exceptions import ApiError, AuthError
from dropbox.files import FileMetadata, FolderMetadata
from common.data_source.config import INDEX_BATCH_SIZE, DocumentSource
from common.data_source.exceptions import (
ConnectorMissingCredentialError,
ConnectorValidationError,
InsufficientPermissionsError,
)
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
from common.data_source.models import Document, GenerateDocumentsOutput
from common.data_source.utils import get_file_ext
logger = logging.getLogger(__name__)
class DropboxConnector(LoadConnector, PollConnector):
"""Dropbox connector for accessing Dropbox files and folders"""
def __init__(self, batch_size: int = INDEX_BATCH_SIZE) -> None:
self.batch_size = batch_size
self.dropbox_client: Dropbox | None = None
def load_credentials(self, credentials: dict[str, Any]) -> dict[str, Any] | None:
"""Load Dropbox credentials"""
access_token = credentials.get("dropbox_access_token")
if not access_token:
raise ConnectorMissingCredentialError("Dropbox access token is required")
self.dropbox_client = Dropbox(access_token)
return None
def validate_connector_settings(self) -> None:
"""Validate Dropbox connector settings"""
if self.dropbox_client is None:
raise ConnectorMissingCredentialError("Dropbox")
try:
self.dropbox_client.files_list_folder(path="", limit=1)
except AuthError as e:
logger.exception("[Dropbox]: Failed to validate Dropbox credentials")
raise ConnectorValidationError(f"Dropbox credential is invalid: {e}")
except ApiError as e:
if e.error is not None and "insufficient_permissions" in str(e.error).lower():
raise InsufficientPermissionsError("Your Dropbox token does not have sufficient permissions.")
raise ConnectorValidationError(f"Unexpected Dropbox error during validation: {e.user_message_text or e}")
except Exception as e:
raise ConnectorValidationError(f"Unexpected error during Dropbox settings validation: {e}")
def _download_file(self, path: str) -> bytes:
"""Download a single file from Dropbox."""
if self.dropbox_client is None:
raise ConnectorMissingCredentialError("Dropbox")
_, resp = self.dropbox_client.files_download(path)
return resp.content
def _get_shared_link(self, path: str) -> str:
"""Create a shared link for a file in Dropbox."""
if self.dropbox_client is None:
raise ConnectorMissingCredentialError("Dropbox")
try:
shared_links = self.dropbox_client.sharing_list_shared_links(path=path)
if shared_links.links:
return shared_links.links[0].url
link_metadata = self.dropbox_client.sharing_create_shared_link_with_settings(path)
return link_metadata.url
except ApiError as err:
logger.exception(f"[Dropbox]: Failed to create a shared link for {path}: {err}")
return ""
def _yield_files_recursive(
self,
path: str,
start: SecondsSinceUnixEpoch | None,
end: SecondsSinceUnixEpoch | None,
) -> GenerateDocumentsOutput:
"""Yield files in batches from a specified Dropbox folder, including subfolders."""
if self.dropbox_client is None:
raise ConnectorMissingCredentialError("Dropbox")
result = self.dropbox_client.files_list_folder(
path,
limit=self.batch_size,
recursive=False,
include_non_downloadable_files=False,
)
while True:
batch: list[Document] = []
for entry in result.entries:
if isinstance(entry, FileMetadata):
modified_time = entry.client_modified
if modified_time.tzinfo is None:
modified_time = modified_time.replace(tzinfo=timezone.utc)
else:
modified_time = modified_time.astimezone(timezone.utc)
time_as_seconds = modified_time.timestamp()
if start is not None and time_as_seconds <= start:
continue
if end is not None and time_as_seconds > end:
continue
try:
downloaded_file = self._download_file(entry.path_display)
except Exception:
logger.exception(f"[Dropbox]: Error downloading file {entry.path_display}")
continue
batch.append(
Document(
id=f"dropbox:{entry.id}",
blob=downloaded_file,
source=DocumentSource.DROPBOX,
semantic_identifier=entry.name,
extension=get_file_ext(entry.name),
doc_updated_at=modified_time,
size_bytes=entry.size if getattr(entry, "size", None) is not None else len(downloaded_file),
)
)
elif isinstance(entry, FolderMetadata):
yield from self._yield_files_recursive(entry.path_lower, start, end)
if batch:
yield batch
if not result.has_more:
break
result = self.dropbox_client.files_list_folder_continue(result.cursor)
def poll_source(self, start: SecondsSinceUnixEpoch, end: SecondsSinceUnixEpoch) -> GenerateDocumentsOutput:
"""Poll Dropbox for recent file changes"""
if self.dropbox_client is None:
raise ConnectorMissingCredentialError("Dropbox")
for batch in self._yield_files_recursive("", start, end):
yield batch
def load_from_state(self) -> GenerateDocumentsOutput:
"""Load files from Dropbox state"""
return self._yield_files_recursive("", None, None)
if __name__ == "__main__":
import os
logging.basicConfig(level=logging.DEBUG)
connector = DropboxConnector()
connector.load_credentials({"dropbox_access_token": os.environ.get("DROPBOX_ACCESS_TOKEN")})
connector.validate_connector_settings()
document_batches = connector.load_from_state()
try:
first_batch = next(document_batches)
print(f"Loaded {len(first_batch)} documents in first batch.")
for doc in first_batch:
print(f"- {doc.semantic_identifier} ({doc.size_bytes} bytes)")
except StopIteration:
print("No documents available in Dropbox.")