mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-19 20:16:49 +08:00
Feat: Add box connector (#11845)
### What problem does this PR solve?

Feat: Add box connector

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
162
common/data_source/box_connector.py
Normal file
162
common/data_source/box_connector.py
Normal file
@ -0,0 +1,162 @@
|
||||
"""Box connector"""
|
||||
import logging
|
||||
from datetime import datetime, timezone
|
||||
from typing import Any
|
||||
|
||||
from box_sdk_gen import BoxClient
|
||||
from common.data_source.config import DocumentSource, INDEX_BATCH_SIZE
|
||||
from common.data_source.exceptions import (
|
||||
ConnectorMissingCredentialError,
|
||||
ConnectorValidationError,
|
||||
)
|
||||
from common.data_source.interfaces import LoadConnector, PollConnector, SecondsSinceUnixEpoch
|
||||
from common.data_source.models import Document, GenerateDocumentsOutput
|
||||
from common.data_source.utils import get_file_ext
|
||||
|
||||
class BoxConnector(LoadConnector, PollConnector):
    """Connector that pulls files out of a Box folder tree.

    The connector walks ``folder_id`` recursively, downloads every file it
    finds and yields them as batches of ``Document`` objects.  Call
    ``load_credentials`` before any other method; until then ``box_client``
    is ``None`` and the other methods raise ``ConnectorMissingCredentialError``.
    """

    def __init__(self, folder_id: str, batch_size: int = INDEX_BATCH_SIZE, use_marker: bool = True) -> None:
        """Store the connector settings.

        Args:
            folder_id: Box folder to start from; an empty value falls back to "0".
            batch_size: Page size passed as ``limit`` to the folder listing call.
            use_marker: Forwarded as ``usemarker`` on the first listing request.
        """
        self.batch_size = batch_size
        self.folder_id = folder_id or "0"
        self.use_marker = use_marker
        # Must exist before load_credentials() runs, otherwise the
        # ``self.box_client is None`` guards below raise AttributeError
        # instead of the intended ConnectorMissingCredentialError.
        self.box_client: BoxClient | None = None

    def load_credentials(self, auth: Any):
        """Create the Box client from an already-built auth object.

        Args:
            auth: Passed straight through to ``BoxClient(auth=...)``.

        Returns:
            Always ``None``.
        """
        self.box_client = BoxClient(auth=auth)
        return None

    def validate_connector_settings(self):
        """Check that the stored credentials can reach Box.

        Raises:
            ConnectorMissingCredentialError: credentials were never loaded.
            ConnectorValidationError: the "current user" request failed.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        try:
            self.box_client.users.get_user_me()
        except Exception as e:
            logging.exception("[Box]: Failed to validate Box credentials")
            # Chain the cause so the original traceback is not lost.
            raise ConnectorValidationError(f"Unexpected error during Box settings validation: {e}") from e

    def _yield_files_recursive(
        self,
        folder_id,
        start: SecondsSinceUnixEpoch | None,
        end: SecondsSinceUnixEpoch | None
    ) -> GenerateDocumentsOutput:
        """Yield batches of documents for ``folder_id`` and all its sub-folders.

        A file is kept when its timestamp ``t`` satisfies ``start < t <= end``
        (either bound may be ``None`` to disable it).  One batch is yielded per
        listing page; batches of a sub-folder are yielded as soon as that
        sub-folder is encountered, i.e. before the current page's own batch.

        Args:
            folder_id: Box folder whose items are listed.
            start: Exclusive lower bound in epoch seconds, or ``None``.
            end: Inclusive upper bound in epoch seconds, or ``None``.

        Raises:
            ConnectorMissingCredentialError: credentials were never loaded.
        """
        if self.box_client is None:
            raise ConnectorMissingCredentialError("Box")

        result = self.box_client.folders.get_folder_items(
            folder_id=folder_id,
            limit=self.batch_size,
            usemarker=self.use_marker,
        )

        while True:
            batch: list[Document] = []
            # ``entries`` may be absent on an empty page; treat that as no items.
            for entry in result.entries or []:
                if entry.type == "folder":
                    yield from self._yield_files_recursive(folder_id=entry.id, start=start, end=end)
                    continue
                if entry.type != "file":
                    continue

                file = self.box_client.files.get_file_by_id(entry.id)

                # Prefer the modification time: the value feeds
                # ``doc_updated_at`` and the poll window, so filtering on the
                # creation time alone would never pick up edited files.
                raw_time = (
                    getattr(file, "modified_at", None)
                    or getattr(file, "content_modified_at", None)
                    or getattr(file, "created_at", None)
                    or getattr(file, "content_created_at", None)
                )

                if raw_time:
                    modified_time = self._box_datetime_to_epoch_seconds(raw_time)
                    if start is not None and modified_time <= start:
                        continue
                    if end is not None and modified_time > end:
                        continue
                else:
                    # No usable timestamp: the window cannot be evaluated, so
                    # keep the file and stamp it with the current time rather
                    # than reusing the previous file's value (or crashing on
                    # an unbound variable for the first file).
                    modified_time = self._box_datetime_to_epoch_seconds(datetime.now(timezone.utc))

                content_bytes = self.box_client.downloads.download_file(file.id)

                batch.append(
                    Document(
                        id=f"box:{file.id}",
                        blob=content_bytes.read(),
                        source=DocumentSource.BOX,
                        semantic_identifier=file.name,
                        extension=get_file_ext(file.name),
                        doc_updated_at=modified_time,
                        size_bytes=file.size,
                        metadata=file.metadata,
                    )
                )

            if batch:
                yield batch

            # NOTE(review): follow-up pages always use marker pagination; if
            # the first request ran with use_marker=False the response may not
            # carry a next_marker, ending the walk after one page - confirm.
            if not result.next_marker:
                break

            result = self.box_client.folders.get_folder_items(
                folder_id=folder_id,
                limit=self.batch_size,
                marker=result.next_marker,
                usemarker=True,
            )

    def _box_datetime_to_epoch_seconds(self, dt: datetime) -> SecondsSinceUnixEpoch:
        """Convert a Box SDK datetime to Unix epoch seconds (UTC).

        Naive datetimes are interpreted as UTC; aware ones are converted to
        UTC.  Sub-second precision is truncated.

        Raises:
            TypeError: ``dt`` is not a ``datetime``; callers must filter those out.
        """
        if not isinstance(dt, datetime):
            raise TypeError(f"box_datetime_to_epoch_seconds expects datetime, got {type(dt)}")

        if dt.tzinfo is None:
            dt = dt.replace(tzinfo=timezone.utc)
        else:
            dt = dt.astimezone(timezone.utc)

        return SecondsSinceUnixEpoch(int(dt.timestamp()))

    def poll_source(self, start, end):
        """Yield documents under the configured folder whose timestamp lies in ``(start, end]``."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=start, end=end)

    def load_from_state(self):
        """Yield every document under the configured folder, without time filtering."""
        return self._yield_files_recursive(folder_id=self.folder_id, start=None, end=None)
|
||||
|
||||
|
||||
# from flask import Flask, request, redirect
|
||||
|
||||
# from box_sdk_gen import BoxClient, BoxOAuth, OAuthConfig, GetAuthorizeUrlOptions
|
||||
|
||||
# app = Flask(__name__)
|
||||
|
||||
# AUTH = BoxOAuth(
|
||||
# OAuthConfig(client_id="<BOX_CLIENT_ID>", client_secret="<BOX_CLIENT_SECRET>")  # placeholders: never commit real credentials
|
||||
# )
|
||||
|
||||
|
||||
# @app.route("/")
|
||||
# def get_auth():
|
||||
# auth_url = AUTH.get_authorize_url(
|
||||
# options=GetAuthorizeUrlOptions(redirect_uri="http://localhost:4999/oauth2callback")
|
||||
# )
|
||||
# return redirect(auth_url, code=302)
|
||||
|
||||
|
||||
# @app.route("/oauth2callback")
|
||||
# def callback():
|
||||
# AUTH.get_tokens_authorization_code_grant(request.args.get("code"))
|
||||
# box = BoxConnector()
|
||||
# box.load_credentials({"auth": AUTH})
|
||||
|
||||
# lst = []
|
||||
# for file in box.load_from_state():
|
||||
# for f in file:
|
||||
# lst.append(f.semantic_identifier)
|
||||
|
||||
# return lst
|
||||
|
||||
if __name__ == "__main__":
    # Running this module directly is a deliberate no-op; the Flask OAuth
    # demo above is commented out, so there is nothing to start.
    # app.run(port=4999)
    pass
|
||||
@ -52,7 +52,7 @@ class DocumentSource(str, Enum):
|
||||
MOODLE = "moodle"
|
||||
S3_COMPATIBLE = "s3_compatible"
|
||||
DROPBOX = "dropbox"
|
||||
|
||||
BOX = "box"
|
||||
|
||||
class FileOrigin(str, Enum):
|
||||
"""File origins"""
|
||||
@ -227,6 +227,7 @@ _DEFAULT_PAGINATION_LIMIT = 1000
|
||||
_PROBLEMATIC_EXPANSIONS = "body.storage.value"
|
||||
_REPLACEMENT_EXPANSIONS = "body.view.value"
|
||||
|
||||
BOX_WEB_OAUTH_REDIRECT_URI = os.environ.get("BOX_WEB_OAUTH_REDIRECT_URI", "http://localhost:9380/v1/connector/box/oauth/web/callback")
|
||||
|
||||
class HtmlBasedConnectorTransformLinksStrategy(str, Enum):
|
||||
# remove links entirely
|
||||
|
||||
@ -49,7 +49,7 @@ MISSING_SCOPES_ERROR_STR = "client not authorized for any of the scopes requeste
|
||||
SCOPE_INSTRUCTIONS = ""
|
||||
|
||||
|
||||
GOOGLE_WEB_OAUTH_POPUP_TEMPLATE = """<!DOCTYPE html>
|
||||
WEB_OAUTH_POPUP_TEMPLATE = """<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8" />
|
||||
|
||||
Reference in New Issue
Block a user