feat(connector): add Seafile as data source (#12945)

### What problem does this PR solve?
This PR adds **Seafile** as a new data source connector for RAGFlow.

[Seafile](https://www.seafile.com/) is an open-source, self-hosted file
sync and share platform widely used by enterprises, universities, and
organizations that require data sovereignty and privacy. Users who store
documents in Seafile currently have no way to index and search their
content through RAGFlow.

This connector enables RAGFlow users to:
- Connect to self-hosted Seafile servers via API token
- Index documents from personal and shared libraries
- Support incremental polling for updated files
- Seamlessly integrate Seafile-stored documents into their RAG pipelines


### Type of change

- [x] New Feature (non-breaking change which adds functionality)

### Changes included

- `SeaFileConnector` implementing `LoadConnector` and `PollConnector`
interfaces
- Support for API token
- Recursive file traversal across libraries
- Time-based filtering for incremental updates
- Seafile logo (sourced from Simple Icons, CC0)
- Connector configuration and registration

### Testing

- Tested against self-hosted Seafile Community Edition
- Verified authentication (token)
- Verified document ingestion from personal and shared libraries
- Verified incremental polling with time filters
This commit is contained in:
Yesid Cano Castro
2026-02-03 06:42:05 +01:00
committed by GitHub
parent 25bb2e1616
commit deeae8dba4
9 changed files with 503 additions and 0 deletions

View File

@ -53,6 +53,7 @@ from common.data_source import (
AsanaConnector,
ImapConnector,
ZendeskConnector,
SeaFileConnector,
)
from common.constants import FileSource, TaskStatus
from common.data_source.config import INDEX_BATCH_SIZE
@ -1178,6 +1179,40 @@ class Bitbucket(SyncBase):
return wrapper()
class SeaFile(SyncBase):
    """Sync task wiring the SeaFile connector into the connector framework."""

    SOURCE_NAME: str = FileSource.SEAFILE

    async def _generate(self, task: dict):
        """Return a document generator for a full or incremental SeaFile sync.

        A full load is performed when the task asks for a reindex or when no
        previous poll window exists; otherwise the connector is polled from
        the recorded start time up to the current UTC time.
        """
        conf = self.conf
        include_shared = conf.get("include_shared", True)

        self.connector = SeaFileConnector(
            seafile_url=conf["seafile_url"],
            batch_size=conf.get("batch_size", INDEX_BATCH_SIZE),
            include_shared=include_shared,
        )
        self.connector.load_credentials(conf["credentials"])

        # Decide between a full load and an incremental poll.
        last_poll = task.get("poll_range_start")
        if task["reindex"] == "1" or last_poll is None:
            begin_info = "totally"
            docs = self.connector.load_from_state()
        else:
            begin_info = f"from {last_poll}"
            docs = self.connector.poll_source(
                last_poll.timestamp(),
                datetime.now(timezone.utc).timestamp(),
            )

        logging.info(
            f"Connect to SeaFile: {conf['seafile_url']} (include_shared: {include_shared}) {begin_info}"
        )
        return docs
func_factory = {
FileSource.S3: S3,
FileSource.R2: R2,
@ -1203,6 +1238,7 @@ func_factory = {
FileSource.GITHUB: Github,
FileSource.GITLAB: Gitlab,
FileSource.BITBUCKET: Bitbucket,
FileSource.SEAFILE: SeaFile,
}