feat(connector): add Seafile as data source (#12945)

### What problem does this PR solve?
This PR adds **Seafile** as a new data source connector for RAGFlow.

[Seafile](https://www.seafile.com/) is an open-source, self-hosted file
sync and share platform widely used by enterprises, universities, and
organizations that require data sovereignty and privacy. Users who store
documents in Seafile currently have no way to index and search their
content through RAGFlow.

This connector enables RAGFlow users to:
- Connect to self-hosted Seafile servers via API token
- Index documents from personal and shared libraries
- Support incremental polling for updated files
- Seamlessly integrate Seafile-stored documents into their RAG pipelines


### Type of change

- [x] New Feature (non-breaking change which adds functionality)

### Changes included

- `SeaFileConnector` implementing `LoadConnector` and `PollConnector`
interfaces
- Support for API token
- Recursive file traversal across libraries
- Time-based filtering for incremental updates
- Seafile logo (sourced from Simple Icons, CC0)
- Connector configuration and registration

### Testing

- Tested against self-hosted Seafile Community Edition
- Verified authentication (token)
- Verified document ingestion from personal and shared libraries
- Verified incremental polling with time filters
This commit is contained in:
Yesid Cano Castro
2026-02-03 06:42:05 +01:00
committed by GitHub
parent 25bb2e1616
commit deeae8dba4
9 changed files with 503 additions and 0 deletions

View File

@ -53,6 +53,7 @@ from common.data_source import (
AsanaConnector,
ImapConnector,
ZendeskConnector,
SeaFileConnector,
)
from common.constants import FileSource, TaskStatus
from common.data_source.config import INDEX_BATCH_SIZE
@ -1178,6 +1179,40 @@ class Bitbucket(SyncBase):
return wrapper()
class SeaFile(SyncBase):
    """Sync task wiring the SeaFile connector into the connector framework."""

    SOURCE_NAME: str = FileSource.SEAFILE

    async def _generate(self, task: dict):
        """Return a document generator for a full or incremental SeaFile sync.

        A full load is performed when the task asks for a reindex or when no
        previous poll window exists; otherwise the connector is polled from
        the recorded start time up to the current UTC time.
        """
        conf = self.conf
        include_shared = conf.get("include_shared", True)

        self.connector = SeaFileConnector(
            seafile_url=conf["seafile_url"],
            batch_size=conf.get("batch_size", INDEX_BATCH_SIZE),
            include_shared=include_shared,
        )
        self.connector.load_credentials(conf["credentials"])

        # Decide between a full load and an incremental poll.
        last_poll = task.get("poll_range_start")
        if task["reindex"] == "1" or last_poll is None:
            begin_info = "totally"
            docs = self.connector.load_from_state()
        else:
            begin_info = f"from {last_poll}"
            docs = self.connector.poll_source(
                last_poll.timestamp(),
                datetime.now(timezone.utc).timestamp(),
            )

        logging.info(
            f"Connect to SeaFile: {conf['seafile_url']} (include_shared: {include_shared}) {begin_info}"
        )
        return docs
func_factory = {
FileSource.S3: S3,
FileSource.R2: R2,
@ -1203,6 +1238,7 @@ func_factory = {
FileSource.GITHUB: Github,
FileSource.GITLAB: Gitlab,
FileSource.BITBUCKET: Bitbucket,
FileSource.SEAFILE: SeaFile,
}