mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-19 20:16:49 +08:00
feat: Add Single Bucket Mode for MinIO/S3 (#11416)
## Overview
This PR adds support for **Single Bucket Mode** in RAGFlow, allowing
users to configure MinIO/S3 to use a single bucket with a directory
structure instead of creating multiple buckets per Knowledge Base and
user folder.
## Problem Statement
The current implementation creates one bucket per Knowledge Base and one
bucket per user folder, which can be problematic when:
- Cloud providers charge per bucket
- IAM policies restrict bucket creation
- Organizations want centralized data management in a single bucket
## Solution
Added a `prefix_path` configuration option to the MinIO connector that
enables:
- Using a single bucket with directory-based organization
- Backward compatibility with existing multi-bucket deployments
- Support for MinIO, AWS S3, and other S3-compatible storage backends
## Changes
- **`rag/utils/minio_conn.py`**: Enhanced MinIO connector to support
single bucket mode with prefix paths
- **`conf/service_conf.yaml`**: Added new configuration options
(`bucket` and `prefix_path`)
- **`docker/service_conf.yaml.template`**: Updated template with single
bucket configuration examples
- **`docker/.env.single-bucket-example`**: Added example environment
variables for single bucket setup
- **`docs/single-bucket-mode.md`**: Comprehensive documentation covering
usage, migration, and troubleshooting
## Configuration Example
```yaml
minio:
user: "access-key"
password: "secret-key"
host: "minio.example.com:443"
bucket: "ragflow-bucket" # Single bucket name
prefix_path: "ragflow" # Optional prefix path
```
## Backward Compatibility
✅ Fully backward compatible - existing deployments continue to work
without any changes
- If `bucket` is not configured, uses default multi-bucket behavior
- If `bucket` is configured without `prefix_path`, uses bucket root
- If both are configured, uses `bucket/prefix_path/` structure
## Testing
- Tested with MinIO (local and cloud)
- Verified backward compatibility with existing multi-bucket mode
- Validated IAM policy restrictions work correctly
## Documentation
Included comprehensive documentation in `docs/single-bucket-mode.md`
covering:
- Configuration examples
- Migration guide from multi-bucket to single-bucket mode
- IAM policy examples
- Troubleshooting guide
---
**Related Issue**: Addresses use cases where bucket creation is
restricted or costly
This commit is contained in:
@ -17,6 +17,8 @@ minio:
|
||||
user: 'rag_flow'
|
||||
password: 'infini_rag_flow'
|
||||
host: 'localhost:9000'
|
||||
bucket: ''
|
||||
prefix_path: ''
|
||||
es:
|
||||
hosts: 'http://localhost:1200'
|
||||
username: 'elastic'
|
||||
|
||||
108
docker/.env.single-bucket-example
Normal file
108
docker/.env.single-bucket-example
Normal file
@ -0,0 +1,108 @@
|
||||
# Example: Single Bucket Mode Configuration
|
||||
#
|
||||
# This file shows how to configure RAGFlow to use a single MinIO/S3 bucket
|
||||
# with directory structure instead of creating multiple buckets.
|
||||
|
||||
# ============================================================================
|
||||
# MinIO/S3 Configuration for Single Bucket Mode
|
||||
# ============================================================================
|
||||
|
||||
# MinIO/S3 Endpoint (with port if not default)
|
||||
# For HTTPS (port 443), the connection will automatically use secure=True
|
||||
export MINIO_HOST=minio.example.com:443
|
||||
|
||||
# Access credentials
|
||||
export MINIO_USER=your-access-key
|
||||
export MINIO_PASSWORD=your-secret-password-here
|
||||
|
||||
# Single Bucket Configuration (NEW!)
|
||||
# If set, all data will be stored in this bucket instead of creating
|
||||
# separate buckets for each knowledge base
|
||||
export MINIO_BUCKET=ragflow-bucket
|
||||
|
||||
# Optional: Prefix path within the bucket (NEW!)
|
||||
# If set, all files will be stored under this prefix
|
||||
# Example: bucket/prefix_path/kb_id/file.pdf
|
||||
export MINIO_PREFIX_PATH=ragflow
|
||||
|
||||
# ============================================================================
|
||||
# Alternative: Multi-Bucket Mode (Default)
|
||||
# ============================================================================
|
||||
#
|
||||
# To use the original multi-bucket mode, simply don't set MINIO_BUCKET
|
||||
# and MINIO_PREFIX_PATH:
|
||||
#
|
||||
# export MINIO_HOST=minio.local
|
||||
# export MINIO_USER=admin
|
||||
# export MINIO_PASSWORD=password
|
||||
# # MINIO_BUCKET not set
|
||||
# # MINIO_PREFIX_PATH not set
|
||||
|
||||
# ============================================================================
|
||||
# Storage Mode Selection (Environment Variable)
|
||||
# ============================================================================
|
||||
#
|
||||
# Make sure this is set to use MinIO (default)
|
||||
export STORAGE_IMPL=MINIO
|
||||
|
||||
# ============================================================================
|
||||
# Example Path Structures
|
||||
# ============================================================================
|
||||
#
|
||||
# Multi-Bucket Mode (default):
|
||||
# bucket: kb_12345/file.pdf
|
||||
# bucket: kb_67890/file.pdf
|
||||
# bucket: folder_abc/file.txt
|
||||
#
|
||||
# Single Bucket Mode (MINIO_BUCKET set):
|
||||
# bucket: ragflow-bucket/kb_12345/file.pdf
|
||||
# bucket: ragflow-bucket/kb_67890/file.pdf
|
||||
# bucket: ragflow-bucket/folder_abc/file.txt
|
||||
#
|
||||
# Single Bucket with Prefix (both set):
|
||||
# bucket: ragflow-bucket/ragflow/kb_12345/file.pdf
|
||||
# bucket: ragflow-bucket/ragflow/kb_67890/file.pdf
|
||||
# bucket: ragflow-bucket/ragflow/folder_abc/file.txt
|
||||
|
||||
# ============================================================================
|
||||
# IAM Policy for Single Bucket Mode
|
||||
# ============================================================================
|
||||
#
|
||||
# When using single bucket mode, you only need permissions for one bucket:
|
||||
#
|
||||
# {
|
||||
# "Version": "2012-10-17",
|
||||
# "Statement": [
|
||||
# {
|
||||
# "Effect": "Allow",
|
||||
# "Action": ["s3:*"],
|
||||
# "Resource": [
|
||||
# "arn:aws:s3:::ragflow-bucket",
|
||||
# "arn:aws:s3:::ragflow-bucket/*"
|
||||
# ]
|
||||
# }
|
||||
# ]
|
||||
# }
|
||||
|
||||
# ============================================================================
|
||||
# Testing the Configuration
|
||||
# ============================================================================
|
||||
#
|
||||
# After setting these variables, you can test with MinIO Client (mc):
|
||||
#
|
||||
# # Configure mc alias
|
||||
# mc alias set ragflow https://minio.example.com:443 \
|
||||
# your-access-key \
|
||||
# your-secret-password-here
|
||||
#
|
||||
# # List bucket contents
|
||||
# mc ls ragflow/ragflow-bucket/
|
||||
#
|
||||
# # If prefix is set, check the prefix
|
||||
# mc ls ragflow/ragflow-bucket/ragflow/
|
||||
#
|
||||
# # Test write permission
|
||||
# echo "test" | mc pipe ragflow/ragflow-bucket/ragflow/_test.txt
|
||||
#
|
||||
# # Clean up test file
|
||||
# mc rm ragflow/ragflow-bucket/ragflow/_test.txt
|
||||
@ -17,6 +17,8 @@ minio:
|
||||
user: '${MINIO_USER:-rag_flow}'
|
||||
password: '${MINIO_PASSWORD:-infini_rag_flow}'
|
||||
host: '${MINIO_HOST:-minio}:9000'
|
||||
bucket: '${MINIO_BUCKET:-}'
|
||||
prefix_path: '${MINIO_PREFIX_PATH:-}'
|
||||
es:
|
||||
hosts: 'http://${ES_HOST:-es01}:9200'
|
||||
username: '${ES_USER:-elastic}'
|
||||
|
||||
162
docs/single-bucket-mode.md
Normal file
162
docs/single-bucket-mode.md
Normal file
@ -0,0 +1,162 @@
|
||||
# Single Bucket Mode for MinIO/S3
|
||||
|
||||
## Overview
|
||||
|
||||
By default, RAGFlow creates one bucket per Knowledge Base (dataset) and one bucket per user folder. This can be problematic when:
|
||||
|
||||
- Your cloud provider charges per bucket
|
||||
- Your IAM policy restricts bucket creation
|
||||
- You want all data organized in a single bucket with directory structure
|
||||
|
||||
The **Single Bucket Mode** allows you to configure RAGFlow to use a single bucket with a directory structure instead of multiple buckets.
|
||||
|
||||
## How It Works
|
||||
|
||||
### Default Mode (Multiple Buckets)
|
||||
|
||||
```
|
||||
bucket: kb_12345/
|
||||
└── document_1.pdf
|
||||
bucket: kb_67890/
|
||||
└── document_2.pdf
|
||||
bucket: folder_abc/
|
||||
└── file_3.txt
|
||||
```
|
||||
|
||||
### Single Bucket Mode (with prefix_path)
|
||||
|
||||
```
|
||||
bucket: ragflow-bucket/
|
||||
└── ragflow/
|
||||
├── kb_12345/
|
||||
│ └── document_1.pdf
|
||||
├── kb_67890/
|
||||
│ └── document_2.pdf
|
||||
└── folder_abc/
|
||||
└── file_3.txt
|
||||
```
|
||||
|
||||
## Configuration
|
||||
|
||||
### MinIO Configuration
|
||||
|
||||
Edit your `service_conf.yaml` or set environment variables:
|
||||
|
||||
```yaml
|
||||
minio:
|
||||
user: "your-access-key"
|
||||
password: "your-secret-key"
|
||||
host: "minio.example.com:443"
|
||||
bucket: "ragflow-bucket" # Single bucket name (setting it enables single bucket mode)
|
||||
prefix_path: "ragflow" # Optional prefix path
|
||||
```
|
||||
|
||||
Or using environment variables:
|
||||
|
||||
```bash
|
||||
export MINIO_USER=your-access-key
|
||||
export MINIO_PASSWORD=your-secret-key
|
||||
export MINIO_HOST=minio.example.com:443
|
||||
export MINIO_BUCKET=ragflow-bucket
|
||||
export MINIO_PREFIX_PATH=ragflow
|
||||
```
|
||||
|
||||
### S3 Configuration (already supported)
|
||||
|
||||
```yaml
|
||||
s3:
|
||||
access_key: "your-access-key"
|
||||
secret_key: "your-secret-key"
|
||||
endpoint_url: "https://s3.amazonaws.com"
|
||||
bucket: "my-ragflow-bucket"
|
||||
prefix_path: "production"
|
||||
region: "us-east-1"
|
||||
```
|
||||
|
||||
## IAM Policy Example
|
||||
|
||||
When using single bucket mode, you only need permissions for one bucket:
|
||||
|
||||
```json
|
||||
{
|
||||
"Version": "2012-10-17",
|
||||
"Statement": [
|
||||
{
|
||||
"Effect": "Allow",
|
||||
"Action": ["s3:*"],
|
||||
"Resource": [
|
||||
"arn:aws:s3:::ragflow-bucket",
|
||||
"arn:aws:s3:::ragflow-bucket/*"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
## Migration from Multi-Bucket to Single Bucket
|
||||
|
||||
If you're migrating from multi-bucket mode to single-bucket mode:
|
||||
|
||||
1. **Set environment variables** for the new configuration
|
||||
2. **Restart RAGFlow** services
|
||||
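3. **Migrate existing data** (required only if files uploaded in multi-bucket mode must stay accessible, because object paths differ between the two modes):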
|
||||
|
||||
```bash
|
||||
# Example using mc (MinIO Client)
|
||||
mc alias set old-minio http://old-minio:9000 ACCESS_KEY SECRET_KEY
|
||||
mc alias set new-minio https://new-minio:443 ACCESS_KEY SECRET_KEY
|
||||
|
||||
# List all knowledge base buckets
|
||||
mc ls old-minio/ | grep kb_ | while read -r line; do
|
||||
bucket=$(echo $line | awk '{print $5}')
|
||||
# Copy each bucket to the new structure
|
||||
mc cp --recursive old-minio/$bucket/ new-minio/ragflow-bucket/ragflow/$bucket/
|
||||
done
|
||||
```
|
||||
|
||||
## Toggle Between Modes
|
||||
|
||||
### Enable Single Bucket Mode
|
||||
|
||||
```yaml
|
||||
minio:
|
||||
bucket: "my-single-bucket"
|
||||
prefix_path: "ragflow"
|
||||
```
|
||||
|
||||
### Disable (Use Multi-Bucket Mode)
|
||||
|
||||
```yaml
|
||||
minio:
|
||||
# Leave bucket and prefix_path empty or commented out
|
||||
# bucket: ''
|
||||
# prefix_path: ''
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Issue: Access Denied errors
|
||||
|
||||
**Solution**: Ensure your IAM policy grants access to the bucket specified in the configuration.
|
||||
|
||||
### Issue: Files not found after switching modes
|
||||
|
||||
**Solution**: The path structure changes between modes. You'll need to migrate existing data.
|
||||
|
||||
### Issue: Connection fails with HTTPS
|
||||
|
||||
**Solution**: Ensure `secure: True` is set in the MinIO connection (automatically handled for port 443).
|
||||
|
||||
## Storage Backends Supported
|
||||
|
||||
- ✅ **MinIO** - Full support with single bucket mode
|
||||
- ✅ **AWS S3** - Full support with single bucket mode
|
||||
- ✅ **Alibaba OSS** - Full support with single bucket mode
|
||||
- ✅ **Azure Blob** - Uses container-based structure (different paradigm)
|
||||
- ⚠️ **OpenDAL** - Depends on underlying storage backend
|
||||
|
||||
## Performance Considerations
|
||||
|
||||
- **Single bucket mode** may have slightly better performance for bucket listing operations
|
||||
- **Multi-bucket mode** provides better isolation and organization for large deployments
|
||||
- Choose based on your specific requirements and infrastructure constraints
|
||||
@ -28,8 +28,51 @@ from common import settings
|
||||
class RAGFlowMinio:
|
||||
def __init__(self):
    """Load the optional single-bucket settings, then open the connection.

    ``bucket`` and ``prefix_path`` are read from ``settings.MINIO``.  Any
    falsy value (missing key, ``None`` or an empty string) is stored as
    ``None`` so that an unconfigured option leaves single-bucket mode off.
    """
    self.conn = None
    configured_bucket = settings.MINIO.get('bucket', None)
    self.bucket = configured_bucket if configured_bucket else None
    configured_prefix = settings.MINIO.get('prefix_path', None)
    self.prefix_path = configured_prefix if configured_prefix else None
    self.__open__()
|
||||
|
||||
@staticmethod
def use_default_bucket(method):
    """Decorator that redirects the ``bucket`` argument to ``self.bucket``.

    When ``self.bucket`` is falsy the wrapped method is called unchanged.
    When it is set, the method is called with ``self.bucket`` as the bucket
    and the caller-supplied bucket is forwarded as the ``_orig_bucket``
    keyword so that it can be used as a path component inside the shared
    bucket.

    ``_orig_bucket`` is only forwarded to callables able to receive it (a
    parameter named ``_orig_bucket`` or a ``**kwargs`` catch-all).  Methods
    with a plain ``(self, bucket)`` signature would otherwise fail with
    ``TypeError: unexpected keyword argument '_orig_bucket'``.

    Args:
        method: the function to wrap; called as ``method(self, bucket, ...)``.

    Returns:
        The wrapping function, carrying ``method``'s name and docstring.
    """
    import functools
    import inspect

    # Decide once, at decoration time.  ``follow_wrapped=False`` makes the
    # check look at the callable we will actually invoke (possibly another
    # decorator's wrapper) rather than the function it ultimately wraps.
    try:
        params = inspect.signature(method, follow_wrapped=False).parameters
        accepts_orig_bucket = '_orig_bucket' in params or any(
            p.kind is inspect.Parameter.VAR_KEYWORD for p in params.values()
        )
    except (TypeError, ValueError):
        # Signature not introspectable: keep the historical behaviour.
        accepts_orig_bucket = True

    @functools.wraps(method)
    def wrapper(self, bucket, *args, **kwargs):
        if not self.bucket:
            # Multi-bucket mode: nothing to rewrite.
            return method(self, bucket, *args, **kwargs)
        if accepts_orig_bucket:
            # Preserve the logical identifier for downstream path building.
            kwargs['_orig_bucket'] = bucket
        return method(self, self.bucket, *args, **kwargs)
    return wrapper
|
||||
|
||||
@staticmethod
def use_prefix_path(method):
    """Decorator that prepends the configured key prefix to an object name.

    The wrapped method is called as ``method(self, bucket, fnm, ...)``.  The
    ``_orig_bucket`` keyword is consumed here and is never passed on to
    ``method``.

    Object name handed to ``method``:
      * ``prefix_path`` set and ``_orig_bucket`` given:
        ``<prefix_path>/<_orig_bucket>/<fnm>``
      * ``prefix_path`` set, no ``_orig_bucket``: ``<prefix_path>/<fnm>``
      * no ``prefix_path``, ``_orig_bucket`` given and ``bucket`` equal to
        ``self.bucket``: ``<_orig_bucket>/<fnm>``
      * anything else: ``fnm`` unchanged.
    """
    def wrapper(self, bucket, fnm, *args, **kwargs):
        logical_bucket = kwargs.pop('_orig_bucket', None)
        key_prefix = self.prefix_path

        if key_prefix and logical_bucket:
            object_name = f"{key_prefix}/{logical_bucket}/{fnm}"
        elif key_prefix:
            object_name = f"{key_prefix}/{fnm}"
        elif logical_bucket and bucket == self.bucket:
            # No prefix configured: the identifier alone becomes the folder.
            object_name = f"{logical_bucket}/{fnm}"
        else:
            object_name = fnm

        return method(self, bucket, object_name, *args, **kwargs)
    return wrapper
|
||||
|
||||
def __open__(self):
|
||||
try:
|
||||
if self.conn:
|
||||
@ -52,19 +95,27 @@ class RAGFlowMinio:
|
||||
self.conn = None
|
||||
|
||||
def health(self):
    """Write a small probe object to check that the storage is usable.

    The probe goes to ``self.bucket`` when a single bucket is configured,
    otherwise to the fallback bucket ``"ragflow-bucket"``.  The object is
    named ``_health_check``, placed under ``self.prefix_path`` when that is
    set.

    Returns:
        Whatever ``self.conn.put_object`` returns for the probe write.

    Raises:
        Any exception raised by the underlying client calls propagates.
    """
    bucket = self.bucket if self.bucket else "ragflow-bucket"
    fnm = "_health_check"
    if self.prefix_path:
        fnm = f"{self.prefix_path}/{fnm}"
    binary = b"_t@@@1"
    # A configured single bucket must already exist (bucket creation may be
    # forbidden there), so it is never created here.  The fallback bucket of
    # multi-bucket mode is created on demand -- the same guard ``put`` uses --
    # otherwise the probe write fails on a fresh deployment.
    if not self.bucket and not self.conn.bucket_exists(bucket):
        self.conn.make_bucket(bucket)
    r = self.conn.put_object(bucket, fnm,
                             BytesIO(binary),
                             len(binary)
                             )
    return r
|
||||
|
||||
@use_default_bucket
|
||||
@use_prefix_path
|
||||
def put(self, bucket, fnm, binary, tenant_id=None):
|
||||
for _ in range(3):
|
||||
try:
|
||||
if not self.conn.bucket_exists(bucket):
|
||||
# Note: in single-bucket mode the configured bucket must already exist (we may not have permission to create buckets); buckets are only created on demand in multi-bucket mode
|
||||
if not self.bucket and not self.conn.bucket_exists(bucket):
|
||||
self.conn.make_bucket(bucket)
|
||||
|
||||
r = self.conn.put_object(bucket, fnm,
|
||||
@ -77,12 +128,16 @@ class RAGFlowMinio:
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
|
||||
@use_default_bucket
@use_prefix_path
def rm(self, bucket, fnm, tenant_id=None):
    """Delete object ``fnm`` from ``bucket``.

    Any exception raised by the client is logged with its traceback and
    swallowed; nothing is returned.  ``tenant_id`` is accepted but not used
    by this method.
    """
    try:
        client = self.conn
        client.remove_object(bucket, fnm)
    except Exception:
        logging.exception(f"Fail to remove {bucket}/{fnm}:")
|
||||
|
||||
@use_default_bucket
|
||||
@use_prefix_path
|
||||
def get(self, bucket, filename, tenant_id=None):
|
||||
for _ in range(1):
|
||||
try:
|
||||
@ -92,8 +147,10 @@ class RAGFlowMinio:
|
||||
logging.exception(f"Fail to get {bucket}/{filename}")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return None
|
||||
return
|
||||
|
||||
@use_default_bucket
|
||||
@use_prefix_path
|
||||
def obj_exist(self, bucket, filename, tenant_id=None):
|
||||
try:
|
||||
if not self.conn.bucket_exists(bucket):
|
||||
@ -109,6 +166,7 @@ class RAGFlowMinio:
|
||||
logging.exception(f"obj_exist {bucket}/{filename} got exception")
|
||||
return False
|
||||
|
||||
@use_default_bucket
|
||||
def bucket_exists(self, bucket):
|
||||
try:
|
||||
if not self.conn.bucket_exists(bucket):
|
||||
@ -122,6 +180,8 @@ class RAGFlowMinio:
|
||||
logging.exception(f"bucket_exist {bucket} got exception")
|
||||
return False
|
||||
|
||||
@use_default_bucket
|
||||
@use_prefix_path
|
||||
def get_presigned_url(self, bucket, fnm, expires, tenant_id=None):
|
||||
for _ in range(10):
|
||||
try:
|
||||
@ -130,20 +190,50 @@ class RAGFlowMinio:
|
||||
logging.exception(f"Fail to get_presigned {bucket}/{fnm}:")
|
||||
self.__open__()
|
||||
time.sleep(1)
|
||||
return None
|
||||
return
|
||||
|
||||
@use_default_bucket
def remove_bucket(self, bucket, **kwargs):
    """Remove a logical bucket and everything stored in it.

    Single-bucket mode (``self.bucket`` set): ``bucket`` is the shared
    physical bucket and the logical identifier arrives as the
    ``_orig_bucket`` keyword.  Only objects under
    ``[<prefix_path>/]<_orig_bucket>/`` are deleted; the physical bucket
    itself is kept.  If no identifier is available nothing is deleted,
    because the resulting prefix would match unrelated data in the shared
    bucket.

    Multi-bucket mode: if ``bucket`` exists, all of its objects are deleted
    and then the bucket itself is removed.

    Exceptions raised by the client are logged and swallowed.
    """
    orig_bucket = kwargs.pop('_orig_bucket', None)
    try:
        if self.bucket:
            if not orig_bucket:
                # An empty identifier would yield the prefix "" or
                # "<prefix_path>/" and wipe every object stored there.
                logging.warning(
                    f"Refuse to remove objects from {bucket}: no bucket identifier given")
                return

            # Single bucket mode: remove objects with prefix
            prefix = f"{orig_bucket}/"
            if self.prefix_path:
                prefix = f"{self.prefix_path}/{prefix}"

            for obj in self.conn.list_objects(bucket, prefix=prefix, recursive=True):
                self.conn.remove_object(bucket, obj.object_name)
            # Do NOT remove the physical bucket
        elif self.conn.bucket_exists(bucket):
            for obj in self.conn.list_objects(bucket, recursive=True):
                self.conn.remove_object(bucket, obj.object_name)
            self.conn.remove_bucket(bucket)
    except Exception:
        logging.exception(f"Fail to remove bucket {bucket}")
|
||||
|
||||
def _resolve_bucket_and_path(self, bucket, fnm):
    """Map a logical ``(bucket, fnm)`` pair to the physical bucket and key.

    * ``self.bucket`` set: returns ``self.bucket`` with the key
      ``<prefix_path>/<bucket>/<fnm>`` (or ``<bucket>/<fnm>`` when no
      ``prefix_path`` is set).
    * ``self.bucket`` unset but ``prefix_path`` set: returns ``bucket`` with
      the key ``<prefix_path>/<fnm>``.
    * neither set: returns ``(bucket, fnm)`` unchanged.
    """
    key_prefix = self.prefix_path

    if not self.bucket:
        if key_prefix:
            return bucket, f"{key_prefix}/{fnm}"
        return bucket, fnm

    # Single-bucket mode: the logical bucket becomes a folder in the key.
    if key_prefix:
        object_key = f"{key_prefix}/{bucket}/{fnm}"
    else:
        object_key = f"{bucket}/{fnm}"
    return self.bucket, object_key
|
||||
|
||||
def copy(self, src_bucket, src_path, dest_bucket, dest_path):
|
||||
try:
|
||||
src_bucket, src_path = self._resolve_bucket_and_path(src_bucket, src_path)
|
||||
dest_bucket, dest_path = self._resolve_bucket_and_path(dest_bucket, dest_path)
|
||||
|
||||
if not self.conn.bucket_exists(dest_bucket):
|
||||
self.conn.make_bucket(dest_bucket)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user