mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-02 00:25:06 +08:00
### What problem does this PR solve? fixes https://github.com/infiniflow/ragflow/issues/12774 Add a CLI tool for migrating RAGFlow data from Elasticsearch to OceanBase, enabling users to switch their document storage backend. - Automatic discovery and migration of all `ragflow_*` indices - Schema conversion with vector dimension auto-detection - Batch processing with progress tracking and resume capability - Data consistency validation and migration report generation **Note**: Due to network issues, I was unable to pull the required Docker images (Elasticsearch, OceanBase) to run the full end-to-end verification. Unit tests have been verified to pass. I will complete the e2e verification when network conditions allow, and submit a follow-up PR if any fixes are needed. ```bash ============================= test session starts ============================== platform darwin -- Python 3.13.6, pytest-9.0.2, pluggy-1.6.0 rootdir: /Users/sevenc/code/ai/oceanbase/ragflow/tools/es-to-oceanbase-migration configfile: pyproject.toml testpaths: tests plugins: anyio-4.12.1, asyncio-1.3.0, cov-7.0.0 collected 86 items tests/test_progress.py::TestMigrationProgress::test_create_basic_progress PASSED [ 1%] tests/test_progress.py::TestMigrationProgress::test_create_progress_with_counts PASSED [ 2%] tests/test_progress.py::TestMigrationProgress::test_progress_default_values PASSED [ 3%] tests/test_progress.py::TestMigrationProgress::test_progress_status_values PASSED [ 4%] tests/test_progress.py::TestProgressManager::test_create_progress_manager PASSED [ 5%] tests/test_progress.py::TestProgressManager::test_create_progress_manager_creates_dir PASSED [ 6%] tests/test_progress.py::TestProgressManager::test_create_progress PASSED [ 8%] tests/test_progress.py::TestProgressManager::test_save_and_load_progress PASSED [ 9%] tests/test_progress.py::TestProgressManager::test_load_nonexistent_progress PASSED [ 10%] tests/test_progress.py::TestProgressManager::test_delete_progress PASSED [ 11%] 
tests/test_progress.py::TestProgressManager::test_update_progress PASSED [ 12%] tests/test_progress.py::TestProgressManager::test_update_progress_multiple_batches PASSED [ 13%] tests/test_progress.py::TestProgressManager::test_mark_completed PASSED [ 15%] tests/test_progress.py::TestProgressManager::test_mark_failed PASSED [ 16%] tests/test_progress.py::TestProgressManager::test_mark_paused PASSED [ 17%] tests/test_progress.py::TestProgressManager::test_can_resume_running PASSED [ 18%] tests/test_progress.py::TestProgressManager::test_can_resume_paused PASSED [ 19%] tests/test_progress.py::TestProgressManager::test_can_resume_completed PASSED [ 20%] tests/test_progress.py::TestProgressManager::test_can_resume_nonexistent PASSED [ 22%] tests/test_progress.py::TestProgressManager::test_get_resume_info PASSED [ 23%] tests/test_progress.py::TestProgressManager::test_get_resume_info_nonexistent PASSED [ 24%] tests/test_progress.py::TestProgressManager::test_progress_file_path PASSED [ 25%] tests/test_progress.py::TestProgressManager::test_progress_file_content PASSED [ 26%] tests/test_schema.py::TestRAGFlowSchemaConverter::test_analyze_ragflow_mapping PASSED [ 27%] tests/test_schema.py::TestRAGFlowSchemaConverter::test_detect_vector_size PASSED [ 29%] tests/test_schema.py::TestRAGFlowSchemaConverter::test_unknown_fields PASSED [ 30%] tests/test_schema.py::TestRAGFlowSchemaConverter::test_get_column_definitions PASSED [ 31%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_basic_document PASSED [ 32%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_with_vector PASSED [ 33%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_array_fields PASSED [ 34%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_json_fields PASSED [ 36%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_unknown_fields_to_extra PASSED [ 37%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_kb_id_list PASSED [ 38%] 
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_content_with_weight_dict PASSED [ 39%] tests/test_schema.py::TestRAGFlowDataConverter::test_convert_batch PASSED [ 40%] tests/test_schema.py::TestVectorFieldPattern::test_valid_patterns PASSED [ 41%] tests/test_schema.py::TestVectorFieldPattern::test_invalid_patterns PASSED [ 43%] tests/test_schema.py::TestVectorFieldPattern::test_extract_dimension PASSED [ 44%] tests/test_schema.py::TestConstants::test_array_columns PASSED [ 45%] tests/test_schema.py::TestConstants::test_json_columns PASSED [ 46%] tests/test_schema.py::TestConstants::test_ragflow_columns_completeness PASSED [ 47%] tests/test_schema.py::TestConstants::test_fts_columns PASSED [ 48%] tests/test_schema.py::TestConstants::test_ragflow_columns_types PASSED [ 50%] tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_empty_mapping PASSED [ 51%] tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_mapping_without_properties PASSED [ 52%] tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_multiple_vector_fields PASSED [ 53%] tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_column_definitions_without_analysis PASSED [ 54%] tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_vector_fields PASSED [ 55%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_empty_document PASSED [ 56%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_document_without_source PASSED [ 58%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_boolean_to_integer PASSED [ 59%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_invalid_integer PASSED [ 60%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_float_field PASSED [ 61%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_array_with_special_characters PASSED [ 62%] 
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_already_json_array PASSED [ 63%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_single_value_to_array PASSED [ 65%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_detect_vector_fields_from_document PASSED [ 66%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_with_default_values PASSED [ 67%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_list_content PASSED [ 68%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_batch_empty PASSED [ 69%] tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_existing_extra_field_merged PASSED [ 70%] tests/test_verify.py::TestVerificationResult::test_create_basic_result PASSED [ 72%] tests/test_verify.py::TestVerificationResult::test_result_default_values PASSED [ 73%] tests/test_verify.py::TestVerificationResult::test_result_with_counts PASSED [ 74%] tests/test_verify.py::TestMigrationVerifier::test_verify_counts_match PASSED [ 75%] tests/test_verify.py::TestMigrationVerifier::test_verify_counts_mismatch PASSED [ 76%] tests/test_verify.py::TestMigrationVerifier::test_verify_samples_all_match PASSED [ 77%] tests/test_verify.py::TestMigrationVerifier::test_verify_samples_some_missing PASSED [ 79%] tests/test_verify.py::TestMigrationVerifier::test_verify_samples_data_mismatch PASSED [ 80%] tests/test_verify.py::TestMigrationVerifier::test_values_equal_none_values PASSED [ 81%] tests/test_verify.py::TestMigrationVerifier::test_values_equal_array_columns PASSED [ 82%] tests/test_verify.py::TestMigrationVerifier::test_values_equal_json_columns PASSED [ 83%] tests/test_verify.py::TestMigrationVerifier::test_values_equal_kb_id_list PASSED [ 84%] tests/test_verify.py::TestMigrationVerifier::test_values_equal_content_with_weight_dict PASSED [ 86%] tests/test_verify.py::TestMigrationVerifier::test_determine_result_passed PASSED [ 87%] 
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_count PASSED [ 88%] tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_samples PASSED [ 89%] tests/test_verify.py::TestMigrationVerifier::test_generate_report PASSED [ 90%] tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_missing PASSED [ 91%] tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_mismatches PASSED [ 93%] tests/test_verify.py::TestValueComparison::test_string_comparison PASSED [ 94%] tests/test_verify.py::TestValueComparison::test_integer_comparison PASSED [ 95%] tests/test_verify.py::TestValueComparison::test_float_comparison PASSED [ 96%] tests/test_verify.py::TestValueComparison::test_boolean_comparison PASSED [ 97%] tests/test_verify.py::TestValueComparison::test_empty_array_comparison PASSED [ 98%] tests/test_verify.py::TestValueComparison::test_nested_json_comparison PASSED [100%] ======================= 86 passed, 88 warnings in 0.66s ======================== ``` ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [ ] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe):
371 lines
13 KiB
Python
371 lines
13 KiB
Python
"""
|
|
RAGFlow-specific migration orchestrator from Elasticsearch to OceanBase.
|
|
"""
|
|
|
|
import logging
|
|
import time
|
|
from typing import Any, Callable
|
|
|
|
from rich.console import Console
|
|
from rich.progress import (
|
|
Progress,
|
|
SpinnerColumn,
|
|
TextColumn,
|
|
BarColumn,
|
|
TaskProgressColumn,
|
|
TimeRemainingColumn,
|
|
)
|
|
|
|
from .es_client import ESClient
|
|
from .ob_client import OBClient
|
|
from .schema import RAGFlowSchemaConverter, RAGFlowDataConverter, VECTOR_FIELD_PATTERN
|
|
from .progress import ProgressManager, MigrationProgress
|
|
from .verify import MigrationVerifier
|
|
|
|
logger = logging.getLogger(__name__)
|
|
console = Console()
|
|
|
|
|
|
class ESToOceanBaseMigrator:
    """
    RAGFlow-specific migration orchestrator.

    Drives the end-to-end copy of a RAGFlow Elasticsearch index into an
    OceanBase table, handling RAGFlow's fixed schema and vector embeddings.
    """

    def __init__(
        self,
        es_client: ESClient,
        ob_client: OBClient,
        progress_dir: str = ".migration_progress",
    ):
        """
        Initialize migrator.

        Args:
            es_client: Elasticsearch client
            ob_client: OceanBase client
            progress_dir: Directory for progress files
        """
        # Handles to the source and target data stores.
        self.es_client = es_client
        self.ob_client = ob_client
        # Collaborators owned by this migrator: schema analysis and
        # checkpoint/resume bookkeeping.
        self.schema_converter = RAGFlowSchemaConverter()
        self.progress_manager = ProgressManager(progress_dir)
|
def migrate(
    self,
    es_index: str,
    ob_table: str,
    batch_size: int = 1000,
    resume: bool = False,
    verify_after: bool = True,
    on_progress: Callable[[int, int], None] | None = None,
) -> dict[str, Any]:
    """
    Execute full migration from ES to OceanBase for RAGFlow data.

    Never raises: KeyboardInterrupt and any other exception are caught,
    recorded in the result's "error" key, and reflected in the saved
    progress file (paused vs. failed).

    Args:
        es_index: Source Elasticsearch index
        ob_table: Target OceanBase table
        batch_size: Documents per batch
        resume: Resume from previous progress
        verify_after: Run verification after migration
        on_progress: Progress callback (migrated, total)

    Returns:
        Migration result dictionary
    """
    start_time = time.time()
    # Result skeleton; filled in as each step completes so a partial
    # failure still returns meaningful counters.
    result = {
        "success": False,
        "es_index": es_index,
        "ob_table": ob_table,
        "total_documents": 0,
        "migrated_documents": 0,
        "failed_documents": 0,
        "duration_seconds": 0,
        "verification": None,
        "error": None,
    }

    # Declared outside the try so the interrupt/failure handlers below
    # can mark whatever progress object exists at that point.
    progress: MigrationProgress | None = None

    try:
        # Step 1: Check connections — fail fast before touching data.
        console.print("[bold blue]Step 1: Checking connections...[/]")
        self._check_connections()

        # Step 2: Analyze ES index mapping against the known RAGFlow schema.
        console.print("\n[bold blue]Step 2: Analyzing ES index...[/]")
        analysis = self._analyze_es_index(es_index)

        # Auto-detect vector size from ES mapping; only the first vector
        # field's dimension is used when several are present.
        vector_size = 768  # Default fallback
        if analysis["vector_fields"]:
            vector_size = analysis["vector_fields"][0]["dimension"]
            console.print(f" [green]Auto-detected vector dimension: {vector_size}[/]")
        else:
            console.print(f" [yellow]No vector fields found, using default: {vector_size}[/]")
        console.print(f" Known RAGFlow fields: {len(analysis['known_fields'])}")
        if analysis["unknown_fields"]:
            console.print(f" [yellow]Unknown fields (will be stored in 'extra'): {analysis['unknown_fields']}[/]")

        # Step 3: Get total document count.
        total_docs = self.es_client.count_documents(es_index)
        console.print(f" Total documents: {total_docs:,}")

        result["total_documents"] = total_docs

        # Empty index: nothing to do, report success immediately.
        # (duration_seconds stays 0 on this path.)
        if total_docs == 0:
            console.print("[yellow]No documents to migrate[/]")
            result["success"] = True
            return result

        # Step 4: Handle resume or fresh start.
        if resume and self.progress_manager.can_resume(es_index, ob_table):
            console.print("\n[bold yellow]Resuming from previous progress...[/]")
            # NOTE(review): assumes load_progress returns non-None whenever
            # can_resume was True — TODO confirm in ProgressManager.
            progress = self.progress_manager.load_progress(es_index, ob_table)
            console.print(
                f" Previously migrated: {progress.migrated_documents:,} documents"
            )
        else:
            # Fresh start - check if table already exists: refuse to write
            # into a pre-existing table rather than risk mixing data.
            if self.ob_client.table_exists(ob_table):
                raise RuntimeError(
                    f"Table '{ob_table}' already exists in OceanBase. "
                    f"Migration aborted to prevent data conflicts. "
                    f"Please drop the table manually or use a different table name."
                )

            progress = self.progress_manager.create_progress(
                es_index, ob_table, total_docs
            )

        # Step 5: Create table if needed (skipped when a resumed progress
        # record says it was already created).
        if not progress.table_created:
            console.print("\n[bold blue]Step 3: Creating OceanBase table...[/]")
            if not self.ob_client.table_exists(ob_table):
                self.ob_client.create_ragflow_table(
                    table_name=ob_table,
                    vector_size=vector_size,
                    create_indexes=True,
                    create_fts_indexes=True,
                )
                console.print(f" Created table '{ob_table}' with RAGFlow schema")
            else:
                console.print(f" Table '{ob_table}' already exists")
                # Check and add vector column if needed
                self.ob_client.add_vector_column(ob_table, vector_size)

            # Persist the schema milestones so a later resume skips this step.
            progress.table_created = True
            progress.indexes_created = True
            progress.schema_converted = True
            self.progress_manager.save_progress(progress)

        # Step 6: Migrate data in batches with a progress bar.
        console.print("\n[bold blue]Step 4: Migrating data...[/]")
        data_converter = RAGFlowDataConverter()

        migrated = self._migrate_data(
            es_index=es_index,
            ob_table=ob_table,
            data_converter=data_converter,
            progress=progress,
            batch_size=batch_size,
            on_progress=on_progress,
        )

        result["migrated_documents"] = migrated
        result["failed_documents"] = progress.failed_documents

        # Step 7: Mark completed in the progress store.
        self.progress_manager.mark_completed(progress)

        # Step 8: Verify (optional) — count + sample comparison between
        # source index and target table.
        if verify_after:
            console.print("\n[bold blue]Step 5: Verifying migration...[/]")
            verifier = MigrationVerifier(self.es_client, self.ob_client)
            verification = verifier.verify(
                es_index, ob_table,
                primary_key="id"
            )
            result["verification"] = {
                "passed": verification.passed,
                "message": verification.message,
                "es_count": verification.es_count,
                "ob_count": verification.ob_count,
                "sample_match_rate": verification.sample_match_rate,
            }
            console.print(verifier.generate_report(verification))

        result["success"] = True
        result["duration_seconds"] = time.time() - start_time

        console.print(
            f"\n[bold green]Migration completed successfully![/]"
            f"\n Total: {result['total_documents']:,} documents"
            f"\n Migrated: {result['migrated_documents']:,} documents"
            f"\n Failed: {result['failed_documents']:,} documents"
            f"\n Duration: {result['duration_seconds']:.1f} seconds"
        )

    except KeyboardInterrupt:
        # Ctrl-C: mark the run paused so it can be resumed later.
        console.print("\n[bold yellow]Migration interrupted by user[/]")
        if progress:
            self.progress_manager.mark_paused(progress)
        result["error"] = "Interrupted by user"

    except Exception as e:
        # Any other failure: persist the failed state and surface the
        # message in the result rather than re-raising.
        logger.exception("Migration failed")
        if progress:
            self.progress_manager.mark_failed(progress, str(e))
        result["error"] = str(e)
        console.print(f"\n[bold red]Migration failed: {e}[/]")

    return result
|
|
|
|
def _check_connections(self):
    """Fail fast with RuntimeError unless both stores are reachable and healthy."""
    # Source side: Elasticsearch cluster must report a usable status.
    es_health = self.es_client.health_check()
    healthy = es_health.get("status") in ("green", "yellow")
    if not healthy:
        raise RuntimeError(f"ES cluster unhealthy: {es_health}")
    console.print(f" ES cluster status: {es_health.get('status')}")

    # Target side: OceanBase must answer a health probe before we touch it.
    if not self.ob_client.health_check():
        raise RuntimeError("OceanBase connection failed")

    ob_version = self.ob_client.get_version()
    console.print(f" OceanBase connection: OK (version: {ob_version})")
|
|
|
|
def _analyze_es_index(self, es_index: str) -> dict[str, Any]:
|
|
"""Analyze ES index structure for RAGFlow compatibility."""
|
|
es_mapping = self.es_client.get_index_mapping(es_index)
|
|
return self.schema_converter.analyze_es_mapping(es_mapping)
|
|
|
|
def _migrate_data(
    self,
    es_index: str,
    ob_table: str,
    data_converter: RAGFlowDataConverter,
    progress: MigrationProgress,
    batch_size: int,
    on_progress: Callable[[int, int], None] | None,
) -> int:
    """
    Migrate data in batches.

    Scrolls all documents from `es_index`, converts each batch to the
    OceanBase row format, and bulk-inserts into `ob_table`. A failed
    batch insert is logged and counted in progress.failed_documents;
    migration continues with the next batch.

    Returns:
        Total number of documents migrated (including any count carried
        over from a resumed progress record).
    """
    total = progress.total_documents
    # Carry over the count from a resumed run so the bar starts mid-way.
    # NOTE(review): the scroll below always starts from the beginning of
    # the index, so a resumed run appears to re-read (and re-count)
    # earlier documents — confirm dedup/upsert semantics in insert_batch.
    migrated = progress.migrated_documents

    with Progress(
        SpinnerColumn(),
        TextColumn("[progress.description]{task.description}"),
        BarColumn(),
        TaskProgressColumn(),
        TimeRemainingColumn(),
        console=console,
    ) as pbar:
        task = pbar.add_task(
            "Migrating...",
            total=total,
            completed=migrated,
        )

        batch_count = 0
        for batch in self.es_client.scroll_documents(es_index, batch_size):
            batch_count += 1

            # Convert batch to OceanBase format
            ob_rows = data_converter.convert_batch(batch)

            # Insert batch; a failure here only skips this batch.
            try:
                inserted = self.ob_client.insert_batch(ob_table, ob_rows)
                migrated += inserted

                # Update progress checkpoint; last batch ids allow the
                # progress record to identify where the run got to.
                last_ids = [doc.get("_id", doc.get("id", "")) for doc in batch]
                self.progress_manager.update_progress(
                    progress,
                    migrated_count=inserted,
                    last_batch_ids=last_ids,
                )

                # Update progress bar
                pbar.update(task, completed=migrated)

                # Callback
                if on_progress:
                    on_progress(migrated, total)

                # Log periodically (every 10th batch) to keep log volume low.
                if batch_count % 10 == 0:
                    logger.info(f"Migrated {migrated:,}/{total:,} documents")

            except Exception as e:
                # Deliberate best-effort: count the whole batch as failed
                # and keep going rather than aborting the migration.
                logger.error(f"Batch insert failed: {e}")
                progress.failed_documents += len(batch)
                # Continue with next batch

    return migrated
|
|
|
|
def get_schema_preview(self, es_index: str) -> dict[str, Any]:
    """
    Get a preview of schema analysis without executing migration.

    Read-only dry run: fetches the ES mapping, analyzes it, and reports
    the OceanBase columns that a migration would create.

    Args:
        es_index: Elasticsearch index name

    Returns:
        Schema analysis information
    """
    mapping = self.es_client.get_index_mapping(es_index)
    analysis = self.schema_converter.analyze_es_mapping(mapping)
    columns = self.schema_converter.get_column_definitions()

    preview = {
        "es_index": es_index,
        "es_mapping": mapping,
        "analysis": analysis,
        "ob_columns": columns,
        "vector_fields": self.schema_converter.get_vector_fields(),
        "total_columns": len(columns),
    }
    return preview
|
|
|
|
def get_data_preview(
|
|
self,
|
|
es_index: str,
|
|
sample_size: int = 5,
|
|
kb_id: str | None = None,
|
|
) -> list[dict[str, Any]]:
|
|
"""
|
|
Get sample documents from ES for preview.
|
|
|
|
Args:
|
|
es_index: ES index name
|
|
sample_size: Number of samples
|
|
kb_id: Optional KB filter
|
|
"""
|
|
query = None
|
|
if kb_id:
|
|
query = {"term": {"kb_id": kb_id}}
|
|
return self.es_client.get_sample_documents(es_index, sample_size, query=query)
|
|
|
|
def list_knowledge_bases(self, es_index: str) -> list[str]:
    """
    List all knowledge base IDs in an ES index.

    Runs a terms aggregation on the `kb_id` field. Best-effort: any
    failure (missing index, unsupported aggregation) is logged and an
    empty list is returned instead of raising.

    Args:
        es_index: ES index name

    Returns:
        List of kb_id values (empty on aggregation failure)
    """
    try:
        agg_result = self.es_client.aggregate_field(es_index, "kb_id")
        return [bucket["key"] for bucket in agg_result.get("buckets", [])]
    except Exception as e:
        # Lazy %-style args: the message is only formatted if the record
        # is actually emitted (standard logging best practice).
        logger.warning("Failed to list knowledge bases: %s", e)
        return []
|