ragflow/tools/es-to-oceanbase-migration/src/es_ob_migration/schema.py
Se7en 332b11cf96 feat(tools): add Elasticsearch to OceanBase migration tool (#12927) · 2026-01-31 16:11:27 +08:00
### What problem does this PR solve?

Fixes https://github.com/infiniflow/ragflow/issues/12774

Add a CLI tool for migrating RAGFlow data from Elasticsearch to
OceanBase, enabling users to switch their document storage backend;
an illustrative invocation is sketched after the feature list below.

- Automatic discovery and migration of all `ragflow_*` indices
- Schema conversion with vector dimension auto-detection
- Batch processing with progress tracking and resume capability
- Data consistency validation and migration report generation
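
To give a sense of the workflow, a hypothetical invocation might look like the
following; the command name and flags are illustrative assumptions, not the
tool's actual interface:

```bash
# Hypothetical sketch only -- the command name and flags below are illustrative,
# not the tool's real CLI; consult the tool's README/--help for the actual one.
es-ob-migrate \
  --es-url http://localhost:9200 \
  --ob-host 127.0.0.1 --ob-port 2881 \
  --resume  # continue an interrupted migration from its checkpoint
```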

**Note**: Due to network issues, I was unable to pull the required
Docker images (Elasticsearch, OceanBase) to run full end-to-end
verification. All 86 unit tests pass (output below). I will complete the
e2e verification when network conditions allow, and submit a follow-up
PR if any fixes are needed.

```bash
============================= test session starts ==============================
platform darwin -- Python 3.13.6, pytest-9.0.2, pluggy-1.6.0
rootdir: /Users/sevenc/code/ai/oceanbase/ragflow/tools/es-to-oceanbase-migration
configfile: pyproject.toml
testpaths: tests
plugins: anyio-4.12.1, asyncio-1.3.0, cov-7.0.0
collected 86 items

tests/test_progress.py::TestMigrationProgress::test_create_basic_progress PASSED [  1%]
tests/test_progress.py::TestMigrationProgress::test_create_progress_with_counts PASSED [  2%]
tests/test_progress.py::TestMigrationProgress::test_progress_default_values PASSED [  3%]
tests/test_progress.py::TestMigrationProgress::test_progress_status_values PASSED [  4%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager PASSED [  5%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager_creates_dir PASSED [  6%]
tests/test_progress.py::TestProgressManager::test_create_progress PASSED [  8%]
tests/test_progress.py::TestProgressManager::test_save_and_load_progress PASSED [  9%]
tests/test_progress.py::TestProgressManager::test_load_nonexistent_progress PASSED [ 10%]
tests/test_progress.py::TestProgressManager::test_delete_progress PASSED [ 11%]
tests/test_progress.py::TestProgressManager::test_update_progress PASSED [ 12%]
tests/test_progress.py::TestProgressManager::test_update_progress_multiple_batches PASSED [ 13%]
tests/test_progress.py::TestProgressManager::test_mark_completed PASSED  [ 15%]
tests/test_progress.py::TestProgressManager::test_mark_failed PASSED     [ 16%]
tests/test_progress.py::TestProgressManager::test_mark_paused PASSED     [ 17%]
tests/test_progress.py::TestProgressManager::test_can_resume_running PASSED [ 18%]
tests/test_progress.py::TestProgressManager::test_can_resume_paused PASSED [ 19%]
tests/test_progress.py::TestProgressManager::test_can_resume_completed PASSED [ 20%]
tests/test_progress.py::TestProgressManager::test_can_resume_nonexistent PASSED [ 22%]
tests/test_progress.py::TestProgressManager::test_get_resume_info PASSED [ 23%]
tests/test_progress.py::TestProgressManager::test_get_resume_info_nonexistent PASSED [ 24%]
tests/test_progress.py::TestProgressManager::test_progress_file_path PASSED [ 25%]
tests/test_progress.py::TestProgressManager::test_progress_file_content PASSED [ 26%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_analyze_ragflow_mapping PASSED [ 27%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_detect_vector_size PASSED [ 29%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_unknown_fields PASSED [ 30%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_get_column_definitions PASSED [ 31%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_basic_document PASSED [ 32%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_with_vector PASSED [ 33%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_array_fields PASSED [ 34%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_json_fields PASSED [ 36%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_unknown_fields_to_extra PASSED [ 37%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_kb_id_list PASSED [ 38%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_content_with_weight_dict PASSED [ 39%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_batch PASSED [ 40%]
tests/test_schema.py::TestVectorFieldPattern::test_valid_patterns PASSED [ 41%]
tests/test_schema.py::TestVectorFieldPattern::test_invalid_patterns PASSED [ 43%]
tests/test_schema.py::TestVectorFieldPattern::test_extract_dimension PASSED [ 44%]
tests/test_schema.py::TestConstants::test_array_columns PASSED           [ 45%]
tests/test_schema.py::TestConstants::test_json_columns PASSED            [ 46%]
tests/test_schema.py::TestConstants::test_ragflow_columns_completeness PASSED [ 47%]
tests/test_schema.py::TestConstants::test_fts_columns PASSED             [ 48%]
tests/test_schema.py::TestConstants::test_ragflow_columns_types PASSED   [ 50%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_empty_mapping PASSED [ 51%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_mapping_without_properties PASSED [ 52%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_multiple_vector_fields PASSED [ 53%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_column_definitions_without_analysis PASSED [ 54%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_vector_fields PASSED [ 55%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_empty_document PASSED [ 56%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_document_without_source PASSED [ 58%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_boolean_to_integer PASSED [ 59%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_invalid_integer PASSED [ 60%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_float_field PASSED [ 61%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_array_with_special_characters PASSED [ 62%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_already_json_array PASSED [ 63%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_single_value_to_array PASSED [ 65%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_detect_vector_fields_from_document PASSED [ 66%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_with_default_values PASSED [ 67%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_list_content PASSED [ 68%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_batch_empty PASSED [ 69%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_existing_extra_field_merged PASSED [ 70%]
tests/test_verify.py::TestVerificationResult::test_create_basic_result PASSED [ 72%]
tests/test_verify.py::TestVerificationResult::test_result_default_values PASSED [ 73%]
tests/test_verify.py::TestVerificationResult::test_result_with_counts PASSED [ 74%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_match PASSED [ 75%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_mismatch PASSED [ 76%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_all_match PASSED [ 77%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_some_missing PASSED [ 79%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_data_mismatch PASSED [ 80%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_none_values PASSED [ 81%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_array_columns PASSED [ 82%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_json_columns PASSED [ 83%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_kb_id_list PASSED [ 84%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_content_with_weight_dict PASSED [ 86%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_passed PASSED [ 87%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_count PASSED [ 88%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_samples PASSED [ 89%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report PASSED [ 90%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_missing PASSED [ 91%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_mismatches PASSED [ 93%]
tests/test_verify.py::TestValueComparison::test_string_comparison PASSED [ 94%]
tests/test_verify.py::TestValueComparison::test_integer_comparison PASSED [ 95%]
tests/test_verify.py::TestValueComparison::test_float_comparison PASSED  [ 96%]
tests/test_verify.py::TestValueComparison::test_boolean_comparison PASSED [ 97%]
tests/test_verify.py::TestValueComparison::test_empty_array_comparison PASSED [ 98%]
tests/test_verify.py::TestValueComparison::test_nested_json_comparison PASSED [100%]

======================= 86 passed, 88 warnings in 0.66s ========================
```

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
452 lines · 16 KiB · Python

"""
RAGFlow-specific schema conversion from Elasticsearch to OceanBase.
This module handles the fixed RAGFlow table structure migration.
RAGFlow uses a predefined schema for both ES and OceanBase.
"""
import json
import logging
import re
from typing import Any
logger = logging.getLogger(__name__)

# RAGFlow fixed column definitions (from rag/utils/ob_conn.py)
# These are the actual columns used by RAGFlow
RAGFLOW_COLUMNS = {
    # Primary identifiers
    "id": {"ob_type": "String(256)", "nullable": False, "is_primary": True},
    "kb_id": {"ob_type": "String(256)", "nullable": False, "index": True},
    "doc_id": {"ob_type": "String(256)", "nullable": True, "index": True},
    # Document metadata
    "docnm_kwd": {"ob_type": "String(256)", "nullable": True},  # document name
    "doc_type_kwd": {"ob_type": "String(256)", "nullable": True},  # document type
    # Title fields
    "title_tks": {"ob_type": "String(256)", "nullable": True},  # title tokens
    "title_sm_tks": {"ob_type": "String(256)", "nullable": True},  # fine-grained title tokens
    # Content fields
    "content_with_weight": {"ob_type": "LONGTEXT", "nullable": True},  # original content
    "content_ltks": {"ob_type": "LONGTEXT", "nullable": True},  # long text tokens
    "content_sm_ltks": {"ob_type": "LONGTEXT", "nullable": True},  # fine-grained tokens
    # Feature fields
    "pagerank_fea": {"ob_type": "Integer", "nullable": True},  # page rank priority
    # Array fields
    "important_kwd": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True},  # keywords
    "important_tks": {"ob_type": "TEXT", "nullable": True},  # keyword tokens
    "question_kwd": {"ob_type": "ARRAY(String(1024))", "nullable": True, "is_array": True},  # questions
    "question_tks": {"ob_type": "TEXT", "nullable": True},  # question tokens
    "tag_kwd": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True},  # tags
    "tag_feas": {"ob_type": "JSON", "nullable": True, "is_json": True},  # tag features
    # Status fields
    "available_int": {"ob_type": "Integer", "nullable": False, "default": 1},
    # Time fields
    "create_time": {"ob_type": "String(19)", "nullable": True},
    "create_timestamp_flt": {"ob_type": "Double", "nullable": True},
    # Image field
    "img_id": {"ob_type": "String(128)", "nullable": True},
    # Position fields (arrays)
    "position_int": {"ob_type": "ARRAY(ARRAY(Integer))", "nullable": True, "is_array": True},
    "page_num_int": {"ob_type": "ARRAY(Integer)", "nullable": True, "is_array": True},
    "top_int": {"ob_type": "ARRAY(Integer)", "nullable": True, "is_array": True},
    # Knowledge graph fields
    "knowledge_graph_kwd": {"ob_type": "String(256)", "nullable": True, "index": True},
    "source_id": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True},
    "entity_kwd": {"ob_type": "String(256)", "nullable": True},
    "entity_type_kwd": {"ob_type": "String(256)", "nullable": True, "index": True},
    "from_entity_kwd": {"ob_type": "String(256)", "nullable": True},
    "to_entity_kwd": {"ob_type": "String(256)", "nullable": True},
    "weight_int": {"ob_type": "Integer", "nullable": True},
    "weight_flt": {"ob_type": "Double", "nullable": True},
    "entities_kwd": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True},
    "rank_flt": {"ob_type": "Double", "nullable": True},
    # Status
    "removed_kwd": {"ob_type": "String(256)", "nullable": True, "index": True, "default": "N"},
    # JSON fields
    "metadata": {"ob_type": "JSON", "nullable": True, "is_json": True},
    "extra": {"ob_type": "JSON", "nullable": True, "is_json": True},
    # New columns
    "_order_id": {"ob_type": "Integer", "nullable": True},
    "group_id": {"ob_type": "String(256)", "nullable": True},
    "mom_id": {"ob_type": "String(256)", "nullable": True},
}

# Array column names for special handling
ARRAY_COLUMNS = [
    "important_kwd", "question_kwd", "tag_kwd", "source_id",
    "entities_kwd", "position_int", "page_num_int", "top_int"
]

# JSON column names
JSON_COLUMNS = ["tag_feas", "metadata", "extra"]

# Fulltext search columns (for reference)
FTS_COLUMNS_ORIGIN = ["docnm_kwd", "content_with_weight", "important_tks", "question_tks"]
FTS_COLUMNS_TKS = ["title_tks", "title_sm_tks", "important_tks", "question_tks", "content_ltks", "content_sm_ltks"]

# Vector field pattern: q_{vector_size}_vec
VECTOR_FIELD_PATTERN = re.compile(r"q_(?P<vector_size>\d+)_vec")
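
# Illustrative matches (assumptions, not executed here):
#   VECTOR_FIELD_PATTERN.match("q_1024_vec").group("vector_size")  -> "1024"
#   VECTOR_FIELD_PATTERN.match("content_ltks")                     -> None
# match() anchors only at the start, so a longer name with this prefix
# (e.g. "q_1024_vec_old") would also match.
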
class RAGFlowSchemaConverter:
    """
    Convert RAGFlow Elasticsearch documents to OceanBase format.

    RAGFlow uses a fixed schema, so this converter knows exactly
    what fields to expect and how to map them.
    """

    def __init__(self):
        self.vector_fields: list[dict[str, Any]] = []
        self.detected_vector_size: int | None = None

    def analyze_es_mapping(self, es_mapping: dict[str, Any]) -> dict[str, Any]:
        """
        Analyze ES mapping to extract vector field dimensions.

        Args:
            es_mapping: Elasticsearch index mapping

        Returns:
            Analysis result with detected fields
        """
        result = {
            "known_fields": [],
            "vector_fields": [],
            "unknown_fields": [],
        }
        properties = es_mapping.get("properties", {})
        for field_name, field_def in properties.items():
            # Check if it's a known RAGFlow field
            if field_name in RAGFLOW_COLUMNS:
                result["known_fields"].append(field_name)
            # Check if it's a vector field
            elif VECTOR_FIELD_PATTERN.match(field_name):
                match = VECTOR_FIELD_PATTERN.match(field_name)
                vec_size = int(match.group("vector_size"))
                result["vector_fields"].append({
                    "name": field_name,
                    "dimension": vec_size,
                })
                self.vector_fields.append({
                    "name": field_name,
                    "dimension": vec_size,
                })
                if self.detected_vector_size is None:
                    self.detected_vector_size = vec_size
            else:
                # Unknown field - might be a custom field stored in 'extra'
                result["unknown_fields"].append(field_name)
        logger.info(
            f"Analyzed ES mapping: {len(result['known_fields'])} known fields, "
            f"{len(result['vector_fields'])} vector fields, "
            f"{len(result['unknown_fields'])} unknown fields"
        )
        return result

    def get_column_definitions(self) -> list[dict[str, Any]]:
        """
        Get RAGFlow column definitions for OceanBase table creation.

        Returns:
            List of column definitions
        """
        columns = []
        for col_name, col_def in RAGFLOW_COLUMNS.items():
            columns.append({
                "name": col_name,
                "ob_type": col_def["ob_type"],
                "nullable": col_def.get("nullable", True),
                "is_primary": col_def.get("is_primary", False),
                "index": col_def.get("index", False),
                "is_array": col_def.get("is_array", False),
                "is_json": col_def.get("is_json", False),
                "default": col_def.get("default"),
            })
        # Add detected vector fields
        for vec_field in self.vector_fields:
            columns.append({
                "name": vec_field["name"],
                "ob_type": f"VECTOR({vec_field['dimension']})",
                "nullable": True,
                "is_vector": True,
                "dimension": vec_field["dimension"],
            })
        return columns

    def get_vector_fields(self) -> list[dict[str, Any]]:
        """Get list of vector fields for index creation."""
        return self.vector_fields
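
# Illustrative use of RAGFlowSchemaConverter (the ES client call is an
# assumption, not part of this module):
#     converter = RAGFlowSchemaConverter()
#     mapping = es.indices.get_mapping(index=idx)[idx]["mappings"]
#     converter.analyze_es_mapping(mapping)
#     columns = converter.get_column_definitions()  # drives CREATE TABLE
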
class RAGFlowDataConverter:
    """
    Convert RAGFlow ES documents to OceanBase row format.

    This converter handles the specific data transformations needed
    for RAGFlow's data structure.
    """

    def __init__(self):
        """Initialize data converter."""
        self.vector_fields: set[str] = set()

    def detect_vector_fields(self, doc: dict[str, Any]) -> None:
        """Detect vector fields from a sample document."""
        for key in doc.keys():
            if VECTOR_FIELD_PATTERN.match(key):
                self.vector_fields.add(key)

    def convert_document(self, es_doc: dict[str, Any]) -> dict[str, Any]:
        """
        Convert an ES document to OceanBase row format.

        Args:
            es_doc: Elasticsearch document (with _id and _source)

        Returns:
            Dictionary ready for OceanBase insertion
        """
        # Extract _id and _source
        doc_id = es_doc.get("_id")
        source = es_doc.get("_source", es_doc)
        row = {}
        # Set document ID
        if doc_id:
            row["id"] = str(doc_id)
        elif "id" in source:
            row["id"] = str(source["id"])
        # Process each field
        for field_name, field_def in RAGFLOW_COLUMNS.items():
            if field_name == "id":
                continue  # Already handled
            value = source.get(field_name)
            if value is None:
                # Use default if available
                default = field_def.get("default")
                if default is not None:
                    row[field_name] = default
                continue
            # Convert based on field type
            row[field_name] = self._convert_field_value(
                field_name, value, field_def
            )
        # Handle vector fields
        for key, value in source.items():
            if VECTOR_FIELD_PATTERN.match(key):
                if isinstance(value, list):
                    row[key] = value
                    self.vector_fields.add(key)
        # Handle unknown fields -> store in 'extra'
        extra_fields = {}
        for key, value in source.items():
            if key not in RAGFLOW_COLUMNS and not VECTOR_FIELD_PATTERN.match(key):
                extra_fields[key] = value
        if extra_fields:
            existing_extra = row.get("extra")
            if existing_extra and isinstance(existing_extra, dict):
                existing_extra.update(extra_fields)
            else:
                row["extra"] = json.dumps(extra_fields, ensure_ascii=False)
        return row
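
    # Illustrative input/output for convert_document (values are invented for
    # the example):
    #     {"_id": "c1", "_source": {"kb_id": "kb1", "q_1024_vec": [0.1, 0.2],
    #      "my_custom_field": "x"}}
    # becomes a row with id="c1", kb_id="kb1", available_int defaulting to 1,
    # the vector list passed through unchanged, and the unknown
    # "my_custom_field" folded into the JSON 'extra' column.
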
    def _convert_field_value(
        self,
        field_name: str,
        value: Any,
        field_def: dict[str, Any]
    ) -> Any:
        """
        Convert a field value to the appropriate format for OceanBase.

        Args:
            field_name: Field name
            value: Original value from ES
            field_def: Field definition from RAGFLOW_COLUMNS

        Returns:
            Converted value
        """
        if value is None:
            return None
        ob_type = field_def.get("ob_type", "")
        is_array = field_def.get("is_array", False)
        is_json = field_def.get("is_json", False)
        # Handle array fields
        if is_array:
            return self._convert_array_value(value)
        # Handle JSON fields
        if is_json:
            return self._convert_json_value(value)
        # Handle specific types
        if "Integer" in ob_type:
            return self._convert_integer(value)
        if "Double" in ob_type or "Float" in ob_type:
            return self._convert_float(value)
        if "LONGTEXT" in ob_type or "TEXT" in ob_type:
            return self._convert_text(value)
        if "String" in ob_type:
            return self._convert_string(value, field_name)
        # Default: convert to string
        return str(value) if value is not None else None

    def _convert_array_value(self, value: Any) -> str | None:
        """Convert array value to JSON string for OceanBase."""
        if value is None:
            return None
        if isinstance(value, str):
            # Already a JSON string?
            try:
                # Validate it's valid JSON
                json.loads(value)
                return value
            except json.JSONDecodeError:
                # Not valid JSON, wrap in array
                return json.dumps([value], ensure_ascii=False)
        if isinstance(value, list):
            # Clean array values
            cleaned = []
            for item in value:
                if isinstance(item, str):
                    # Clean special characters
                    cleaned_str = item.strip()
                    cleaned_str = cleaned_str.replace('\\', '\\\\')
                    cleaned_str = cleaned_str.replace('\n', '\\n')
                    cleaned_str = cleaned_str.replace('\r', '\\r')
                    cleaned_str = cleaned_str.replace('\t', '\\t')
                    cleaned.append(cleaned_str)
                else:
                    cleaned.append(item)
            return json.dumps(cleaned, ensure_ascii=False)
        # Single value - wrap in array
        return json.dumps([value], ensure_ascii=False)
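
    # Illustrative behaviour of _convert_array_value (assumed inputs):
    #   "tag"         -> '["tag"]'      (bare string wrapped in an array)
    #   '["a", "b"]'  -> '["a", "b"]'   (valid JSON string passed through)
    #   ["x", "y"]    -> '["x", "y"]'   (list JSON-encoded)
    # Newlines, tabs, and backslashes inside list items are rewritten as
    # literal backslash escapes before JSON encoding.
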
    def _convert_json_value(self, value: Any) -> str | None:
        """Convert JSON value to string for OceanBase."""
        if value is None:
            return None
        if isinstance(value, str):
            # Already a string, validate JSON
            try:
                json.loads(value)
                return value
            except json.JSONDecodeError:
                # Not valid JSON, return as-is
                return value
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)
        return str(value)

    def _convert_integer(self, value: Any) -> int | None:
        """Convert to integer."""
        if value is None:
            return None
        if isinstance(value, bool):
            return 1 if value else 0
        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    def _convert_float(self, value: Any) -> float | None:
        """Convert to float."""
        if value is None:
            return None
        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def _convert_text(self, value: Any) -> str | None:
        """Convert to text/longtext."""
        if value is None:
            return None
        if isinstance(value, dict):
            # content_with_weight might be stored as a dict
            return json.dumps(value, ensure_ascii=False)
        if isinstance(value, list):
            return json.dumps(value, ensure_ascii=False)
        return str(value)

    def _convert_string(self, value: Any, field_name: str) -> str | None:
        """Convert to string with length considerations."""
        if value is None:
            return None
        # Handle kb_id, which might be a list in ES
        if field_name == "kb_id" and isinstance(value, list):
            return str(value[0]) if value else None
        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)
        return str(value)

    def convert_batch(self, es_docs: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """
        Convert a batch of ES documents.

        Args:
            es_docs: List of Elasticsearch documents

        Returns:
            List of dictionaries ready for OceanBase insertion
        """
        return [self.convert_document(doc) for doc in es_docs]
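
# Illustrative end-to-end use (the surrounding scan/insert steps are
# assumptions, not part of this module):
#     converter = RAGFlowDataConverter()
#     rows = converter.convert_batch(hits)  # hits: ES docs with _id/_source
#     # rows are then bulk-inserted into the OceanBase table
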
# Backwards compatibility aliases
SchemaConverter = RAGFlowSchemaConverter
DataConverter = RAGFlowDataConverter