### What problem does this PR solve?

Fixes https://github.com/infiniflow/ragflow/issues/12774

Adds a CLI tool for migrating RAGFlow data from Elasticsearch to OceanBase, enabling users to switch their document storage backend:

- Automatic discovery and migration of all `ragflow_*` indices
- Schema conversion with vector-dimension auto-detection
- Batch processing with progress tracking and resume capability
- Data-consistency validation and migration-report generation

**Note**: Due to network issues, I was unable to pull the required Docker images (Elasticsearch, OceanBase) to run the full end-to-end verification. The unit tests all pass; I will complete the e2e verification when network conditions allow and submit a follow-up PR if any fixes are needed.

```bash
============================= test session starts ==============================
platform darwin -- Python 3.13.6, pytest-9.0.2, pluggy-1.6.0
rootdir: /Users/sevenc/code/ai/oceanbase/ragflow/tools/es-to-oceanbase-migration
configfile: pyproject.toml
testpaths: tests
plugins: anyio-4.12.1, asyncio-1.3.0, cov-7.0.0
collected 86 items

tests/test_progress.py::TestMigrationProgress::test_create_basic_progress PASSED [ 1%]
tests/test_progress.py::TestMigrationProgress::test_create_progress_with_counts PASSED [ 2%]
tests/test_progress.py::TestMigrationProgress::test_progress_default_values PASSED [ 3%]
tests/test_progress.py::TestMigrationProgress::test_progress_status_values PASSED [ 4%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager PASSED [ 5%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager_creates_dir PASSED [ 6%]
tests/test_progress.py::TestProgressManager::test_create_progress PASSED [ 8%]
tests/test_progress.py::TestProgressManager::test_save_and_load_progress PASSED [ 9%]
tests/test_progress.py::TestProgressManager::test_load_nonexistent_progress PASSED [ 10%]
tests/test_progress.py::TestProgressManager::test_delete_progress PASSED [ 11%]
tests/test_progress.py::TestProgressManager::test_update_progress PASSED [ 12%]
tests/test_progress.py::TestProgressManager::test_update_progress_multiple_batches PASSED [ 13%]
tests/test_progress.py::TestProgressManager::test_mark_completed PASSED [ 15%]
tests/test_progress.py::TestProgressManager::test_mark_failed PASSED [ 16%]
tests/test_progress.py::TestProgressManager::test_mark_paused PASSED [ 17%]
tests/test_progress.py::TestProgressManager::test_can_resume_running PASSED [ 18%]
tests/test_progress.py::TestProgressManager::test_can_resume_paused PASSED [ 19%]
tests/test_progress.py::TestProgressManager::test_can_resume_completed PASSED [ 20%]
tests/test_progress.py::TestProgressManager::test_can_resume_nonexistent PASSED [ 22%]
tests/test_progress.py::TestProgressManager::test_get_resume_info PASSED [ 23%]
tests/test_progress.py::TestProgressManager::test_get_resume_info_nonexistent PASSED [ 24%]
tests/test_progress.py::TestProgressManager::test_progress_file_path PASSED [ 25%]
tests/test_progress.py::TestProgressManager::test_progress_file_content PASSED [ 26%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_analyze_ragflow_mapping PASSED [ 27%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_detect_vector_size PASSED [ 29%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_unknown_fields PASSED [ 30%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_get_column_definitions PASSED [ 31%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_basic_document PASSED [ 32%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_with_vector PASSED [ 33%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_array_fields PASSED [ 34%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_json_fields PASSED [ 36%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_unknown_fields_to_extra PASSED [ 37%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_kb_id_list PASSED [ 38%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_content_with_weight_dict PASSED [ 39%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_batch PASSED [ 40%]
tests/test_schema.py::TestVectorFieldPattern::test_valid_patterns PASSED [ 41%]
tests/test_schema.py::TestVectorFieldPattern::test_invalid_patterns PASSED [ 43%]
tests/test_schema.py::TestVectorFieldPattern::test_extract_dimension PASSED [ 44%]
tests/test_schema.py::TestConstants::test_array_columns PASSED [ 45%]
tests/test_schema.py::TestConstants::test_json_columns PASSED [ 46%]
tests/test_schema.py::TestConstants::test_ragflow_columns_completeness PASSED [ 47%]
tests/test_schema.py::TestConstants::test_fts_columns PASSED [ 48%]
tests/test_schema.py::TestConstants::test_ragflow_columns_types PASSED [ 50%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_empty_mapping PASSED [ 51%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_mapping_without_properties PASSED [ 52%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_multiple_vector_fields PASSED [ 53%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_column_definitions_without_analysis PASSED [ 54%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_vector_fields PASSED [ 55%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_empty_document PASSED [ 56%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_document_without_source PASSED [ 58%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_boolean_to_integer PASSED [ 59%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_invalid_integer PASSED [ 60%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_float_field PASSED [ 61%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_array_with_special_characters PASSED [ 62%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_already_json_array PASSED [ 63%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_single_value_to_array PASSED [ 65%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_detect_vector_fields_from_document PASSED [ 66%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_with_default_values PASSED [ 67%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_list_content PASSED [ 68%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_batch_empty PASSED [ 69%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_existing_extra_field_merged PASSED [ 70%]
tests/test_verify.py::TestVerificationResult::test_create_basic_result PASSED [ 72%]
tests/test_verify.py::TestVerificationResult::test_result_default_values PASSED [ 73%]
tests/test_verify.py::TestVerificationResult::test_result_with_counts PASSED [ 74%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_match PASSED [ 75%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_mismatch PASSED [ 76%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_all_match PASSED [ 77%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_some_missing PASSED [ 79%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_data_mismatch PASSED [ 80%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_none_values PASSED [ 81%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_array_columns PASSED [ 82%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_json_columns PASSED [ 83%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_kb_id_list PASSED [ 84%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_content_with_weight_dict PASSED [ 86%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_passed PASSED [ 87%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_count PASSED [ 88%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_samples PASSED [ 89%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report PASSED [ 90%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_missing PASSED [ 91%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_mismatches PASSED [ 93%]
tests/test_verify.py::TestValueComparison::test_string_comparison PASSED [ 94%]
tests/test_verify.py::TestValueComparison::test_integer_comparison PASSED [ 95%]
tests/test_verify.py::TestValueComparison::test_float_comparison PASSED [ 96%]
tests/test_verify.py::TestValueComparison::test_boolean_comparison PASSED [ 97%]
tests/test_verify.py::TestValueComparison::test_empty_array_comparison PASSED [ 98%]
tests/test_verify.py::TestValueComparison::test_nested_json_comparison PASSED [100%]
======================= 86 passed, 88 warnings in 0.66s ========================
```

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
The tool's schema-conversion module (452 lines, 16 KiB, Python) is shown below.
"""
|
|
RAGFlow-specific schema conversion from Elasticsearch to OceanBase.
|
|
|
|
This module handles the fixed RAGFlow table structure migration.
|
|
RAGFlow uses a predefined schema for both ES and OceanBase.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# RAGFlow fixed column definitions (from rag/utils/ob_conn.py)
|
|
# These are the actual columns used by RAGFlow
|
|
RAGFLOW_COLUMNS = {
|
|
# Primary identifiers
|
|
"id": {"ob_type": "String(256)", "nullable": False, "is_primary": True},
|
|
"kb_id": {"ob_type": "String(256)", "nullable": False, "index": True},
|
|
"doc_id": {"ob_type": "String(256)", "nullable": True, "index": True},
|
|
|
|
# Document metadata
|
|
"docnm_kwd": {"ob_type": "String(256)", "nullable": True}, # document name
|
|
"doc_type_kwd": {"ob_type": "String(256)", "nullable": True}, # document type
|
|
|
|
# Title fields
|
|
"title_tks": {"ob_type": "String(256)", "nullable": True}, # title tokens
|
|
"title_sm_tks": {"ob_type": "String(256)", "nullable": True}, # fine-grained title tokens
|
|
|
|
# Content fields
|
|
"content_with_weight": {"ob_type": "LONGTEXT", "nullable": True}, # original content
|
|
"content_ltks": {"ob_type": "LONGTEXT", "nullable": True}, # long text tokens
|
|
"content_sm_ltks": {"ob_type": "LONGTEXT", "nullable": True}, # fine-grained tokens
|
|
|
|
# Feature fields
|
|
"pagerank_fea": {"ob_type": "Integer", "nullable": True}, # page rank priority
|
|
|
|
# Array fields
|
|
"important_kwd": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True}, # keywords
|
|
"important_tks": {"ob_type": "TEXT", "nullable": True}, # keyword tokens
|
|
"question_kwd": {"ob_type": "ARRAY(String(1024))", "nullable": True, "is_array": True}, # questions
|
|
"question_tks": {"ob_type": "TEXT", "nullable": True}, # question tokens
|
|
"tag_kwd": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True}, # tags
|
|
"tag_feas": {"ob_type": "JSON", "nullable": True, "is_json": True}, # tag features
|
|
|
|
# Status fields
|
|
"available_int": {"ob_type": "Integer", "nullable": False, "default": 1},
|
|
|
|
# Time fields
|
|
"create_time": {"ob_type": "String(19)", "nullable": True},
|
|
"create_timestamp_flt": {"ob_type": "Double", "nullable": True},
|
|
|
|
# Image field
|
|
"img_id": {"ob_type": "String(128)", "nullable": True},
|
|
|
|
# Position fields (arrays)
|
|
"position_int": {"ob_type": "ARRAY(ARRAY(Integer))", "nullable": True, "is_array": True},
|
|
"page_num_int": {"ob_type": "ARRAY(Integer)", "nullable": True, "is_array": True},
|
|
"top_int": {"ob_type": "ARRAY(Integer)", "nullable": True, "is_array": True},
|
|
|
|
# Knowledge graph fields
|
|
"knowledge_graph_kwd": {"ob_type": "String(256)", "nullable": True, "index": True},
|
|
"source_id": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True},
|
|
"entity_kwd": {"ob_type": "String(256)", "nullable": True},
|
|
"entity_type_kwd": {"ob_type": "String(256)", "nullable": True, "index": True},
|
|
"from_entity_kwd": {"ob_type": "String(256)", "nullable": True},
|
|
"to_entity_kwd": {"ob_type": "String(256)", "nullable": True},
|
|
"weight_int": {"ob_type": "Integer", "nullable": True},
|
|
"weight_flt": {"ob_type": "Double", "nullable": True},
|
|
"entities_kwd": {"ob_type": "ARRAY(String(256))", "nullable": True, "is_array": True},
|
|
"rank_flt": {"ob_type": "Double", "nullable": True},
|
|
|
|
# Status
|
|
"removed_kwd": {"ob_type": "String(256)", "nullable": True, "index": True, "default": "N"},
|
|
|
|
# JSON fields
|
|
"metadata": {"ob_type": "JSON", "nullable": True, "is_json": True},
|
|
"extra": {"ob_type": "JSON", "nullable": True, "is_json": True},
|
|
|
|
# New columns
|
|
"_order_id": {"ob_type": "Integer", "nullable": True},
|
|
"group_id": {"ob_type": "String(256)", "nullable": True},
|
|
"mom_id": {"ob_type": "String(256)", "nullable": True},
|
|
}
|
|
|
|
# Array column names for special handling
|
|
ARRAY_COLUMNS = [
|
|
"important_kwd", "question_kwd", "tag_kwd", "source_id",
|
|
"entities_kwd", "position_int", "page_num_int", "top_int"
|
|
]
|
|
|
|
# JSON column names
|
|
JSON_COLUMNS = ["tag_feas", "metadata", "extra"]
|
|
|
|
# Fulltext search columns (for reference)
|
|
FTS_COLUMNS_ORIGIN = ["docnm_kwd", "content_with_weight", "important_tks", "question_tks"]
|
|
FTS_COLUMNS_TKS = ["title_tks", "title_sm_tks", "important_tks", "question_tks", "content_ltks", "content_sm_ltks"]
|
|
|
|
# Vector field pattern: q_{vector_size}_vec
|
|
VECTOR_FIELD_PATTERN = re.compile(r"q_(?P<vector_size>\d+)_vec")
|
|
|
|
|
|
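To make the naming convention concrete, here is a small illustrative snippet (not part of the module; the field names are invented):

```python
# Illustrative only: hypothetical field names, not part of the module.
m = VECTOR_FIELD_PATTERN.match("q_1024_vec")
assert m is not None and int(m.group("vector_size")) == 1024

# Names that don't follow q_{vector_size}_vec are rejected, so ordinary
# columns fall through to the fixed-schema / unknown-field handling.
assert VECTOR_FIELD_PATTERN.match("content_ltks") is None
```

The two converter classes follow.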
```python
class RAGFlowSchemaConverter:
    """
    Convert RAGFlow Elasticsearch documents to OceanBase format.

    RAGFlow uses a fixed schema, so this converter knows exactly
    what fields to expect and how to map them.
    """

    def __init__(self):
        self.vector_fields: list[dict[str, Any]] = []
        self.detected_vector_size: int | None = None

    def analyze_es_mapping(self, es_mapping: dict[str, Any]) -> dict[str, Any]:
        """
        Analyze ES mapping to extract vector field dimensions.

        Args:
            es_mapping: Elasticsearch index mapping

        Returns:
            Analysis result with detected fields
        """
        result = {
            "known_fields": [],
            "vector_fields": [],
            "unknown_fields": [],
        }

        properties = es_mapping.get("properties", {})

        for field_name, field_def in properties.items():
            # Check if it's a known RAGFlow field
            if field_name in RAGFLOW_COLUMNS:
                result["known_fields"].append(field_name)
            # Check if it's a vector field
            elif VECTOR_FIELD_PATTERN.match(field_name):
                match = VECTOR_FIELD_PATTERN.match(field_name)
                vec_size = int(match.group("vector_size"))
                result["vector_fields"].append({
                    "name": field_name,
                    "dimension": vec_size,
                })
                self.vector_fields.append({
                    "name": field_name,
                    "dimension": vec_size,
                })
                if self.detected_vector_size is None:
                    self.detected_vector_size = vec_size
            else:
                # Unknown field - might be custom field stored in 'extra'
                result["unknown_fields"].append(field_name)

        logger.info(
            f"Analyzed ES mapping: {len(result['known_fields'])} known fields, "
            f"{len(result['vector_fields'])} vector fields, "
            f"{len(result['unknown_fields'])} unknown fields"
        )

        return result

    def get_column_definitions(self) -> list[dict[str, Any]]:
        """
        Get RAGFlow column definitions for OceanBase table creation.

        Returns:
            List of column definitions
        """
        columns = []

        for col_name, col_def in RAGFLOW_COLUMNS.items():
            columns.append({
                "name": col_name,
                "ob_type": col_def["ob_type"],
                "nullable": col_def.get("nullable", True),
                "is_primary": col_def.get("is_primary", False),
                "index": col_def.get("index", False),
                "is_array": col_def.get("is_array", False),
                "is_json": col_def.get("is_json", False),
                "default": col_def.get("default"),
            })

        # Add detected vector fields
        for vec_field in self.vector_fields:
            columns.append({
                "name": vec_field["name"],
                "ob_type": f"VECTOR({vec_field['dimension']})",
                "nullable": True,
                "is_vector": True,
                "dimension": vec_field["dimension"],
            })

        return columns

    def get_vector_fields(self) -> list[dict[str, Any]]:
        """Get list of vector fields for index creation."""
        return self.vector_fields
```
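A minimal sketch of driving the schema converter, assuming a fabricated mapping fragment (the field names and the 768 dimension below are invented for illustration):

```python
# Illustrative only: a fabricated ES mapping fragment.
converter = RAGFlowSchemaConverter()
analysis = converter.analyze_es_mapping({
    "properties": {
        "doc_id": {"type": "keyword"},                       # known RAGFlow column
        "q_768_vec": {"type": "dense_vector", "dims": 768},  # detected as a vector field
        "my_custom_field": {"type": "keyword"},              # unknown -> reported
    }
})
# analysis == {"known_fields": ["doc_id"],
#              "vector_fields": [{"name": "q_768_vec", "dimension": 768}],
#              "unknown_fields": ["my_custom_field"]}

# The column list now holds every fixed RAGFlow column plus VECTOR(768).
columns = converter.get_column_definitions()
assert any(col["ob_type"] == "VECTOR(768)" for col in columns)
```

The data converter below performs the row-level counterpart of this analysis.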
```python
class RAGFlowDataConverter:
    """
    Convert RAGFlow ES documents to OceanBase row format.

    This converter handles the specific data transformations needed
    for RAGFlow's data structure.
    """

    def __init__(self):
        """Initialize data converter."""
        self.vector_fields: set[str] = set()

    def detect_vector_fields(self, doc: dict[str, Any]) -> None:
        """Detect vector fields from a sample document."""
        for key in doc.keys():
            if VECTOR_FIELD_PATTERN.match(key):
                self.vector_fields.add(key)

    def convert_document(self, es_doc: dict[str, Any]) -> dict[str, Any]:
        """
        Convert an ES document to OceanBase row format.

        Args:
            es_doc: Elasticsearch document (with _id and _source)

        Returns:
            Dictionary ready for OceanBase insertion
        """
        # Extract _id and _source
        doc_id = es_doc.get("_id")
        source = es_doc.get("_source", es_doc)

        row = {}

        # Set document ID
        if doc_id:
            row["id"] = str(doc_id)
        elif "id" in source:
            row["id"] = str(source["id"])

        # Process each field
        for field_name, field_def in RAGFLOW_COLUMNS.items():
            if field_name == "id":
                continue  # Already handled

            value = source.get(field_name)

            if value is None:
                # Use default if available
                default = field_def.get("default")
                if default is not None:
                    row[field_name] = default
                continue

            # Convert based on field type
            row[field_name] = self._convert_field_value(
                field_name, value, field_def
            )

        # Handle vector fields
        for key, value in source.items():
            if VECTOR_FIELD_PATTERN.match(key):
                if isinstance(value, list):
                    row[key] = value
                    self.vector_fields.add(key)

        # Handle unknown fields -> store in 'extra'
        extra_fields = {}
        for key, value in source.items():
            if key not in RAGFLOW_COLUMNS and not VECTOR_FIELD_PATTERN.match(key):
                extra_fields[key] = value

        if extra_fields:
            existing_extra = row.get("extra")
            if existing_extra and isinstance(existing_extra, dict):
                existing_extra.update(extra_fields)
            else:
                row["extra"] = json.dumps(extra_fields, ensure_ascii=False)

        return row
```
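Run against a hand-made hit, `convert_document` exercises most of these rules (the document is invented; the expected values follow from the helper methods defined next in the listing):

```python
# Illustrative only: a hand-made ES hit, not real data.
converter = RAGFlowDataConverter()
row = converter.convert_document({
    "_id": "chunk-1",
    "_source": {
        "kb_id": ["kb-42"],                  # list -> first element (see _convert_string)
        "content_with_weight": "hello world",
        "important_kwd": ["alpha", "beta"],  # array column -> JSON string
        "my_custom_field": "kept",           # unknown -> folded into "extra"
        "q_768_vec": [0.1] * 768,            # vector passed through as a list
    },
})
assert row["id"] == "chunk-1"
assert row["kb_id"] == "kb-42"
assert row["important_kwd"] == '["alpha", "beta"]'
assert row["available_int"] == 1 and row["removed_kwd"] == "N"  # defaults applied
assert row["extra"] == '{"my_custom_field": "kept"}'
```

The per-field conversion helpers that `convert_document` relies on follow.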
```python
    def _convert_field_value(
        self,
        field_name: str,
        value: Any,
        field_def: dict[str, Any]
    ) -> Any:
        """
        Convert a field value to the appropriate format for OceanBase.

        Args:
            field_name: Field name
            value: Original value from ES
            field_def: Field definition from RAGFLOW_COLUMNS

        Returns:
            Converted value
        """
        if value is None:
            return None

        ob_type = field_def.get("ob_type", "")
        is_array = field_def.get("is_array", False)
        is_json = field_def.get("is_json", False)

        # Handle array fields
        if is_array:
            return self._convert_array_value(value)

        # Handle JSON fields
        if is_json:
            return self._convert_json_value(value)

        # Handle specific types
        if "Integer" in ob_type:
            return self._convert_integer(value)

        if "Double" in ob_type or "Float" in ob_type:
            return self._convert_float(value)

        if "LONGTEXT" in ob_type or "TEXT" in ob_type:
            return self._convert_text(value)

        if "String" in ob_type:
            return self._convert_string(value, field_name)

        # Default: convert to string
        return str(value) if value is not None else None

    def _convert_array_value(self, value: Any) -> str | None:
        """Convert array value to JSON string for OceanBase."""
        if value is None:
            return None

        if isinstance(value, str):
            # Already a JSON string
            try:
                # Validate it's valid JSON
                json.loads(value)
                return value
            except json.JSONDecodeError:
                # Not valid JSON, wrap in array
                return json.dumps([value], ensure_ascii=False)

        if isinstance(value, list):
            # Clean array values
            cleaned = []
            for item in value:
                if isinstance(item, str):
                    # Clean special characters
                    cleaned_str = item.strip()
                    cleaned_str = cleaned_str.replace('\\', '\\\\')
                    cleaned_str = cleaned_str.replace('\n', '\\n')
                    cleaned_str = cleaned_str.replace('\r', '\\r')
                    cleaned_str = cleaned_str.replace('\t', '\\t')
                    cleaned.append(cleaned_str)
                else:
                    cleaned.append(item)
            return json.dumps(cleaned, ensure_ascii=False)

        # Single value - wrap in array
        return json.dumps([value], ensure_ascii=False)

    def _convert_json_value(self, value: Any) -> str | None:
        """Convert JSON value to string for OceanBase."""
        if value is None:
            return None

        if isinstance(value, str):
            # Already a string, validate JSON
            try:
                json.loads(value)
                return value
            except json.JSONDecodeError:
                # Not valid JSON, return as-is
                return value

        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)

        return str(value)
```
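The escaping branch in `_convert_array_value` is easiest to see with concrete values (illustrative, not taken from the test suite):

```python
# Illustrative only.
converter = RAGFlowDataConverter()

# Control characters inside array items are escaped before serialization.
print(converter._convert_array_value(["line1\nline2", 7]))
# -> ["line1\\nline2", 7]

# A bare scalar is wrapped so the column always holds a JSON array; a string
# that already parses as JSON is stored verbatim instead.
print(converter._convert_array_value("solo"))
# -> ["solo"]
```

The remaining scalar helpers and the batch entry point close out the class.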
```python
    def _convert_integer(self, value: Any) -> int | None:
        """Convert to integer."""
        if value is None:
            return None

        if isinstance(value, bool):
            return 1 if value else 0

        try:
            return int(value)
        except (ValueError, TypeError):
            return None

    def _convert_float(self, value: Any) -> float | None:
        """Convert to float."""
        if value is None:
            return None

        try:
            return float(value)
        except (ValueError, TypeError):
            return None

    def _convert_text(self, value: Any) -> str | None:
        """Convert to text/longtext."""
        if value is None:
            return None

        if isinstance(value, dict):
            # content_with_weight might be stored as dict
            return json.dumps(value, ensure_ascii=False)

        if isinstance(value, list):
            return json.dumps(value, ensure_ascii=False)

        return str(value)

    def _convert_string(self, value: Any, field_name: str) -> str | None:
        """Convert to string with length considerations."""
        if value is None:
            return None

        # Handle kb_id which might be a list in ES
        if field_name == "kb_id" and isinstance(value, list):
            return str(value[0]) if value else None

        if isinstance(value, (dict, list)):
            return json.dumps(value, ensure_ascii=False)

        return str(value)

    def convert_batch(self, es_docs: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """
        Convert a batch of ES documents.

        Args:
            es_docs: List of Elasticsearch documents

        Returns:
            List of dictionaries ready for OceanBase insertion
        """
        return [self.convert_document(doc) for doc in es_docs]
```
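The scalar helpers are deliberately forgiving; a few invented values show their edge cases:

```python
# Illustrative only.
converter = RAGFlowDataConverter()
assert converter._convert_integer(True) == 1               # booleans become 0/1
assert converter._convert_integer("not a number") is None  # bad input -> NULL
assert converter._convert_string(["kb-1", "kb-2"], "kb_id") == "kb-1"
assert converter._convert_text({"text": "x"}) == '{"text": "x"}'
```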
```python
# Backwards compatibility aliases
SchemaConverter = RAGFlowSchemaConverter
DataConverter = RAGFlowDataConverter
```
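Putting the two classes together, a per-index migration loop might look roughly like the sketch below. Everything outside the converters is hypothetical glue: `get_mapping`, `scan_batches`, `create_table`, and `insert_rows` stand in for real Elasticsearch/OceanBase client calls that this module does not define.

```python
# Hedged sketch: the four callables are hypothetical stand-ins for real
# ES/OceanBase client operations, injected to keep the example self-contained.
from collections.abc import Callable, Iterable
from typing import Any


def migrate_index(
    index_name: str,
    get_mapping: Callable[[str], dict[str, Any]],
    scan_batches: Callable[[str], Iterable[list[dict[str, Any]]]],
    create_table: Callable[[str, list[dict[str, Any]]], None],
    insert_rows: Callable[[str, list[dict[str, Any]]], None],
) -> int:
    """Create the target table from the analyzed mapping, then copy rows."""
    schema = RAGFlowSchemaConverter()
    schema.analyze_es_mapping(get_mapping(index_name))
    create_table(index_name, schema.get_column_definitions())

    data = RAGFlowDataConverter()
    migrated = 0
    for batch in scan_batches(index_name):  # each batch: a list of ES hits
        insert_rows(index_name, data.convert_batch(batch))
        migrated += len(batch)
    return migrated
```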