Files
ragflow/tools/es-to-oceanbase-migration/tests/test_verify.py
Se7en 332b11cf96 feat(tools): add Elasticsearch to OceanBase migration tool (#12927)
### What problem does this PR solve?

Fixes https://github.com/infiniflow/ragflow/issues/12774

Add a CLI tool for migrating RAGFlow data from Elasticsearch to
OceanBase, enabling users to switch their document storage backend.

- Automatic discovery and migration of all `ragflow_*` indices
- Schema conversion with vector dimension auto-detection
- Batch processing with progress tracking and resume capability
- Data consistency validation and migration report generation

**Note**: Due to network issues, I was unable to pull the required
Docker images (Elasticsearch, OceanBase) to run the full end-to-end
verification. Unit tests have been verified to pass. I will complete the
e2e verification when network conditions allow, and submit a follow-up
PR if any fixes are needed.

```bash
============================= test session starts ==============================
platform darwin -- Python 3.13.6, pytest-9.0.2, pluggy-1.6.0
rootdir: /Users/sevenc/code/ai/oceanbase/ragflow/tools/es-to-oceanbase-migration
configfile: pyproject.toml
testpaths: tests
plugins: anyio-4.12.1, asyncio-1.3.0, cov-7.0.0
collected 86 items

tests/test_progress.py::TestMigrationProgress::test_create_basic_progress PASSED [  1%]
tests/test_progress.py::TestMigrationProgress::test_create_progress_with_counts PASSED [  2%]
tests/test_progress.py::TestMigrationProgress::test_progress_default_values PASSED [  3%]
tests/test_progress.py::TestMigrationProgress::test_progress_status_values PASSED [  4%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager PASSED [  5%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager_creates_dir PASSED [  6%]
tests/test_progress.py::TestProgressManager::test_create_progress PASSED [  8%]
tests/test_progress.py::TestProgressManager::test_save_and_load_progress PASSED [  9%]
tests/test_progress.py::TestProgressManager::test_load_nonexistent_progress PASSED [ 10%]
tests/test_progress.py::TestProgressManager::test_delete_progress PASSED [ 11%]
tests/test_progress.py::TestProgressManager::test_update_progress PASSED [ 12%]
tests/test_progress.py::TestProgressManager::test_update_progress_multiple_batches PASSED [ 13%]
tests/test_progress.py::TestProgressManager::test_mark_completed PASSED  [ 15%]
tests/test_progress.py::TestProgressManager::test_mark_failed PASSED     [ 16%]
tests/test_progress.py::TestProgressManager::test_mark_paused PASSED     [ 17%]
tests/test_progress.py::TestProgressManager::test_can_resume_running PASSED [ 18%]
tests/test_progress.py::TestProgressManager::test_can_resume_paused PASSED [ 19%]
tests/test_progress.py::TestProgressManager::test_can_resume_completed PASSED [ 20%]
tests/test_progress.py::TestProgressManager::test_can_resume_nonexistent PASSED [ 22%]
tests/test_progress.py::TestProgressManager::test_get_resume_info PASSED [ 23%]
tests/test_progress.py::TestProgressManager::test_get_resume_info_nonexistent PASSED [ 24%]
tests/test_progress.py::TestProgressManager::test_progress_file_path PASSED [ 25%]
tests/test_progress.py::TestProgressManager::test_progress_file_content PASSED [ 26%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_analyze_ragflow_mapping PASSED [ 27%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_detect_vector_size PASSED [ 29%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_unknown_fields PASSED [ 30%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_get_column_definitions PASSED [ 31%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_basic_document PASSED [ 32%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_with_vector PASSED [ 33%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_array_fields PASSED [ 34%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_json_fields PASSED [ 36%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_unknown_fields_to_extra PASSED [ 37%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_kb_id_list PASSED [ 38%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_content_with_weight_dict PASSED [ 39%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_batch PASSED [ 40%]
tests/test_schema.py::TestVectorFieldPattern::test_valid_patterns PASSED [ 41%]
tests/test_schema.py::TestVectorFieldPattern::test_invalid_patterns PASSED [ 43%]
tests/test_schema.py::TestVectorFieldPattern::test_extract_dimension PASSED [ 44%]
tests/test_schema.py::TestConstants::test_array_columns PASSED           [ 45%]
tests/test_schema.py::TestConstants::test_json_columns PASSED            [ 46%]
tests/test_schema.py::TestConstants::test_ragflow_columns_completeness PASSED [ 47%]
tests/test_schema.py::TestConstants::test_fts_columns PASSED             [ 48%]
tests/test_schema.py::TestConstants::test_ragflow_columns_types PASSED   [ 50%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_empty_mapping PASSED [ 51%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_mapping_without_properties PASSED [ 52%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_multiple_vector_fields PASSED [ 53%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_column_definitions_without_analysis PASSED [ 54%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_vector_fields PASSED [ 55%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_empty_document PASSED [ 56%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_document_without_source PASSED [ 58%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_boolean_to_integer PASSED [ 59%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_invalid_integer PASSED [ 60%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_float_field PASSED [ 61%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_array_with_special_characters PASSED [ 62%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_already_json_array PASSED [ 63%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_single_value_to_array PASSED [ 65%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_detect_vector_fields_from_document PASSED [ 66%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_with_default_values PASSED [ 67%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_list_content PASSED [ 68%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_batch_empty PASSED [ 69%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_existing_extra_field_merged PASSED [ 70%]
tests/test_verify.py::TestVerificationResult::test_create_basic_result PASSED [ 72%]
tests/test_verify.py::TestVerificationResult::test_result_default_values PASSED [ 73%]
tests/test_verify.py::TestVerificationResult::test_result_with_counts PASSED [ 74%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_match PASSED [ 75%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_mismatch PASSED [ 76%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_all_match PASSED [ 77%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_some_missing PASSED [ 79%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_data_mismatch PASSED [ 80%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_none_values PASSED [ 81%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_array_columns PASSED [ 82%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_json_columns PASSED [ 83%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_kb_id_list PASSED [ 84%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_content_with_weight_dict PASSED [ 86%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_passed PASSED [ 87%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_count PASSED [ 88%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_samples PASSED [ 89%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report PASSED [ 90%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_missing PASSED [ 91%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_mismatches PASSED [ 93%]
tests/test_verify.py::TestValueComparison::test_string_comparison PASSED [ 94%]
tests/test_verify.py::TestValueComparison::test_integer_comparison PASSED [ 95%]
tests/test_verify.py::TestValueComparison::test_float_comparison PASSED  [ 96%]
tests/test_verify.py::TestValueComparison::test_boolean_comparison PASSED [ 97%]
tests/test_verify.py::TestValueComparison::test_empty_array_comparison PASSED [ 98%]
tests/test_verify.py::TestValueComparison::test_nested_json_comparison PASSED [100%]

======================= 86 passed, 88 warnings in 0.66s ========================
```

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
2026-01-31 16:11:27 +08:00

386 lines
14 KiB
Python

"""
Tests for migration verification.
"""
import json
import pytest
from unittest.mock import Mock, MagicMock
from es_ob_migration.verify import MigrationVerifier, VerificationResult
class TestVerificationResult:
    """Checks construction of VerificationResult and its initial field values."""

    def test_create_basic_result(self):
        """With only the index/table names given, names are kept and counts start at 0."""
        outcome = VerificationResult(es_index="ragflow_test", ob_table="ragflow_test")

        assert outcome.es_index == "ragflow_test"
        assert outcome.ob_table == "ragflow_test"
        assert outcome.es_count == 0
        assert outcome.ob_count == 0
        assert outcome.passed is False

    def test_result_default_values(self):
        """Every field not supplied to the constructor has its empty/zero default."""
        defaults = VerificationResult(es_index="test", ob_table="test")

        assert defaults.count_match is False
        # The four integer counters are all expected to start at zero.
        for counter in ("count_diff", "sample_size", "samples_verified", "samples_matched"):
            assert getattr(defaults, counter) == 0
        assert defaults.sample_match_rate == 0.0
        assert defaults.missing_in_ob == []
        assert defaults.data_mismatches == []
        assert defaults.message == ""

    def test_result_with_counts(self):
        """Count fields passed to the constructor are stored unchanged."""
        fields = {
            "es_index": "test",
            "ob_table": "test",
            "es_count": 1000,
            "ob_count": 1000,
            "count_match": True,
        }
        outcome = VerificationResult(**fields)

        assert outcome.es_count == 1000
        assert outcome.ob_count == 1000
        assert outcome.count_match is True
class TestMigrationVerifier:
    """Exercise MigrationVerifier against mocked ES and OB clients."""

    @pytest.fixture
    def mock_es_client(self):
        """Mock ES client: count_documents returns 100, get_sample_documents returns []."""
        return Mock(
            count_documents=Mock(return_value=100),
            get_sample_documents=Mock(return_value=[]),
        )

    @pytest.fixture
    def mock_ob_client(self):
        """Mock OB client: count_rows returns 100, get_row_by_id returns None."""
        return Mock(
            count_rows=Mock(return_value=100),
            get_row_by_id=Mock(return_value=None),
        )

    @pytest.fixture
    def verifier(self, mock_es_client, mock_ob_client):
        """MigrationVerifier wired to the two mock clients above."""
        return MigrationVerifier(mock_es_client, mock_ob_client)

    def test_verify_counts_match(self, mock_es_client, mock_ob_client):
        """Equal ES and OB counts yield count_match True and a zero diff."""
        mock_es_client.count_documents.return_value = 1000
        mock_es_client.get_sample_documents.return_value = []
        mock_ob_client.count_rows.return_value = 1000

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "ragflow_test", "ragflow_test", sample_size=0
        )

        assert outcome.es_count == 1000
        assert outcome.ob_count == 1000
        assert outcome.count_match is True
        assert outcome.count_diff == 0

    def test_verify_counts_mismatch(self, mock_es_client, mock_ob_client):
        """1000 ES documents vs 950 OB rows yields count_match False and diff 50."""
        mock_es_client.count_documents.return_value = 1000
        mock_es_client.get_sample_documents.return_value = []
        mock_ob_client.count_rows.return_value = 950

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "ragflow_test", "ragflow_test", sample_size=0
        )

        assert outcome.es_count == 1000
        assert outcome.ob_count == 950
        assert outcome.count_match is False
        assert outcome.count_diff == 50

    def test_verify_samples_all_match(self, mock_es_client, mock_ob_client):
        """Ten ES samples that all have an identical OB row give a 1.0 match rate."""
        # ES side: ten sample documents doc_0 .. doc_9.
        sampled_docs = []
        for i in range(10):
            sampled_docs.append(
                {
                    "_id": f"doc_{i}",
                    "id": f"doc_{i}",
                    "kb_id": "kb_001",
                    "content_with_weight": f"content_{i}",
                }
            )
        mock_es_client.count_documents.return_value = 100
        mock_es_client.get_sample_documents.return_value = sampled_docs

        # OB side: rebuild the matching row from the numeric suffix of the id.
        def fetch_row(table, doc_id):
            suffix = doc_id.split("_")[1]
            return {
                "id": doc_id,
                "kb_id": "kb_001",
                "content_with_weight": f"content_{suffix}",
            }

        mock_ob_client.count_rows.return_value = 100
        mock_ob_client.get_row_by_id.side_effect = fetch_row

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "ragflow_test", "ragflow_test", sample_size=10
        )

        assert outcome.samples_verified == 10
        assert outcome.samples_matched == 10
        assert outcome.sample_match_rate == 1.0

    def test_verify_samples_some_missing(self, mock_es_client, mock_ob_client):
        """Three of ten sampled ids have no OB row and are reported in missing_in_ob."""
        sampled_docs = []
        for i in range(10):
            sampled_docs.append({"_id": f"doc_{i}", "id": f"doc_{i}", "kb_id": "kb_001"})
        mock_es_client.count_documents.return_value = 100
        mock_es_client.get_sample_documents.return_value = sampled_docs

        # OB side: rows exist only for doc_0 .. doc_6; the rest look missing.
        def fetch_row(table, doc_id):
            position = int(doc_id.split("_")[1])
            if position >= 7:
                return None
            return {"id": doc_id, "kb_id": "kb_001"}

        mock_ob_client.count_rows.return_value = 100
        mock_ob_client.get_row_by_id.side_effect = fetch_row

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "ragflow_test", "ragflow_test", sample_size=10
        )

        assert outcome.samples_verified == 10
        assert outcome.samples_matched == 7
        assert len(outcome.missing_in_ob) == 3

    def test_verify_samples_data_mismatch(self, mock_es_client, mock_ob_client):
        """A sampled document whose OB row differs is recorded as one data mismatch."""
        es_doc = {"_id": "doc_1", "id": "doc_1", "kb_id": "kb_001", "available_int": 1}
        # Same id, but kb_id and available_int differ from the ES document.
        ob_row = {"id": "doc_1", "kb_id": "kb_002", "available_int": 0}

        mock_es_client.count_documents.return_value = 100
        mock_es_client.get_sample_documents.return_value = [es_doc]
        mock_ob_client.count_rows.return_value = 100
        mock_ob_client.get_row_by_id.return_value = ob_row

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "ragflow_test", "ragflow_test", sample_size=1
        )

        assert outcome.samples_verified == 1
        assert outcome.samples_matched == 0
        assert len(outcome.data_mismatches) == 1

    def test_values_equal_none_values(self, verifier):
        """None equals only None; None against a real value is unequal either way."""
        both_none = verifier._values_equal("field", None, None)
        ob_none = verifier._values_equal("field", "value", None)
        es_none = verifier._values_equal("field", None, "value")

        assert both_none is True
        assert ob_none is False
        assert es_none is False

    def test_values_equal_array_columns(self, verifier):
        """A list on the ES side equals its JSON-string form on the OB side."""
        ob_json = '["key1", "key2"]'

        same_order = verifier._values_equal("important_kwd", ["key1", "key2"], ob_json)
        assert same_order is True

        # Element order is expected not to affect the comparison.
        swapped_order = verifier._values_equal("important_kwd", ["key2", "key1"], ob_json)
        assert swapped_order is True

    def test_values_equal_json_columns(self, verifier):
        """A dict on the ES side equals its JSON-string form on the OB side."""
        es_value = {"author": "John"}
        ob_value = '{"author": "John"}'

        assert verifier._values_equal("metadata", es_value, ob_value) is True

    def test_values_equal_kb_id_list(self, verifier):
        """A kb_id list on the ES side is expected to equal the single OB string."""
        es_value = ["kb_001", "kb_002"]
        ob_value = "kb_001"

        assert verifier._values_equal("kb_id", es_value, ob_value) is True

    def test_values_equal_content_with_weight_dict(self, verifier):
        """A content_with_weight dict equals the JSON string held on the OB side."""
        es_value = {"text": "content", "weight": 1.0}
        ob_value = '{"text": "content", "weight": 1.0}'

        assert verifier._values_equal("content_with_weight", es_value, ob_value) is True

    def test_determine_result_passed(self, mock_es_client, mock_ob_client):
        """Matching counts plus 100 matching samples give a PASSED result."""
        mock_es_client.count_documents.return_value = 1000
        mock_ob_client.count_rows.return_value = 1000

        sampled_docs = []
        for i in range(100):
            sampled_docs.append({"_id": f"doc_{i}", "id": f"doc_{i}", "kb_id": "kb_001"})
        mock_es_client.get_sample_documents.return_value = sampled_docs

        # Every lookup returns a row with the requested id and the same kb_id.
        def matching_row(t, d):
            return {"id": d, "kb_id": "kb_001"}

        mock_ob_client.get_row_by_id.side_effect = matching_row

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "test", "test", sample_size=100
        )

        assert outcome.passed is True
        assert "PASSED" in outcome.message

    def test_determine_result_failed_count(self, mock_es_client, mock_ob_client):
        """1000 ES documents vs 500 OB rows gives a FAILED result."""
        mock_es_client.count_documents.return_value = 1000
        mock_es_client.get_sample_documents.return_value = []
        mock_ob_client.count_rows.return_value = 500  # Large gap between the two stores

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "test", "test", sample_size=0
        )

        assert outcome.passed is False
        assert "FAILED" in outcome.message

    def test_determine_result_failed_samples(self, mock_es_client, mock_ob_client):
        """Equal counts but every sampled row missing from OB means not passed."""
        mock_es_client.count_documents.return_value = 100
        mock_ob_client.count_rows.return_value = 100

        sampled_docs = []
        for i in range(10):
            sampled_docs.append({"_id": f"doc_{i}", "id": f"doc_{i}"})
        mock_es_client.get_sample_documents.return_value = sampled_docs
        mock_ob_client.get_row_by_id.return_value = None  # No row found for any id

        outcome = MigrationVerifier(mock_es_client, mock_ob_client).verify(
            "test", "test", sample_size=10
        )

        assert outcome.passed is False

    def test_generate_report(self, verifier):
        """The report for a passing result shows names, counts, rate and status."""
        fields = {
            "es_index": "ragflow_test",
            "ob_table": "ragflow_test",
            "es_count": 1000,
            "ob_count": 1000,
            "count_match": True,
            "count_diff": 0,
            "sample_size": 100,
            "samples_verified": 100,
            "samples_matched": 100,
            "sample_match_rate": 1.0,
            "passed": True,
            "message": "Verification PASSED",
        }
        report = verifier.generate_report(VerificationResult(**fields))

        for fragment in ("ragflow_test", "1,000", "PASSED", "100.00%"):
            assert fragment in report

    def test_generate_report_with_missing(self, verifier):
        """The report names documents missing from OB and shows the FAILED status."""
        fields = {
            "es_index": "test",
            "ob_table": "test",
            "es_count": 100,
            "ob_count": 95,
            "count_match": False,
            "count_diff": 5,
            "sample_size": 10,
            "samples_verified": 10,
            "samples_matched": 8,
            "sample_match_rate": 0.8,
            "missing_in_ob": ["doc_1", "doc_2"],
            "passed": False,
            "message": "Verification FAILED",
        }
        report = verifier.generate_report(VerificationResult(**fields))

        for fragment in ("Missing in OceanBase", "doc_1", "FAILED"):
            assert fragment in report

    def test_generate_report_with_mismatches(self, verifier):
        """The report has a mismatch section naming the document id and field."""
        kb_id_difference = {"field": "kb_id", "es_value": "kb_001", "ob_value": "kb_002"}
        fields = {
            "es_index": "test",
            "ob_table": "test",
            "es_count": 100,
            "ob_count": 100,
            "count_match": True,
            "sample_size": 10,
            "samples_verified": 10,
            "samples_matched": 8,
            "sample_match_rate": 0.8,
            "data_mismatches": [{"id": "doc_1", "differences": [kb_id_difference]}],
            "passed": False,
            "message": "Verification FAILED",
        }
        report = verifier.generate_report(VerificationResult(**fields))

        for fragment in ("Data Mismatches", "doc_1", "kb_id"):
            assert fragment in report
class TestValueComparison:
    """Edge cases for MigrationVerifier._values_equal on scalar and JSON values."""

    @pytest.fixture
    def verifier(self):
        """MigrationVerifier built on two plain Mock clients."""
        es_stub = Mock()
        ob_stub = Mock()
        return MigrationVerifier(es_stub, ob_stub)

    def test_string_comparison(self, verifier):
        """Identical strings are equal; different strings are not."""
        same = verifier._values_equal("field", "value", "value")
        different = verifier._values_equal("field", "value1", "value2")

        assert same is True
        assert different is False

    def test_integer_comparison(self, verifier):
        """An int equals its decimal string in either argument position."""
        int_on_es_side = verifier._values_equal("field", 123, "123")
        int_on_ob_side = verifier._values_equal("field", "123", 123)

        assert int_on_es_side is True
        assert int_on_ob_side is True

    def test_float_comparison(self, verifier):
        """A float equals its string form."""
        outcome = verifier._values_equal("field", 1.5, "1.5")

        assert outcome is True

    def test_boolean_comparison(self, verifier):
        """True and False equal the strings "True" and "False" respectively."""
        true_case = verifier._values_equal("field", True, "True")
        false_case = verifier._values_equal("field", False, "False")

        assert true_case is True
        assert false_case is True

    def test_empty_array_comparison(self, verifier):
        """An empty list equals the JSON string "[]" for the important_kwd column."""
        outcome = verifier._values_equal("important_kwd", [], "[]")

        assert outcome is True

    def test_nested_json_comparison(self, verifier):
        """A nested dict equals its JSON-string form for the metadata column."""
        ob_value = '{"nested": {"key": "value"}}'
        es_value = {"nested": {"key": "value"}}

        outcome = verifier._values_equal("metadata", es_value, ob_value)

        assert outcome is True