feat(tools): add Elasticsearch to OceanBase migration tool (#12927)

### What problem does this PR solve?

fixes https://github.com/infiniflow/ragflow/issues/12774

Add a CLI tool for migrating RAGFlow data from Elasticsearch to
OceanBase, enabling users to switch their document storage backend.

- Automatic discovery and migration of all `ragflow_*` indices
- Schema conversion with vector dimension auto-detection
- Batch processing with progress tracking and resume capability
- Data consistency validation and migration report generation

**Note**: Due to network issues, I was unable to pull the required
Docker images (Elasticsearch, OceanBase) to run the full end-to-end
verification. Unit tests have been verified to pass. I will complete the
e2e verification when network conditions allow, and submit a follow-up
PR if any fixes are needed.

```bash
============================= test session starts ==============================
platform darwin -- Python 3.13.6, pytest-9.0.2, pluggy-1.6.0
rootdir: /Users/sevenc/code/ai/oceanbase/ragflow/tools/es-to-oceanbase-migration
configfile: pyproject.toml
testpaths: tests
plugins: anyio-4.12.1, asyncio-1.3.0, cov-7.0.0
collected 86 items

tests/test_progress.py::TestMigrationProgress::test_create_basic_progress PASSED [  1%]
tests/test_progress.py::TestMigrationProgress::test_create_progress_with_counts PASSED [  2%]
tests/test_progress.py::TestMigrationProgress::test_progress_default_values PASSED [  3%]
tests/test_progress.py::TestMigrationProgress::test_progress_status_values PASSED [  4%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager PASSED [  5%]
tests/test_progress.py::TestProgressManager::test_create_progress_manager_creates_dir PASSED [  6%]
tests/test_progress.py::TestProgressManager::test_create_progress PASSED [  8%]
tests/test_progress.py::TestProgressManager::test_save_and_load_progress PASSED [  9%]
tests/test_progress.py::TestProgressManager::test_load_nonexistent_progress PASSED [ 10%]
tests/test_progress.py::TestProgressManager::test_delete_progress PASSED [ 11%]
tests/test_progress.py::TestProgressManager::test_update_progress PASSED [ 12%]
tests/test_progress.py::TestProgressManager::test_update_progress_multiple_batches PASSED [ 13%]
tests/test_progress.py::TestProgressManager::test_mark_completed PASSED  [ 15%]
tests/test_progress.py::TestProgressManager::test_mark_failed PASSED     [ 16%]
tests/test_progress.py::TestProgressManager::test_mark_paused PASSED     [ 17%]
tests/test_progress.py::TestProgressManager::test_can_resume_running PASSED [ 18%]
tests/test_progress.py::TestProgressManager::test_can_resume_paused PASSED [ 19%]
tests/test_progress.py::TestProgressManager::test_can_resume_completed PASSED [ 20%]
tests/test_progress.py::TestProgressManager::test_can_resume_nonexistent PASSED [ 22%]
tests/test_progress.py::TestProgressManager::test_get_resume_info PASSED [ 23%]
tests/test_progress.py::TestProgressManager::test_get_resume_info_nonexistent PASSED [ 24%]
tests/test_progress.py::TestProgressManager::test_progress_file_path PASSED [ 25%]
tests/test_progress.py::TestProgressManager::test_progress_file_content PASSED [ 26%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_analyze_ragflow_mapping PASSED [ 27%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_detect_vector_size PASSED [ 29%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_unknown_fields PASSED [ 30%]
tests/test_schema.py::TestRAGFlowSchemaConverter::test_get_column_definitions PASSED [ 31%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_basic_document PASSED [ 32%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_with_vector PASSED [ 33%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_array_fields PASSED [ 34%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_json_fields PASSED [ 36%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_unknown_fields_to_extra PASSED [ 37%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_kb_id_list PASSED [ 38%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_content_with_weight_dict PASSED [ 39%]
tests/test_schema.py::TestRAGFlowDataConverter::test_convert_batch PASSED [ 40%]
tests/test_schema.py::TestVectorFieldPattern::test_valid_patterns PASSED [ 41%]
tests/test_schema.py::TestVectorFieldPattern::test_invalid_patterns PASSED [ 43%]
tests/test_schema.py::TestVectorFieldPattern::test_extract_dimension PASSED [ 44%]
tests/test_schema.py::TestConstants::test_array_columns PASSED           [ 45%]
tests/test_schema.py::TestConstants::test_json_columns PASSED            [ 46%]
tests/test_schema.py::TestConstants::test_ragflow_columns_completeness PASSED [ 47%]
tests/test_schema.py::TestConstants::test_fts_columns PASSED             [ 48%]
tests/test_schema.py::TestConstants::test_ragflow_columns_types PASSED   [ 50%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_empty_mapping PASSED [ 51%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_mapping_without_properties PASSED [ 52%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_multiple_vector_fields PASSED [ 53%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_column_definitions_without_analysis PASSED [ 54%]
tests/test_schema.py::TestRAGFlowSchemaConverterEdgeCases::test_get_vector_fields PASSED [ 55%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_empty_document PASSED [ 56%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_document_without_source PASSED [ 58%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_boolean_to_integer PASSED [ 59%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_invalid_integer PASSED [ 60%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_float_field PASSED [ 61%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_array_with_special_characters PASSED [ 62%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_already_json_array PASSED [ 63%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_single_value_to_array PASSED [ 65%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_detect_vector_fields_from_document PASSED [ 66%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_with_default_values PASSED [ 67%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_list_content PASSED [ 68%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_convert_batch_empty PASSED [ 69%]
tests/test_schema.py::TestRAGFlowDataConverterEdgeCases::test_existing_extra_field_merged PASSED [ 70%]
tests/test_verify.py::TestVerificationResult::test_create_basic_result PASSED [ 72%]
tests/test_verify.py::TestVerificationResult::test_result_default_values PASSED [ 73%]
tests/test_verify.py::TestVerificationResult::test_result_with_counts PASSED [ 74%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_match PASSED [ 75%]
tests/test_verify.py::TestMigrationVerifier::test_verify_counts_mismatch PASSED [ 76%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_all_match PASSED [ 77%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_some_missing PASSED [ 79%]
tests/test_verify.py::TestMigrationVerifier::test_verify_samples_data_mismatch PASSED [ 80%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_none_values PASSED [ 81%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_array_columns PASSED [ 82%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_json_columns PASSED [ 83%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_kb_id_list PASSED [ 84%]
tests/test_verify.py::TestMigrationVerifier::test_values_equal_content_with_weight_dict PASSED [ 86%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_passed PASSED [ 87%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_count PASSED [ 88%]
tests/test_verify.py::TestMigrationVerifier::test_determine_result_failed_samples PASSED [ 89%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report PASSED [ 90%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_missing PASSED [ 91%]
tests/test_verify.py::TestMigrationVerifier::test_generate_report_with_mismatches PASSED [ 93%]
tests/test_verify.py::TestValueComparison::test_string_comparison PASSED [ 94%]
tests/test_verify.py::TestValueComparison::test_integer_comparison PASSED [ 95%]
tests/test_verify.py::TestValueComparison::test_float_comparison PASSED  [ 96%]
tests/test_verify.py::TestValueComparison::test_boolean_comparison PASSED [ 97%]
tests/test_verify.py::TestValueComparison::test_empty_array_comparison PASSED [ 98%]
tests/test_verify.py::TestValueComparison::test_nested_json_comparison PASSED [100%]

======================= 86 passed, 88 warnings in 0.66s ========================
```

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [ ] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):
This commit is contained in:
Se7en
2026-01-31 16:11:27 +08:00
committed by GitHub
parent c4c3f744c0
commit 332b11cf96
15 changed files with 5606 additions and 0 deletions

View File

@ -0,0 +1 @@
# Tests for ES to OceanBase migration tool

View File

@ -0,0 +1,321 @@
"""
Tests for progress tracking and resume capability.
"""
import json
import os
import tempfile
import pytest
from pathlib import Path
from datetime import datetime
from es_ob_migration.progress import MigrationProgress, ProgressManager
class TestMigrationProgress:
    """Unit tests for the MigrationProgress dataclass."""

    def test_create_basic_progress(self):
        """Constructing with only index/table yields zeroed counters and set timestamps."""
        prog = MigrationProgress(es_index="ragflow_test", ob_table="ragflow_test")
        assert prog.es_index == "ragflow_test"
        assert prog.ob_table == "ragflow_test"
        assert prog.total_documents == 0
        assert prog.migrated_documents == 0
        assert prog.status == "pending"
        # Timestamps are auto-populated, never left blank.
        assert prog.started_at != ""
        assert prog.updated_at != ""

    def test_create_progress_with_counts(self):
        """Explicit document counts are stored as given."""
        prog = MigrationProgress(
            es_index="ragflow_test",
            ob_table="ragflow_test",
            total_documents=1000,
            migrated_documents=500,
        )
        assert prog.total_documents == 1000
        assert prog.migrated_documents == 500

    def test_progress_default_values(self):
        """Optional fields default to empty containers, empty strings and False."""
        prog = MigrationProgress(es_index="test_index", ob_table="test_table")
        assert prog.failed_documents == 0
        assert prog.last_sort_values == []
        assert prog.last_batch_ids == []
        assert prog.error_message == ""
        assert prog.schema_converted is False
        assert prog.table_created is False
        assert prog.indexes_created is False

    def test_progress_status_values(self):
        """Every lifecycle status string round-trips through the constructor."""
        for state in ("pending", "running", "completed", "failed", "paused"):
            prog = MigrationProgress(es_index="test", ob_table="test", status=state)
            assert prog.status == state
class TestProgressManager:
    """Unit tests for ProgressManager: persistence, updates, state marks and resume logic."""

    @pytest.fixture
    def temp_dir(self):
        """Yield a temporary directory that is removed after each test."""
        with tempfile.TemporaryDirectory() as tmpdir:
            yield tmpdir

    @pytest.fixture
    def manager(self, temp_dir):
        """Create a ProgressManager rooted at the per-test temp directory."""
        return ProgressManager(progress_dir=temp_dir)

    def test_create_progress_manager(self, temp_dir):
        """Test creating a progress manager."""
        manager = ProgressManager(progress_dir=temp_dir)
        assert manager.progress_dir.exists()

    def test_create_progress_manager_creates_dir(self, temp_dir):
        """Test that progress manager creates a missing directory."""
        new_dir = os.path.join(temp_dir, "new_progress")
        # Construction alone must create the directory; the instance is not needed.
        ProgressManager(progress_dir=new_dir)
        assert Path(new_dir).exists()

    def test_create_progress(self, manager):
        """Test creating new progress: status starts as 'running'."""
        progress = manager.create_progress(
            es_index="ragflow_abc123",
            ob_table="ragflow_abc123",
            total_documents=1000,
        )
        assert progress.es_index == "ragflow_abc123"
        assert progress.ob_table == "ragflow_abc123"
        assert progress.total_documents == 1000
        assert progress.status == "running"

    def test_save_and_load_progress(self, manager):
        """Test saving and loading progress round-trips all fields."""
        # Create and save
        progress = manager.create_progress(
            es_index="ragflow_test",
            ob_table="ragflow_test",
            total_documents=500,
        )
        progress.migrated_documents = 250
        progress.last_sort_values = ["doc_250", 1234567890]
        manager.save_progress(progress)
        # Load
        loaded = manager.load_progress("ragflow_test", "ragflow_test")
        assert loaded is not None
        assert loaded.es_index == "ragflow_test"
        assert loaded.total_documents == 500
        assert loaded.migrated_documents == 250
        assert loaded.last_sort_values == ["doc_250", 1234567890]

    def test_load_nonexistent_progress(self, manager):
        """Test loading progress that doesn't exist returns None."""
        loaded = manager.load_progress("nonexistent", "nonexistent")
        assert loaded is None

    def test_delete_progress(self, manager):
        """Test deleting progress removes the persisted record."""
        # Create progress
        manager.create_progress(
            es_index="ragflow_delete_test",
            ob_table="ragflow_delete_test",
            total_documents=100,
        )
        # Verify it exists
        assert manager.load_progress("ragflow_delete_test", "ragflow_delete_test") is not None
        # Delete
        manager.delete_progress("ragflow_delete_test", "ragflow_delete_test")
        # Verify it's gone
        assert manager.load_progress("ragflow_delete_test", "ragflow_delete_test") is None

    def test_update_progress(self, manager):
        """Test updating progress with batch bookkeeping fields."""
        progress = manager.create_progress(
            es_index="ragflow_update",
            ob_table="ragflow_update",
            total_documents=1000,
        )
        # Update
        manager.update_progress(
            progress,
            migrated_count=100,
            last_sort_values=["doc_100", 9999],
            last_batch_ids=["id1", "id2", "id3"],
        )
        assert progress.migrated_documents == 100
        assert progress.last_sort_values == ["doc_100", 9999]
        assert progress.last_batch_ids == ["id1", "id2", "id3"]

    def test_update_progress_multiple_batches(self, manager):
        """Test that repeated updates accumulate the migrated count."""
        progress = manager.create_progress(
            es_index="ragflow_multi",
            ob_table="ragflow_multi",
            total_documents=1000,
        )
        # Ten batches of 100 documents each.
        for _ in range(10):
            manager.update_progress(progress, migrated_count=100)
        assert progress.migrated_documents == 1000

    def test_mark_completed(self, manager):
        """Test marking migration as completed."""
        progress = manager.create_progress(
            es_index="ragflow_complete",
            ob_table="ragflow_complete",
            total_documents=100,
        )
        progress.migrated_documents = 100
        manager.mark_completed(progress)
        assert progress.status == "completed"

    def test_mark_failed(self, manager):
        """Test marking migration as failed records the error message."""
        progress = manager.create_progress(
            es_index="ragflow_fail",
            ob_table="ragflow_fail",
            total_documents=100,
        )
        manager.mark_failed(progress, "Connection timeout")
        assert progress.status == "failed"
        assert progress.error_message == "Connection timeout"

    def test_mark_paused(self, manager):
        """Test marking migration as paused."""
        progress = manager.create_progress(
            es_index="ragflow_pause",
            ob_table="ragflow_pause",
            total_documents=1000,
        )
        progress.migrated_documents = 500
        manager.mark_paused(progress)
        assert progress.status == "paused"

    def test_can_resume_running(self, manager):
        """Test can_resume for a running migration."""
        # create_progress persists a 'running' record as a side effect.
        manager.create_progress(
            es_index="ragflow_resume_running",
            ob_table="ragflow_resume_running",
            total_documents=1000,
        )
        assert manager.can_resume("ragflow_resume_running", "ragflow_resume_running") is True

    def test_can_resume_paused(self, manager):
        """Test can_resume for a paused migration."""
        progress = manager.create_progress(
            es_index="ragflow_resume_paused",
            ob_table="ragflow_resume_paused",
            total_documents=1000,
        )
        manager.mark_paused(progress)
        assert manager.can_resume("ragflow_resume_paused", "ragflow_resume_paused") is True

    def test_can_resume_completed(self, manager):
        """Test can_resume for a completed migration."""
        progress = manager.create_progress(
            es_index="ragflow_resume_complete",
            ob_table="ragflow_resume_complete",
            total_documents=100,
        )
        progress.migrated_documents = 100
        manager.mark_completed(progress)
        # Completed migrations should not be resumed
        assert manager.can_resume("ragflow_resume_complete", "ragflow_resume_complete") is False

    def test_can_resume_nonexistent(self, manager):
        """Test can_resume for a nonexistent migration."""
        assert manager.can_resume("nonexistent", "nonexistent") is False

    def test_get_resume_info(self, manager):
        """Test getting resume information for a saved migration."""
        progress = manager.create_progress(
            es_index="ragflow_info",
            ob_table="ragflow_info",
            total_documents=1000,
        )
        progress.migrated_documents = 500
        progress.last_sort_values = ["doc_500", 12345]
        progress.schema_converted = True
        progress.table_created = True
        manager.save_progress(progress)
        info = manager.get_resume_info("ragflow_info", "ragflow_info")
        assert info is not None
        assert info["migrated_documents"] == 500
        assert info["total_documents"] == 1000
        assert info["last_sort_values"] == ["doc_500", 12345]
        assert info["schema_converted"] is True
        assert info["table_created"] is True
        assert info["status"] == "running"

    def test_get_resume_info_nonexistent(self, manager):
        """Test getting resume info for a nonexistent migration."""
        info = manager.get_resume_info("nonexistent", "nonexistent")
        assert info is None

    def test_progress_file_path(self, manager):
        """Test the '<index>_to_<table>.json' progress file naming scheme."""
        # Only the on-disk side effect matters; the returned object is unused.
        manager.create_progress(
            es_index="ragflow_abc123",
            ob_table="ragflow_abc123",
            total_documents=100,
        )
        expected_file = manager.progress_dir / "ragflow_abc123_to_ragflow_abc123.json"
        assert expected_file.exists()

    def test_progress_file_content(self, manager):
        """Test that the progress file contains the expected JSON fields."""
        progress = manager.create_progress(
            es_index="ragflow_json",
            ob_table="ragflow_json",
            total_documents=100,
        )
        progress.migrated_documents = 50
        manager.save_progress(progress)
        # Read file directly
        progress_file = manager.progress_dir / "ragflow_json_to_ragflow_json.json"
        with open(progress_file) as f:
            data = json.load(f)
        assert data["es_index"] == "ragflow_json"
        assert data["ob_table"] == "ragflow_json"
        assert data["total_documents"] == 100
        assert data["migrated_documents"] == 50

View File

@ -0,0 +1,649 @@
"""
Tests for RAGFlow schema conversion.
This module tests:
- RAGFlowSchemaConverter: Analyzes ES mappings and generates OB column definitions
- RAGFlowDataConverter: Converts ES documents to OceanBase row format
- Vector field pattern matching
- Schema constants
"""
import json
import pytest
from es_ob_migration.schema import (
RAGFlowSchemaConverter,
RAGFlowDataConverter,
RAGFLOW_COLUMNS,
ARRAY_COLUMNS,
JSON_COLUMNS,
VECTOR_FIELD_PATTERN,
FTS_COLUMNS_ORIGIN,
FTS_COLUMNS_TKS,
)
class TestRAGFlowSchemaConverter:
    """Unit tests for RAGFlowSchemaConverter's ES-mapping analysis."""

    def test_analyze_ragflow_mapping(self):
        """Known fields and vector fields are classified from a RAGFlow mapping."""
        conv = RAGFlowSchemaConverter()
        # Simulate a RAGFlow ES mapping.
        mapping = {
            "properties": {
                "id": {"type": "keyword"},
                "kb_id": {"type": "keyword"},
                "doc_id": {"type": "keyword"},
                "docnm_kwd": {"type": "keyword"},
                "content_with_weight": {"type": "text"},
                "content_ltks": {"type": "text"},
                "available_int": {"type": "integer"},
                "important_kwd": {"type": "keyword"},
                "q_768_vec": {"type": "dense_vector", "dims": 768},
            }
        }
        result = conv.analyze_es_mapping(mapping)
        # Known fields are recognized.
        for known in ("id", "kb_id", "content_with_weight"):
            assert known in result["known_fields"]
        # Exactly one vector field with its dimension extracted.
        vectors = result["vector_fields"]
        assert len(vectors) == 1
        assert vectors[0]["name"] == "q_768_vec"
        assert vectors[0]["dimension"] == 768

    def test_detect_vector_size(self):
        """The converter records the embedding dimension it encounters."""
        conv = RAGFlowSchemaConverter()
        conv.analyze_es_mapping(
            {"properties": {"q_1536_vec": {"type": "dense_vector", "dims": 1536}}}
        )
        assert conv.detected_vector_size == 1536

    def test_unknown_fields(self):
        """Fields outside the RAGFlow schema are reported as unknown."""
        conv = RAGFlowSchemaConverter()
        result = conv.analyze_es_mapping(
            {
                "properties": {
                    "id": {"type": "keyword"},
                    "custom_field": {"type": "text"},
                    "another_field": {"type": "integer"},
                }
            }
        )
        assert "custom_field" in result["unknown_fields"]
        assert "another_field" in result["unknown_fields"]

    def test_get_column_definitions(self):
        """Column definitions include every RAGFlow column plus detected vectors."""
        conv = RAGFlowSchemaConverter()
        # Analyze first so the vector column is detected.
        conv.analyze_es_mapping(
            {"properties": {"q_768_vec": {"type": "dense_vector", "dims": 768}}}
        )
        column_names = [c["name"] for c in conv.get_column_definitions()]
        for col_name in RAGFLOW_COLUMNS:
            assert col_name in column_names, f"Missing column: {col_name}"
        # The detected vector column is appended as well.
        assert "q_768_vec" in column_names
class TestRAGFlowDataConverter:
    """Test RAGFlowDataConverter class.

    Covers conversion of ES hits (``_id`` + ``_source``) into OceanBase row
    dicts: scalar fields, vector embeddings, array/JSON column serialization,
    and RAGFlow-specific quirks such as list-valued ``kb_id`` and dict-valued
    ``content_with_weight``.
    """

    def test_convert_basic_document(self):
        """Test converting a basic RAGFlow document."""
        converter = RAGFlowDataConverter()
        es_doc = {
            "_id": "test-id-123",
            "_source": {
                "id": "test-id-123",
                "kb_id": "kb-001",
                "doc_id": "doc-001",
                "docnm_kwd": "test_document.pdf",
                "content_with_weight": "This is test content",
                "available_int": 1,
            }
        }
        row = converter.convert_document(es_doc)
        assert row["id"] == "test-id-123"
        assert row["kb_id"] == "kb-001"
        assert row["doc_id"] == "doc-001"
        assert row["docnm_kwd"] == "test_document.pdf"
        assert row["content_with_weight"] == "This is test content"
        assert row["available_int"] == 1

    def test_convert_with_vector(self):
        """Test converting document with vector embedding."""
        converter = RAGFlowDataConverter()
        embedding = [0.1] * 768
        es_doc = {
            "_id": "vec-doc-001",
            "_source": {
                "id": "vec-doc-001",
                "kb_id": "kb-001",
                "q_768_vec": embedding,
            }
        }
        row = converter.convert_document(es_doc)
        assert row["id"] == "vec-doc-001"
        # The embedding passes through unchanged and the field is tracked.
        assert row["q_768_vec"] == embedding
        assert "q_768_vec" in converter.vector_fields

    def test_convert_array_fields(self):
        """Test converting array fields."""
        converter = RAGFlowDataConverter()
        es_doc = {
            "_id": "array-doc",
            "_source": {
                "id": "array-doc",
                "kb_id": "kb-001",
                "important_kwd": ["keyword1", "keyword2", "keyword3"],
                "question_kwd": ["What is this?", "How does it work?"],
                "tag_kwd": ["tag1", "tag2"],
            }
        }
        row = converter.convert_document(es_doc)
        # Array fields should be JSON strings
        assert isinstance(row["important_kwd"], str)
        parsed = json.loads(row["important_kwd"])
        assert parsed == ["keyword1", "keyword2", "keyword3"]

    def test_convert_json_fields(self):
        """Test converting JSON fields."""
        converter = RAGFlowDataConverter()
        es_doc = {
            "_id": "json-doc",
            "_source": {
                "id": "json-doc",
                "kb_id": "kb-001",
                "tag_feas": {"tag1": 0.8, "tag2": 0.5},
                "metadata": {"author": "John", "date": "2024-01-01"},
            }
        }
        row = converter.convert_document(es_doc)
        # JSON fields should be JSON strings
        assert isinstance(row["tag_feas"], str)
        assert isinstance(row["metadata"], str)
        tag_feas = json.loads(row["tag_feas"])
        assert tag_feas == {"tag1": 0.8, "tag2": 0.5}

    def test_convert_unknown_fields_to_extra(self):
        """Test that unknown fields are stored in 'extra'."""
        converter = RAGFlowDataConverter()
        es_doc = {
            "_id": "extra-doc",
            "_source": {
                "id": "extra-doc",
                "kb_id": "kb-001",
                "custom_field": "custom_value",
                "another_custom": 123,
            }
        }
        row = converter.convert_document(es_doc)
        assert "extra" in row
        extra = json.loads(row["extra"])
        assert extra["custom_field"] == "custom_value"
        assert extra["another_custom"] == 123

    def test_convert_kb_id_list(self):
        """Test converting kb_id when it's a list (ES format)."""
        converter = RAGFlowDataConverter()
        es_doc = {
            "_id": "kb-list-doc",
            "_source": {
                "id": "kb-list-doc",
                "kb_id": ["kb-001", "kb-002"],  # Some ES docs have list
            }
        }
        row = converter.convert_document(es_doc)
        # Should take first element
        assert row["kb_id"] == "kb-001"

    def test_convert_content_with_weight_dict(self):
        """Test converting content_with_weight when it's a dict."""
        converter = RAGFlowDataConverter()
        es_doc = {
            "_id": "content-dict-doc",
            "_source": {
                "id": "content-dict-doc",
                "kb_id": "kb-001",
                "content_with_weight": {
                    "text": "Some content",
                    "weight": 1.0,
                },
            }
        }
        row = converter.convert_document(es_doc)
        # Dict should be JSON serialized
        assert isinstance(row["content_with_weight"], str)
        parsed = json.loads(row["content_with_weight"])
        assert parsed["text"] == "Some content"

    def test_convert_batch(self):
        """Test batch conversion."""
        converter = RAGFlowDataConverter()
        es_docs = [
            {"_id": f"doc-{i}", "_source": {"id": f"doc-{i}", "kb_id": "kb-001"}}
            for i in range(5)
        ]
        rows = converter.convert_batch(es_docs)
        # One output row per input document, order preserved.
        assert len(rows) == 5
        for i, row in enumerate(rows):
            assert row["id"] == f"doc-{i}"
class TestVectorFieldPattern:
    """Unit tests for the q_<dim>_vec field-name regex."""

    def test_valid_patterns(self):
        """Names of the form q_<digits>_vec are accepted."""
        for name in ("q_768_vec", "q_1024_vec", "q_1536_vec", "q_3072_vec"):
            assert VECTOR_FIELD_PATTERN.match(name) is not None, f"Should match: {name}"

    def test_invalid_patterns(self):
        """Names missing the prefix, digits, or suffix are rejected."""
        for name in (
            "q_vec",
            "768_vec",
            "q_768",
            "vector_768",
            "content_with_weight",
        ):
            assert VECTOR_FIELD_PATTERN.match(name) is None, f"Should not match: {name}"

    def test_extract_dimension(self):
        """The named 'vector_size' group captures the embedding dimension."""
        m = VECTOR_FIELD_PATTERN.match("q_1536_vec")
        assert m is not None
        assert int(m.group("vector_size")) == 1536
class TestConstants:
    """Unit tests for the schema constant tables."""

    def test_array_columns(self):
        """ARRAY_COLUMNS lists every known array-typed field."""
        for col in (
            "important_kwd", "question_kwd", "tag_kwd", "source_id",
            "entities_kwd", "position_int", "page_num_int", "top_int",
        ):
            assert col in ARRAY_COLUMNS, f"Missing array column: {col}"

    def test_json_columns(self):
        """JSON_COLUMNS lists every known JSON-typed field."""
        for col in ("tag_feas", "metadata", "extra"):
            assert col in JSON_COLUMNS, f"Missing JSON column: {col}"

    def test_ragflow_columns_completeness(self):
        """RAGFLOW_COLUMNS covers every required field."""
        for field in (
            "id", "kb_id", "doc_id", "content_with_weight",
            "available_int", "metadata", "extra",
        ):
            assert field in RAGFLOW_COLUMNS, f"Missing required field: {field}"

    def test_fts_columns(self):
        """Fulltext-search column lists contain the expected members."""
        assert "content_with_weight" in FTS_COLUMNS_ORIGIN
        assert "content_ltks" in FTS_COLUMNS_TKS

    def test_ragflow_columns_types(self):
        """Spot-check type flags on representative column definitions."""
        # Primary-key column.
        assert RAGFLOW_COLUMNS["id"]["is_primary"] is True
        assert RAGFLOW_COLUMNS["id"]["nullable"] is False
        # Secondary-indexed columns.
        assert RAGFLOW_COLUMNS["kb_id"]["index"] is True
        assert RAGFLOW_COLUMNS["doc_id"]["index"] is True
        # Array-typed columns.
        assert RAGFLOW_COLUMNS["important_kwd"]["is_array"] is True
        assert RAGFLOW_COLUMNS["question_kwd"]["is_array"] is True
        # JSON-typed columns.
        assert RAGFLOW_COLUMNS["metadata"]["is_json"] is True
        assert RAGFLOW_COLUMNS["extra"]["is_json"] is True
class TestRAGFlowSchemaConverterEdgeCases:
    """Edge-case tests for RAGFlowSchemaConverter."""

    def test_empty_mapping(self):
        """An empty mapping yields empty field lists across the board."""
        conv = RAGFlowSchemaConverter()
        result = conv.analyze_es_mapping({})
        for key in ("known_fields", "vector_fields", "unknown_fields"):
            assert result[key] == []

    def test_mapping_without_properties(self):
        """A mapping lacking the 'properties' key produces no known fields."""
        conv = RAGFlowSchemaConverter()
        result = conv.analyze_es_mapping({"some_other_key": {}})
        assert result["known_fields"] == []

    def test_multiple_vector_fields(self):
        """Two vector fields are both reported; one dimension gets recorded."""
        conv = RAGFlowSchemaConverter()
        result = conv.analyze_es_mapping(
            {
                "properties": {
                    "q_768_vec": {"type": "dense_vector", "dims": 768},
                    "q_1024_vec": {"type": "dense_vector", "dims": 1024},
                }
            }
        )
        assert len(result["vector_fields"]) == 2
        # Whichever field is processed first sets the detected size.
        assert conv.detected_vector_size in [768, 1024]

    def test_get_column_definitions_without_analysis(self):
        """Column definitions are available even before any mapping analysis."""
        conv = RAGFlowSchemaConverter()
        names = [c["name"] for c in conv.get_column_definitions()]
        assert "id" in names
        assert "kb_id" in names

    def test_get_vector_fields(self):
        """get_vector_fields returns the detected vector metadata."""
        conv = RAGFlowSchemaConverter()
        conv.analyze_es_mapping(
            {"properties": {"q_1536_vec": {"type": "dense_vector", "dims": 1536}}}
        )
        detected = conv.get_vector_fields()
        assert len(detected) == 1
        assert detected[0]["name"] == "q_1536_vec"
        assert detected[0]["dimension"] == 1536
class TestRAGFlowDataConverterEdgeCases:
"""Test edge cases for RAGFlowDataConverter."""
def test_convert_empty_document(self):
"""Test converting empty document."""
converter = RAGFlowDataConverter()
es_doc = {"_id": "empty_doc", "_source": {}}
row = converter.convert_document(es_doc)
assert row["id"] == "empty_doc"
def test_convert_document_without_source(self):
"""Test converting document without _source."""
converter = RAGFlowDataConverter()
es_doc = {"_id": "no_source", "id": "no_source", "kb_id": "kb_001"}
row = converter.convert_document(es_doc)
assert row["id"] == "no_source"
assert row["kb_id"] == "kb_001"
def test_convert_boolean_to_integer(self):
"""Test converting boolean to integer."""
converter = RAGFlowDataConverter()
es_doc = {
"_id": "bool_doc",
"_source": {
"id": "bool_doc",
"kb_id": "kb_001",
"available_int": True,
}
}
row = converter.convert_document(es_doc)
assert row["available_int"] == 1
def test_convert_invalid_integer(self):
"""Test converting invalid integer value."""
converter = RAGFlowDataConverter()
es_doc = {
"_id": "invalid_int",
"_source": {
"id": "invalid_int",
"kb_id": "kb_001",
"available_int": "not_a_number",
}
}
row = converter.convert_document(es_doc)
assert row["available_int"] is None
def test_convert_float_field(self):
"""Test converting float fields."""
converter = RAGFlowDataConverter()
es_doc = {
"_id": "float_doc",
"_source": {
"id": "float_doc",
"kb_id": "kb_001",
"weight_flt": 0.85,
"rank_flt": "0.95", # String that should become float
}
}
row = converter.convert_document(es_doc)
assert row["weight_flt"] == 0.85
assert row["rank_flt"] == 0.95
def test_convert_array_with_special_characters(self):
"""Test converting array with special characters."""
converter = RAGFlowDataConverter()
es_doc = {
"_id": "special_array",
"_source": {
"id": "special_array",
"kb_id": "kb_001",
"important_kwd": ["key\nwith\nnewlines", "key\twith\ttabs"],
}
}
row = converter.convert_document(es_doc)
# Should be JSON string with escaped characters
assert isinstance(row["important_kwd"], str)
parsed = json.loads(row["important_kwd"])
assert len(parsed) == 2
def test_convert_already_json_array(self):
"""Test converting already JSON-encoded array."""
converter = RAGFlowDataConverter()
es_doc = {
"_id": "json_array",
"_source": {
"id": "json_array",
"kb_id": "kb_001",
"important_kwd": '["already", "json"]',
}
}
row = converter.convert_document(es_doc)
assert row["important_kwd"] == '["already", "json"]'
def test_convert_single_value_to_array(self):
"""Test converting single value to array."""
converter = RAGFlowDataConverter()
es_doc = {
"_id": "single_to_array",
"_source": {
"id": "single_to_array",
"kb_id": "kb_001",
"important_kwd": "single_keyword",
}
}
row = converter.convert_document(es_doc)
parsed = json.loads(row["important_kwd"])
assert parsed == ["single_keyword"]
def test_detect_vector_fields_from_document(self):
    """detect_vector_fields registers every q_*_vec field found in a doc."""
    converter = RAGFlowDataConverter()
    converter.detect_vector_fields({
        "q_768_vec": [0.1] * 768,
        "q_1024_vec": [0.2] * 1024,
    })
    for field_name in ("q_768_vec", "q_1024_vec"):
        assert field_name in converter.vector_fields
def test_convert_with_default_values(self):
    """Columns absent from the ES source receive their schema defaults."""
    converter = RAGFlowDataConverter()
    # available_int is intentionally omitted; its schema default is 1.
    row = converter.convert_document({
        "_id": "default_test",
        "_source": {"id": "default_test", "kb_id": "kb_001"},
    })
    assert row.get("available_int") == 1
def test_convert_list_content(self):
    """A list in content_with_weight is serialized to a JSON string."""
    converter = RAGFlowDataConverter()
    row = converter.convert_document({
        "_id": "list_content",
        "_source": {
            "id": "list_content",
            "kb_id": "kb_001",
            "content_with_weight": ["part1", "part2", "part3"],
        },
    })
    serialized = row["content_with_weight"]
    assert isinstance(serialized, str)
    # Round-trip through JSON must recover the original parts in order.
    assert json.loads(serialized) == ["part1", "part2", "part3"]
def test_convert_batch_empty(self):
    """convert_batch on an empty input yields an empty list."""
    assert RAGFlowDataConverter().convert_batch([]) == []
def test_existing_extra_field_merged(self):
    """Unknown fields are merged into an existing ``extra`` dict.

    The resulting ``extra`` JSON must keep the keys that were already
    present in the ES document AND pick up the unmapped ``custom_field``.
    """
    converter = RAGFlowDataConverter()
    es_doc = {
        "_id": "merge_extra",
        "_source": {
            "id": "merge_extra",
            "kb_id": "kb_001",
            "extra": {"existing_key": "existing_value"},
            "custom_field": "custom_value",
        }
    }
    row = converter.convert_document(es_doc)
    extra = json.loads(row["extra"])
    # The unmapped field was folded in ...
    assert "custom_field" in extra
    # ... and the pre-existing content was not dropped.
    # (Previously only the new field was asserted, so a merge that
    # clobbered the existing dict would have passed unnoticed.)
    assert extra.get("existing_key") == "existing_value"

# ---------------------------------------------------------------------------
# NOTE(review): the lines below came from a second file — per the diff header
# ("@ -0,0 +1,385 @@") this is the new tests/test_verify.py. The "View File"
# text was web-UI chrome captured during extraction, not source code.
# ---------------------------------------------------------------------------
"""
Tests for migration verification.
"""
import json
import pytest
from unittest.mock import Mock, MagicMock
from es_ob_migration.verify import MigrationVerifier, VerificationResult
class TestVerificationResult:
    """Tests for the VerificationResult dataclass."""

    def test_create_basic_result(self):
        """Constructing with only the required fields works."""
        res = VerificationResult(
            es_index="ragflow_test",
            ob_table="ragflow_test",
        )
        assert res.es_index == "ragflow_test"
        assert res.ob_table == "ragflow_test"
        # Counts start at zero and the result is not passed by default.
        assert res.es_count == 0
        assert res.ob_count == 0
        assert res.passed is False

    def test_result_default_values(self):
        """Every optional field falls back to its documented default."""
        res = VerificationResult(es_index="test", ob_table="test")
        assert res.count_match is False
        assert res.count_diff == 0
        assert res.sample_size == 0
        assert res.samples_verified == 0
        assert res.samples_matched == 0
        assert res.sample_match_rate == 0.0
        assert res.missing_in_ob == []
        assert res.data_mismatches == []
        assert res.message == ""

    def test_result_with_counts(self):
        """Count-related fields round-trip through the constructor."""
        res = VerificationResult(
            es_index="test",
            ob_table="test",
            es_count=1000,
            ob_count=1000,
            count_match=True,
        )
        assert res.es_count == 1000
        assert res.ob_count == 1000
        assert res.count_match is True
class TestMigrationVerifier:
    """Test MigrationVerifier class.

    Covers count verification, sample verification, field-level value
    comparison, overall pass/fail determination, and report generation,
    all against mocked ES and OceanBase clients.
    """

    @pytest.fixture
    def mock_es_client(self):
        """Create mock ES client."""
        # Defaults: 100 documents in the index, no sample documents returned.
        client = Mock()
        client.count_documents = Mock(return_value=100)
        client.get_sample_documents = Mock(return_value=[])
        return client

    @pytest.fixture
    def mock_ob_client(self):
        """Create mock OB client."""
        # Defaults: 100 rows in the table, every id lookup misses (None).
        client = Mock()
        client.count_rows = Mock(return_value=100)
        client.get_row_by_id = Mock(return_value=None)
        return client

    @pytest.fixture
    def verifier(self, mock_es_client, mock_ob_client):
        """Create verifier with mock clients."""
        return MigrationVerifier(mock_es_client, mock_ob_client)

    def test_verify_counts_match(self, mock_es_client, mock_ob_client):
        """Test verification when counts match."""
        mock_es_client.count_documents.return_value = 1000
        mock_ob_client.count_rows.return_value = 1000
        mock_es_client.get_sample_documents.return_value = []
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        # sample_size=0 skips sample verification, isolating the count check.
        result = verifier.verify("ragflow_test", "ragflow_test", sample_size=0)
        assert result.es_count == 1000
        assert result.ob_count == 1000
        assert result.count_match is True
        assert result.count_diff == 0

    def test_verify_counts_mismatch(self, mock_es_client, mock_ob_client):
        """Test verification when counts don't match."""
        mock_es_client.count_documents.return_value = 1000
        mock_ob_client.count_rows.return_value = 950
        mock_es_client.get_sample_documents.return_value = []
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("ragflow_test", "ragflow_test", sample_size=0)
        assert result.es_count == 1000
        assert result.ob_count == 950
        assert result.count_match is False
        # count_diff is reported as the absolute difference.
        assert result.count_diff == 50

    def test_verify_samples_all_match(self, mock_es_client, mock_ob_client):
        """Test sample verification when all samples match."""
        # Setup ES samples
        es_samples = [
            {"_id": f"doc_{i}", "id": f"doc_{i}", "kb_id": "kb_001", "content_with_weight": f"content_{i}"}
            for i in range(10)
        ]
        mock_es_client.count_documents.return_value = 100
        mock_es_client.get_sample_documents.return_value = es_samples
        # Setup OB to return matching documents
        def get_row(table, doc_id):
            # Rebuild identical content from the numeric suffix of the id.
            return {"id": doc_id, "kb_id": "kb_001", "content_with_weight": f"content_{doc_id.split('_')[1]}"}
        mock_ob_client.count_rows.return_value = 100
        mock_ob_client.get_row_by_id.side_effect = get_row
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("ragflow_test", "ragflow_test", sample_size=10)
        assert result.samples_verified == 10
        assert result.samples_matched == 10
        assert result.sample_match_rate == 1.0

    def test_verify_samples_some_missing(self, mock_es_client, mock_ob_client):
        """Test sample verification when some documents are missing."""
        es_samples = [
            {"_id": f"doc_{i}", "id": f"doc_{i}", "kb_id": "kb_001"}
            for i in range(10)
        ]
        mock_es_client.count_documents.return_value = 100
        mock_es_client.get_sample_documents.return_value = es_samples
        # Only return some documents
        def get_row(table, doc_id):
            idx = int(doc_id.split("_")[1])
            if idx < 7:  # Only return first 7
                return {"id": doc_id, "kb_id": "kb_001"}
            return None
        mock_ob_client.count_rows.return_value = 100
        mock_ob_client.get_row_by_id.side_effect = get_row
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("ragflow_test", "ragflow_test", sample_size=10)
        assert result.samples_verified == 10
        assert result.samples_matched == 7
        # The three ids that returned None must be tracked as missing.
        assert len(result.missing_in_ob) == 3

    def test_verify_samples_data_mismatch(self, mock_es_client, mock_ob_client):
        """Test sample verification when data doesn't match."""
        es_samples = [
            {"_id": "doc_1", "id": "doc_1", "kb_id": "kb_001", "available_int": 1}
        ]
        mock_es_client.count_documents.return_value = 100
        mock_es_client.get_sample_documents.return_value = es_samples
        # Return document with different data
        mock_ob_client.count_rows.return_value = 100
        mock_ob_client.get_row_by_id.return_value = {
            "id": "doc_1", "kb_id": "kb_002", "available_int": 0  # Different values
        }
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("ragflow_test", "ragflow_test", sample_size=1)
        assert result.samples_verified == 1
        assert result.samples_matched == 0
        # The row exists but differs, so it is a mismatch, not a miss.
        assert len(result.data_mismatches) == 1

    def test_values_equal_none_values(self, verifier):
        """Test value comparison with None values."""
        assert verifier._values_equal("field", None, None) is True
        assert verifier._values_equal("field", "value", None) is False
        assert verifier._values_equal("field", None, "value") is False

    def test_values_equal_array_columns(self, verifier):
        """Test value comparison for array columns."""
        # Array stored as JSON string in OB
        assert verifier._values_equal(
            "important_kwd",
            ["key1", "key2"],
            '["key1", "key2"]'
        ) is True
        # Order shouldn't matter for arrays
        assert verifier._values_equal(
            "important_kwd",
            ["key2", "key1"],
            '["key1", "key2"]'
        ) is True

    def test_values_equal_json_columns(self, verifier):
        """Test value comparison for JSON columns."""
        # A dict on the ES side must equal its JSON-string form in OB.
        assert verifier._values_equal(
            "metadata",
            {"author": "John"},
            '{"author": "John"}'
        ) is True

    def test_values_equal_kb_id_list(self, verifier):
        """Test kb_id comparison when ES has list."""
        # ES sometimes stores kb_id as list
        assert verifier._values_equal(
            "kb_id",
            ["kb_001", "kb_002"],
            "kb_001"
        ) is True

    def test_values_equal_content_with_weight_dict(self, verifier):
        """Test content_with_weight comparison when OB has JSON string."""
        assert verifier._values_equal(
            "content_with_weight",
            {"text": "content", "weight": 1.0},
            '{"text": "content", "weight": 1.0}'
        ) is True

    def test_determine_result_passed(self, mock_es_client, mock_ob_client):
        """Test result determination for passed verification."""
        mock_es_client.count_documents.return_value = 1000
        mock_ob_client.count_rows.return_value = 1000
        es_samples = [{"_id": f"doc_{i}", "id": f"doc_{i}", "kb_id": "kb_001"} for i in range(100)]
        mock_es_client.get_sample_documents.return_value = es_samples
        # Every OB lookup returns a row identical to the ES sample.
        mock_ob_client.get_row_by_id.side_effect = lambda t, d: {"id": d, "kb_id": "kb_001"}
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("test", "test", sample_size=100)
        assert result.passed is True
        assert "PASSED" in result.message

    def test_determine_result_failed_count(self, mock_es_client, mock_ob_client):
        """Test result determination when count verification fails."""
        mock_es_client.count_documents.return_value = 1000
        mock_ob_client.count_rows.return_value = 500  # Big difference
        mock_es_client.get_sample_documents.return_value = []
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("test", "test", sample_size=0)
        assert result.passed is False
        assert "FAILED" in result.message

    def test_determine_result_failed_samples(self, mock_es_client, mock_ob_client):
        """Test result determination when sample verification fails."""
        mock_es_client.count_documents.return_value = 100
        mock_ob_client.count_rows.return_value = 100
        es_samples = [{"_id": f"doc_{i}", "id": f"doc_{i}"} for i in range(10)]
        mock_es_client.get_sample_documents.return_value = es_samples
        mock_ob_client.get_row_by_id.return_value = None  # All missing
        verifier = MigrationVerifier(mock_es_client, mock_ob_client)
        result = verifier.verify("test", "test", sample_size=10)
        # Counts agree, but zero sample matches must still fail the run.
        assert result.passed is False

    def test_generate_report(self, verifier):
        """Test report generation."""
        result = VerificationResult(
            es_index="ragflow_test",
            ob_table="ragflow_test",
            es_count=1000,
            ob_count=1000,
            count_match=True,
            count_diff=0,
            sample_size=100,
            samples_verified=100,
            samples_matched=100,
            sample_match_rate=1.0,
            passed=True,
            message="Verification PASSED",
        )
        report = verifier.generate_report(result)
        assert "ragflow_test" in report
        # Counts are rendered with thousands separators.
        assert "1,000" in report
        assert "PASSED" in report
        # Match rate is rendered as a two-decimal percentage.
        assert "100.00%" in report

    def test_generate_report_with_missing(self, verifier):
        """Test report generation with missing documents."""
        result = VerificationResult(
            es_index="test",
            ob_table="test",
            es_count=100,
            ob_count=95,
            count_match=False,
            count_diff=5,
            sample_size=10,
            samples_verified=10,
            samples_matched=8,
            sample_match_rate=0.8,
            missing_in_ob=["doc_1", "doc_2"],
            passed=False,
            message="Verification FAILED",
        )
        report = verifier.generate_report(result)
        # The report must name the missing-document section and its ids.
        assert "Missing in OceanBase" in report
        assert "doc_1" in report
        assert "FAILED" in report

    def test_generate_report_with_mismatches(self, verifier):
        """Test report generation with data mismatches."""
        result = VerificationResult(
            es_index="test",
            ob_table="test",
            es_count=100,
            ob_count=100,
            count_match=True,
            sample_size=10,
            samples_verified=10,
            samples_matched=8,
            sample_match_rate=0.8,
            data_mismatches=[
                {
                    "id": "doc_1",
                    "differences": [
                        {"field": "kb_id", "es_value": "kb_001", "ob_value": "kb_002"}
                    ]
                }
            ],
            passed=False,
            message="Verification FAILED",
        )
        report = verifier.generate_report(result)
        # Mismatch section must surface the doc id and the differing field.
        assert "Data Mismatches" in report
        assert "doc_1" in report
        assert "kb_id" in report
class TestValueComparison:
    """Edge cases for MigrationVerifier._values_equal field comparison."""
@pytest.fixture
def verifier(self):
    """A MigrationVerifier wired to throwaway mock clients."""
    es_stub, ob_stub = Mock(), Mock()
    return MigrationVerifier(es_stub, ob_stub)
def test_string_comparison(self, verifier):
    """Plain strings compare by exact equality."""
    same = verifier._values_equal("field", "value", "value")
    different = verifier._values_equal("field", "value1", "value2")
    assert same is True
    assert different is False
def test_integer_comparison(self, verifier):
    """Integers match their string form in either direction."""
    for es_val, ob_val in ((123, "123"), ("123", 123)):
        assert verifier._values_equal("field", es_val, ob_val) is True
def test_float_comparison(self, verifier):
    """A float matches its decimal string form."""
    outcome = verifier._values_equal("field", 1.5, "1.5")
    assert outcome is True
def test_boolean_comparison(self, verifier):
    """Booleans match their str() representation."""
    for flag, text in ((True, "True"), (False, "False")):
        assert verifier._values_equal("field", flag, text) is True
def test_empty_array_comparison(self, verifier):
    """An empty list equals the empty JSON array string."""
    outcome = verifier._values_equal("important_kwd", [], "[]")
    assert outcome is True
def test_nested_json_comparison(self, verifier):
    """Nested dicts match their JSON-string encoding."""
    source_side = {"nested": {"key": "value"}}
    stored_side = '{"nested": {"key": "value"}}'
    assert verifier._values_equal("metadata", source_side, stored_side) is True