Mirror of https://github.com/infiniflow/ragflow.git (synced 2025-12-08 04:22:28 +08:00)
### What problem does this PR solve?

Feature: This PR implements a comprehensive RAG evaluation framework to address issue #11656.

**Problem**: Developers using RAGFlow lack a systematic way to measure RAG accuracy and quality. They cannot objectively answer:

1. Are RAG results truly accurate?
2. How should configurations be adjusted to improve quality?
3. How can RAG performance be maintained and improved over time?

**Solution**: This PR adds a complete evaluation system with:

- **Dataset & test case management** - Create ground-truth datasets with questions and expected answers
- **Automated evaluation** - Run the RAG pipeline on test cases and compute metrics
- **Comprehensive metrics** - Precision, recall, F1 score, MRR, and hit rate for retrieval quality
- **Smart recommendations** - Analyze results and suggest specific configuration improvements (e.g., "increase top_k", "enable reranking")
- **20+ REST API endpoints** - Full CRUD operations for datasets, test cases, and evaluation runs

**Impact**: Enables developers to objectively measure RAG quality, identify issues, and systematically improve their RAG systems through data-driven configuration tuning.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
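For context, the retrieval metrics listed above follow their conventional definitions. The snippet below is a minimal sketch of those formulas for reviewers; it is not the PR's `EvaluationService` implementation, and the function name is illustrative.

```python
def retrieval_metrics(retrieved_ids, relevant_ids):
    """Conventional retrieval metrics (illustrative sketch, not the PR code)."""
    relevant = set(relevant_ids)
    hits = [cid for cid in retrieved_ids if cid in relevant]
    precision = len(hits) / len(retrieved_ids) if retrieved_ids else 0.0
    recall = len(set(hits)) / len(relevant) if relevant else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) else 0.0
    # Reciprocal rank of the first relevant result; 0.0 if nothing relevant was retrieved.
    mrr = next((1.0 / (i + 1) for i, cid in enumerate(retrieved_ids) if cid in relevant), 0.0)
    hit_rate = 1.0 if hits else 0.0
    return {"precision": precision, "recall": recall, "f1_score": f1,
            "hit_rate": hit_rate, "mrr": mrr}
```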
558 lines · 20 KiB · Python
#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
"""
|
|
Unit tests for RAG Evaluation Service
|
|
|
|
Tests cover:
|
|
- Dataset management (CRUD operations)
|
|
- Test case management
|
|
- Evaluation execution
|
|
- Metrics computation
|
|
- Recommendations generation
|
|
"""
|
|
|
|
import pytest
|
|
from unittest.mock import patch
|
|
|
|
|
|
class TestEvaluationDatasetManagement:
    """Tests for evaluation dataset management"""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Create a mock EvaluationService"""
        with patch('api.db.services.evaluation_service.EvaluationService') as mock:
            yield mock

    @pytest.fixture
    def sample_dataset_data(self):
        """Sample dataset data for testing"""
        return {
            "name": "Customer Support QA",
            "description": "Test cases for customer support",
            "kb_ids": ["kb_123", "kb_456"],
            "tenant_id": "tenant_1",
            "user_id": "user_1"
        }

    def test_create_dataset_success(self, mock_evaluation_service, sample_dataset_data):
        """Test successful dataset creation"""
        mock_evaluation_service.create_dataset.return_value = (True, "dataset_123")

        success, dataset_id = mock_evaluation_service.create_dataset(**sample_dataset_data)

        assert success is True
        assert dataset_id == "dataset_123"
        mock_evaluation_service.create_dataset.assert_called_once()

    def test_create_dataset_with_empty_name(self, mock_evaluation_service):
        """Test dataset creation with empty name"""
        data = {
            "name": "",
            "description": "Test",
            "kb_ids": ["kb_123"],
            "tenant_id": "tenant_1",
            "user_id": "user_1"
        }

        mock_evaluation_service.create_dataset.return_value = (False, "Dataset name cannot be empty")
        success, error = mock_evaluation_service.create_dataset(**data)

        assert success is False
        assert "name" in error.lower() or "empty" in error.lower()

    def test_create_dataset_with_empty_kb_ids(self, mock_evaluation_service):
        """Test dataset creation with empty kb_ids"""
        data = {
            "name": "Test Dataset",
            "description": "Test",
            "kb_ids": [],
            "tenant_id": "tenant_1",
            "user_id": "user_1"
        }

        mock_evaluation_service.create_dataset.return_value = (False, "kb_ids cannot be empty")
        success, error = mock_evaluation_service.create_dataset(**data)

        assert success is False

    def test_get_dataset_success(self, mock_evaluation_service):
        """Test successful dataset retrieval"""
        expected_dataset = {
            "id": "dataset_123",
            "name": "Test Dataset",
            "kb_ids": ["kb_123"]
        }
        mock_evaluation_service.get_dataset.return_value = expected_dataset

        dataset = mock_evaluation_service.get_dataset("dataset_123")

        assert dataset is not None
        assert dataset["id"] == "dataset_123"

    def test_get_dataset_not_found(self, mock_evaluation_service):
        """Test getting non-existent dataset"""
        mock_evaluation_service.get_dataset.return_value = None

        dataset = mock_evaluation_service.get_dataset("nonexistent")

        assert dataset is None

    def test_list_datasets(self, mock_evaluation_service):
        """Test listing datasets"""
        expected_result = {
            "total": 2,
            "datasets": [
                {"id": "dataset_1", "name": "Dataset 1"},
                {"id": "dataset_2", "name": "Dataset 2"}
            ]
        }
        mock_evaluation_service.list_datasets.return_value = expected_result

        result = mock_evaluation_service.list_datasets(
            tenant_id="tenant_1",
            user_id="user_1",
            page=1,
            page_size=20
        )

        assert result["total"] == 2
        assert len(result["datasets"]) == 2

    def test_list_datasets_with_pagination(self, mock_evaluation_service):
        """Test listing datasets with pagination"""
        mock_evaluation_service.list_datasets.return_value = {
            "total": 50,
            "datasets": [{"id": f"dataset_{i}"} for i in range(10)]
        }

        result = mock_evaluation_service.list_datasets(
            tenant_id="tenant_1",
            user_id="user_1",
            page=2,
            page_size=10
        )

        assert result["total"] == 50
        assert len(result["datasets"]) == 10

    def test_update_dataset_success(self, mock_evaluation_service):
        """Test successful dataset update"""
        mock_evaluation_service.update_dataset.return_value = True

        success = mock_evaluation_service.update_dataset(
            "dataset_123",
            name="Updated Name",
            description="Updated Description"
        )

        assert success is True

    def test_update_dataset_not_found(self, mock_evaluation_service):
        """Test updating non-existent dataset"""
        mock_evaluation_service.update_dataset.return_value = False

        success = mock_evaluation_service.update_dataset(
            "nonexistent",
            name="Updated Name"
        )

        assert success is False

    def test_delete_dataset_success(self, mock_evaluation_service):
        """Test successful dataset deletion"""
        mock_evaluation_service.delete_dataset.return_value = True

        success = mock_evaluation_service.delete_dataset("dataset_123")

        assert success is True

    def test_delete_dataset_not_found(self, mock_evaluation_service):
        """Test deleting non-existent dataset"""
        mock_evaluation_service.delete_dataset.return_value = False

        success = mock_evaluation_service.delete_dataset("nonexistent")

        assert success is False

class TestEvaluationTestCaseManagement:
    """Tests for test case management"""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Create a mock EvaluationService"""
        with patch('api.db.services.evaluation_service.EvaluationService') as mock:
            yield mock

    @pytest.fixture
    def sample_test_case(self):
        """Sample test case data"""
        return {
            "dataset_id": "dataset_123",
            "question": "How do I reset my password?",
            "reference_answer": "Click on 'Forgot Password' and follow the email instructions.",
            "relevant_doc_ids": ["doc_789"],
            "relevant_chunk_ids": ["chunk_101", "chunk_102"]
        }

    def test_add_test_case_success(self, mock_evaluation_service, sample_test_case):
        """Test successful test case addition"""
        mock_evaluation_service.add_test_case.return_value = (True, "case_123")

        success, case_id = mock_evaluation_service.add_test_case(**sample_test_case)

        assert success is True
        assert case_id == "case_123"

    def test_add_test_case_with_empty_question(self, mock_evaluation_service):
        """Test adding test case with empty question"""
        mock_evaluation_service.add_test_case.return_value = (False, "Question cannot be empty")

        success, error = mock_evaluation_service.add_test_case(
            dataset_id="dataset_123",
            question=""
        )

        assert success is False
        assert "question" in error.lower() or "empty" in error.lower()

    def test_add_test_case_without_reference_answer(self, mock_evaluation_service):
        """Test adding test case without reference answer (optional)"""
        mock_evaluation_service.add_test_case.return_value = (True, "case_123")

        success, case_id = mock_evaluation_service.add_test_case(
            dataset_id="dataset_123",
            question="Test question",
            reference_answer=None
        )

        assert success is True

    def test_get_test_cases(self, mock_evaluation_service):
        """Test getting all test cases for a dataset"""
        expected_cases = [
            {"id": "case_1", "question": "Question 1"},
            {"id": "case_2", "question": "Question 2"}
        ]
        mock_evaluation_service.get_test_cases.return_value = expected_cases

        cases = mock_evaluation_service.get_test_cases("dataset_123")

        assert len(cases) == 2
        assert cases[0]["id"] == "case_1"

    def test_get_test_cases_empty_dataset(self, mock_evaluation_service):
        """Test getting test cases from empty dataset"""
        mock_evaluation_service.get_test_cases.return_value = []

        cases = mock_evaluation_service.get_test_cases("dataset_123")

        assert len(cases) == 0

    def test_delete_test_case_success(self, mock_evaluation_service):
        """Test successful test case deletion"""
        mock_evaluation_service.delete_test_case.return_value = True

        success = mock_evaluation_service.delete_test_case("case_123")

        assert success is True

    def test_import_test_cases_success(self, mock_evaluation_service):
        """Test bulk import of test cases"""
        cases = [
            {"question": "Question 1", "reference_answer": "Answer 1"},
            {"question": "Question 2", "reference_answer": "Answer 2"},
            {"question": "Question 3", "reference_answer": "Answer 3"}
        ]
        mock_evaluation_service.import_test_cases.return_value = (3, 0)

        success_count, failure_count = mock_evaluation_service.import_test_cases(
            "dataset_123",
            cases
        )

        assert success_count == 3
        assert failure_count == 0

    def test_import_test_cases_with_failures(self, mock_evaluation_service):
        """Test bulk import with some failures"""
        cases = [
            {"question": "Question 1"},
            {"question": ""},  # Invalid
            {"question": "Question 3"}
        ]
        mock_evaluation_service.import_test_cases.return_value = (2, 1)

        success_count, failure_count = mock_evaluation_service.import_test_cases(
            "dataset_123",
            cases
        )

        assert success_count == 2
        assert failure_count == 1

class TestEvaluationExecution:
    """Tests for evaluation execution"""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Create a mock EvaluationService"""
        with patch('api.db.services.evaluation_service.EvaluationService') as mock:
            yield mock

    def test_start_evaluation_success(self, mock_evaluation_service):
        """Test successful evaluation start"""
        mock_evaluation_service.start_evaluation.return_value = (True, "run_123")

        success, run_id = mock_evaluation_service.start_evaluation(
            dataset_id="dataset_123",
            dialog_id="dialog_456",
            user_id="user_1"
        )

        assert success is True
        assert run_id == "run_123"

    def test_start_evaluation_with_invalid_dialog(self, mock_evaluation_service):
        """Test starting evaluation with invalid dialog"""
        mock_evaluation_service.start_evaluation.return_value = (False, "Dialog not found")

        success, error = mock_evaluation_service.start_evaluation(
            dataset_id="dataset_123",
            dialog_id="nonexistent",
            user_id="user_1"
        )

        assert success is False
        assert "dialog" in error.lower()

    def test_start_evaluation_with_custom_name(self, mock_evaluation_service):
        """Test starting evaluation with custom name"""
        mock_evaluation_service.start_evaluation.return_value = (True, "run_123")

        success, run_id = mock_evaluation_service.start_evaluation(
            dataset_id="dataset_123",
            dialog_id="dialog_456",
            user_id="user_1",
            name="My Custom Evaluation"
        )

        assert success is True

    def test_get_run_results(self, mock_evaluation_service):
        """Test getting evaluation run results"""
        expected_results = {
            "run": {
                "id": "run_123",
                "status": "COMPLETED",
                "metrics_summary": {
                    "avg_precision": 0.85,
                    "avg_recall": 0.78
                }
            },
            "results": [
                {"case_id": "case_1", "metrics": {"precision": 0.9}},
                {"case_id": "case_2", "metrics": {"precision": 0.8}}
            ]
        }
        mock_evaluation_service.get_run_results.return_value = expected_results

        results = mock_evaluation_service.get_run_results("run_123")

        assert results["run"]["id"] == "run_123"
        assert len(results["results"]) == 2

    def test_get_run_results_not_found(self, mock_evaluation_service):
        """Test getting results for non-existent run"""
        mock_evaluation_service.get_run_results.return_value = {}

        results = mock_evaluation_service.get_run_results("nonexistent")

        assert results == {}

class TestEvaluationMetrics:
    """Tests for metrics computation"""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Create a mock EvaluationService"""
        with patch('api.db.services.evaluation_service.EvaluationService') as mock:
            yield mock

    def test_compute_retrieval_metrics_perfect_match(self, mock_evaluation_service):
        """Test retrieval metrics with perfect match"""
        retrieved_ids = ["chunk_1", "chunk_2", "chunk_3"]
        relevant_ids = ["chunk_1", "chunk_2", "chunk_3"]

        expected_metrics = {
            "precision": 1.0,
            "recall": 1.0,
            "f1_score": 1.0,
            "hit_rate": 1.0,
            "mrr": 1.0
        }
        mock_evaluation_service._compute_retrieval_metrics.return_value = expected_metrics

        metrics = mock_evaluation_service._compute_retrieval_metrics(retrieved_ids, relevant_ids)

        assert metrics["precision"] == 1.0
        assert metrics["recall"] == 1.0
        assert metrics["f1_score"] == 1.0

    def test_compute_retrieval_metrics_partial_match(self, mock_evaluation_service):
        """Test retrieval metrics with partial match"""
        retrieved_ids = ["chunk_1", "chunk_2", "chunk_4", "chunk_5"]
        relevant_ids = ["chunk_1", "chunk_2", "chunk_3"]

        expected_metrics = {
            "precision": 0.5,  # 2 out of 4 retrieved are relevant
            "recall": 0.67,    # 2 out of 3 relevant were retrieved
            "f1_score": 0.57,
            "hit_rate": 1.0,   # At least one relevant was retrieved
            "mrr": 1.0         # First retrieved is relevant
        }
        mock_evaluation_service._compute_retrieval_metrics.return_value = expected_metrics

        metrics = mock_evaluation_service._compute_retrieval_metrics(retrieved_ids, relevant_ids)

        assert metrics["precision"] < 1.0
        assert metrics["recall"] < 1.0
        assert metrics["hit_rate"] == 1.0
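        # Worked arithmetic behind the expected values above (a reviewer sanity
        # check, not part of EvaluationService):
        #   precision = relevant retrieved / total retrieved = 2 / 4 = 0.5
        #   recall    = relevant retrieved / total relevant  = 2 / 3 ≈ 0.67
        #   f1_score  = 2 * P * R / (P + R) = 2 * 0.5 * (2/3) / (0.5 + 2/3) ≈ 0.57
        #   mrr       = 1 / rank of the first relevant hit = 1 / 1 = 1.0
        #   hit_rate  = 1.0 because at least one relevant chunk was retrieved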
    def test_compute_retrieval_metrics_no_match(self, mock_evaluation_service):
        """Test retrieval metrics with no match"""
        retrieved_ids = ["chunk_4", "chunk_5", "chunk_6"]
        relevant_ids = ["chunk_1", "chunk_2", "chunk_3"]

        expected_metrics = {
            "precision": 0.0,
            "recall": 0.0,
            "f1_score": 0.0,
            "hit_rate": 0.0,
            "mrr": 0.0
        }
        mock_evaluation_service._compute_retrieval_metrics.return_value = expected_metrics

        metrics = mock_evaluation_service._compute_retrieval_metrics(retrieved_ids, relevant_ids)

        assert metrics["precision"] == 0.0
        assert metrics["recall"] == 0.0
        assert metrics["hit_rate"] == 0.0

    def test_compute_summary_metrics(self, mock_evaluation_service):
        """Test summary metrics computation"""
        results = [
            {"metrics": {"precision": 0.9, "recall": 0.8}, "execution_time": 1.2},
            {"metrics": {"precision": 0.8, "recall": 0.7}, "execution_time": 1.5},
            {"metrics": {"precision": 0.85, "recall": 0.75}, "execution_time": 1.3}
        ]

        expected_summary = {
            "total_cases": 3,
            "avg_execution_time": 1.33,
            "avg_precision": 0.85,
            "avg_recall": 0.75
        }
        mock_evaluation_service._compute_summary_metrics.return_value = expected_summary

        summary = mock_evaluation_service._compute_summary_metrics(results)

        assert summary["total_cases"] == 3
        assert summary["avg_precision"] > 0.8
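        # Worked arithmetic behind the expected summary (illustrative check only):
        #   avg_execution_time = (1.2 + 1.5 + 1.3) / 3 ≈ 1.33
        #   avg_precision      = (0.9 + 0.8 + 0.85) / 3 = 0.85
        #   avg_recall         = (0.8 + 0.7 + 0.75) / 3 = 0.75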
class TestEvaluationRecommendations:
    """Tests for configuration recommendations"""

    @pytest.fixture
    def mock_evaluation_service(self):
        """Create a mock EvaluationService"""
        with patch('api.db.services.evaluation_service.EvaluationService') as mock:
            yield mock

    def test_get_recommendations_low_precision(self, mock_evaluation_service):
        """Test recommendations for low precision"""
        recommendations = [
            {
                "issue": "Low Precision",
                "severity": "high",
                "suggestions": [
                    "Increase similarity_threshold",
                    "Enable reranking"
                ]
            }
        ]
        mock_evaluation_service.get_recommendations.return_value = recommendations

        recs = mock_evaluation_service.get_recommendations("run_123")

        assert len(recs) > 0
        assert any("precision" in r["issue"].lower() for r in recs)

    def test_get_recommendations_low_recall(self, mock_evaluation_service):
        """Test recommendations for low recall"""
        recommendations = [
            {
                "issue": "Low Recall",
                "severity": "high",
                "suggestions": [
                    "Increase top_k",
                    "Lower similarity_threshold"
                ]
            }
        ]
        mock_evaluation_service.get_recommendations.return_value = recommendations

        recs = mock_evaluation_service.get_recommendations("run_123")

        assert len(recs) > 0
        assert any("recall" in r["issue"].lower() for r in recs)

    def test_get_recommendations_slow_response(self, mock_evaluation_service):
        """Test recommendations for slow response time"""
        recommendations = [
            {
                "issue": "Slow Response Time",
                "severity": "medium",
                "suggestions": [
                    "Reduce top_k",
                    "Optimize embedding model"
                ]
            }
        ]
        mock_evaluation_service.get_recommendations.return_value = recommendations

        recs = mock_evaluation_service.get_recommendations("run_123")

        assert len(recs) > 0
        assert any("response" in r["issue"].lower() or "slow" in r["issue"].lower() for r in recs)

    def test_get_recommendations_no_issues(self, mock_evaluation_service):
        """Test recommendations when metrics are good"""
        mock_evaluation_service.get_recommendations.return_value = []

        recs = mock_evaluation_service.get_recommendations("run_123")

        assert len(recs) == 0
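# ---------------------------------------------------------------------------
# Illustrative sketch only: the tests above mock EvaluationService, so the
# function below is NOT the production implementation. It shows one plausible
# way aggregate metrics could be mapped to the recommendations exercised in
# TestEvaluationRecommendations; the thresholds (0.6 and 5.0 seconds) are
# assumptions chosen for illustration, not values from the RAGFlow code base.
# ---------------------------------------------------------------------------
def sketch_recommendations(metrics_summary: dict) -> list:
    """Map aggregate metrics to configuration suggestions (illustration only)."""
    recs = []
    if metrics_summary.get("avg_precision", 1.0) < 0.6:
        recs.append({
            "issue": "Low Precision",
            "severity": "high",
            "suggestions": ["Increase similarity_threshold", "Enable reranking"],
        })
    if metrics_summary.get("avg_recall", 1.0) < 0.6:
        recs.append({
            "issue": "Low Recall",
            "severity": "high",
            "suggestions": ["Increase top_k", "Lower similarity_threshold"],
        })
    if metrics_summary.get("avg_execution_time", 0.0) > 5.0:
        recs.append({
            "issue": "Slow Response Time",
            "severity": "medium",
            "suggestions": ["Reduce top_k", "Optimize embedding model"],
        })
    return recs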
if __name__ == "__main__":
    pytest.main([__file__, "-v"])