Feat: RAG evaluation (#11674)

### What problem does this PR solve?

Feature: This PR implements a comprehensive RAG evaluation framework to
address issue #11656.

**Problem**: Developers using RAGFlow lack systematic ways to measure
RAG accuracy and quality. They cannot objectively answer:
1. Are RAG results truly accurate?
2. How should configurations be adjusted to improve quality?
3. How to maintain and improve RAG performance over time?

**Solution**: This PR adds a complete evaluation system with:
- **Dataset & test case management** - Create ground truth datasets with
questions and expected answers
- **Automated evaluation** - Run RAG pipeline on test cases and compute
metrics
- **Comprehensive metrics** - Precision, recall, F1 score, MRR, hit rate
for retrieval quality
- **Smart recommendations** - Analyze results and suggest specific
configuration improvements (e.g., "increase top_k", "enable reranking")
- **20+ REST API endpoints** - Full CRUD operations for datasets, test
cases, and evaluation runs

**Impact**: Enables developers to objectively measure RAG quality,
identify issues, and systematically improve their RAG systems through
data-driven configuration tuning.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
hsparks-codes
2025-12-03 04:00:58 -05:00
committed by GitHub
parent 3c50c7d3ac
commit 237a66913b
5 changed files with 2060 additions and 0 deletions

View File

@ -1113,6 +1113,70 @@ class SyncLogs(DataBaseModel):
db_table = "sync_logs"
class EvaluationDataset(DataBaseModel):
"""Ground truth dataset for RAG evaluation"""
id = CharField(max_length=32, primary_key=True)
tenant_id = CharField(max_length=32, null=False, index=True, help_text="tenant ID")
name = CharField(max_length=255, null=False, index=True, help_text="dataset name")
description = TextField(null=True, help_text="dataset description")
kb_ids = JSONField(null=False, help_text="knowledge base IDs to evaluate against")
created_by = CharField(max_length=32, null=False, index=True, help_text="creator user ID")
create_time = BigIntegerField(null=False, index=True, help_text="creation timestamp")
update_time = BigIntegerField(null=False, help_text="last update timestamp")
status = IntegerField(null=False, default=1, help_text="1=valid, 0=invalid")
class Meta:
db_table = "evaluation_datasets"
class EvaluationCase(DataBaseModel):
"""Individual test case in an evaluation dataset"""
id = CharField(max_length=32, primary_key=True)
dataset_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_datasets")
question = TextField(null=False, help_text="test question")
reference_answer = TextField(null=True, help_text="optional ground truth answer")
relevant_doc_ids = JSONField(null=True, help_text="expected relevant document IDs")
relevant_chunk_ids = JSONField(null=True, help_text="expected relevant chunk IDs")
metadata = JSONField(null=True, help_text="additional context/tags")
create_time = BigIntegerField(null=False, help_text="creation timestamp")
class Meta:
db_table = "evaluation_cases"
class EvaluationRun(DataBaseModel):
"""A single evaluation run"""
id = CharField(max_length=32, primary_key=True)
dataset_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_datasets")
dialog_id = CharField(max_length=32, null=False, index=True, help_text="dialog configuration being evaluated")
name = CharField(max_length=255, null=False, help_text="run name")
config_snapshot = JSONField(null=False, help_text="dialog config at time of evaluation")
metrics_summary = JSONField(null=True, help_text="aggregated metrics")
status = CharField(max_length=32, null=False, default="PENDING", help_text="PENDING/RUNNING/COMPLETED/FAILED")
created_by = CharField(max_length=32, null=False, index=True, help_text="user who started the run")
create_time = BigIntegerField(null=False, index=True, help_text="creation timestamp")
complete_time = BigIntegerField(null=True, help_text="completion timestamp")
class Meta:
db_table = "evaluation_runs"
class EvaluationResult(DataBaseModel):
"""Result for a single test case in an evaluation run"""
id = CharField(max_length=32, primary_key=True)
run_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_runs")
case_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_cases")
generated_answer = TextField(null=False, help_text="generated answer")
retrieved_chunks = JSONField(null=False, help_text="chunks that were retrieved")
metrics = JSONField(null=False, help_text="all computed metrics")
execution_time = FloatField(null=False, help_text="response time in seconds")
token_usage = JSONField(null=True, help_text="prompt/completion tokens")
create_time = BigIntegerField(null=False, help_text="creation timestamp")
class Meta:
db_table = "evaluation_results"
def migrate_db():
logging.disable(logging.ERROR)
migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
@ -1293,4 +1357,43 @@ def migrate_db():
migrate(migrator.add_column("llm_factories", "rank", IntegerField(default=0, index=False)))
except Exception:
pass
# RAG Evaluation tables
try:
migrate(migrator.add_column("evaluation_datasets", "id", CharField(max_length=32, primary_key=True)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "tenant_id", CharField(max_length=32, null=False, index=True)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "name", CharField(max_length=255, null=False, index=True)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "description", TextField(null=True)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "kb_ids", JSONField(null=False)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "created_by", CharField(max_length=32, null=False, index=True)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "create_time", BigIntegerField(null=False, index=True)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "update_time", BigIntegerField(null=False)))
except Exception:
pass
try:
migrate(migrator.add_column("evaluation_datasets", "status", IntegerField(null=False, default=1)))
except Exception:
pass
logging.disable(logging.NOTSET)