mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: RAG evaluation (#11674)
### What problem does this PR solve? Feature: This PR implements a comprehensive RAG evaluation framework to address issue #11656. **Problem**: Developers using RAGFlow lack systematic ways to measure RAG accuracy and quality. They cannot objectively answer: 1. Are RAG results truly accurate? 2. How should configurations be adjusted to improve quality? 3. How to maintain and improve RAG performance over time? **Solution**: This PR adds a complete evaluation system with: - **Dataset & test case management** - Create ground truth datasets with questions and expected answers - **Automated evaluation** - Run RAG pipeline on test cases and compute metrics - **Comprehensive metrics** - Precision, recall, F1 score, MRR, hit rate for retrieval quality - **Smart recommendations** - Analyze results and suggest specific configuration improvements (e.g., "increase top_k", "enable reranking") - **20+ REST API endpoints** - Full CRUD operations for datasets, test cases, and evaluation runs **Impact**: Enables developers to objectively measure RAG quality, identify issues, and systematically improve their RAG systems through data-driven configuration tuning. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@@ -1113,6 +1113,70 @@ class SyncLogs(DataBaseModel):
|
||||
db_table = "sync_logs"
|
||||
|
||||
|
||||
class EvaluationDataset(DataBaseModel):
    """Ground truth dataset for RAG evaluation.

    Top-level container for evaluation test cases: EvaluationCase rows point
    here via dataset_id, and EvaluationRun records an execution of the whole
    dataset against a dialog configuration.
    """
    # 32-char string primary key, matching the id convention of sibling models in this file.
    id = CharField(max_length=32, primary_key=True)
    # Owning tenant; indexed for per-tenant listing queries.
    tenant_id = CharField(max_length=32, null=False, index=True, help_text="tenant ID")
    # Human-readable dataset name; indexed for lookup/search by name.
    name = CharField(max_length=255, null=False, index=True, help_text="dataset name")
    # Optional free-form description.
    description = TextField(null=True, help_text="dataset description")
    # JSON list of knowledge-base IDs the evaluation retrieves against.
    kb_ids = JSONField(null=False, help_text="knowledge base IDs to evaluate against")
    # User who created the dataset.
    created_by = CharField(max_length=32, null=False, index=True, help_text="creator user ID")
    # NOTE(review): DataBaseModel may already declare create_time/update_time/status
    # in the base class — confirm these redefinitions are intentional overrides.
    create_time = BigIntegerField(null=False, index=True, help_text="creation timestamp")
    update_time = BigIntegerField(null=False, help_text="last update timestamp")
    # Soft-delete / validity flag: 1=valid, 0=invalid.
    status = IntegerField(null=False, default=1, help_text="1=valid, 0=invalid")

    class Meta:
        db_table = "evaluation_datasets"
|
||||
|
||||
|
||||
class EvaluationCase(DataBaseModel):
    """Individual test case in an evaluation dataset.

    Stores one question with its optional ground truth (reference answer and
    expected relevant documents/chunks) used to score retrieval and generation.
    """
    # 32-char string primary key.
    id = CharField(max_length=32, primary_key=True)
    # Parent dataset; logical FK to evaluation_datasets.id (no DB-level constraint declared).
    dataset_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_datasets")
    # The question posed to the RAG pipeline.
    question = TextField(null=False, help_text="test question")
    # Optional expected answer; nullable so retrieval-only cases are allowed.
    reference_answer = TextField(null=True, help_text="optional ground truth answer")
    # JSON list of document IDs expected to be retrieved for this question.
    relevant_doc_ids = JSONField(null=True, help_text="expected relevant document IDs")
    # JSON list of chunk IDs expected to be retrieved for this question.
    relevant_chunk_ids = JSONField(null=True, help_text="expected relevant chunk IDs")
    # Free-form JSON for tags/context.
    # NOTE(review): the attribute name "metadata" is stored as a column; Peewee's own
    # model metadata lives on Model._meta, so no clash — but verify no base-class field shadowing.
    metadata = JSONField(null=True, help_text="additional context/tags")
    # Creation timestamp (epoch-style big integer, per file convention).
    create_time = BigIntegerField(null=False, help_text="creation timestamp")

    class Meta:
        db_table = "evaluation_cases"
|
||||
|
||||
|
||||
class EvaluationRun(DataBaseModel):
    """A single evaluation run.

    One execution of a dataset's test cases against a dialog configuration;
    per-case outcomes are stored in EvaluationResult rows keyed by run_id.
    """
    # 32-char string primary key.
    id = CharField(max_length=32, primary_key=True)
    # Dataset being evaluated; logical FK to evaluation_datasets.id.
    dataset_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_datasets")
    # Dialog whose configuration is under evaluation.
    dialog_id = CharField(max_length=32, null=False, index=True, help_text="dialog configuration being evaluated")
    # Human-readable run name.
    name = CharField(max_length=255, null=False, help_text="run name")
    # Frozen copy of the dialog config so results stay interpretable after config changes.
    config_snapshot = JSONField(null=False, help_text="dialog config at time of evaluation")
    # Aggregated metrics over all cases; null until the run completes.
    metrics_summary = JSONField(null=True, help_text="aggregated metrics")
    # Run lifecycle state machine, stored as an upper-case string.
    # NOTE(review): sibling models use IntegerField for status — confirm the
    # string-typed status here is intentional.
    status = CharField(max_length=32, null=False, default="PENDING", help_text="PENDING/RUNNING/COMPLETED/FAILED")
    # User who launched the run.
    created_by = CharField(max_length=32, null=False, index=True, help_text="user who started the run")
    # Start timestamp; indexed for chronological listing.
    create_time = BigIntegerField(null=False, index=True, help_text="creation timestamp")
    # Null while PENDING/RUNNING; set when the run finishes.
    complete_time = BigIntegerField(null=True, help_text="completion timestamp")

    class Meta:
        db_table = "evaluation_runs"
|
||||
|
||||
|
||||
class EvaluationResult(DataBaseModel):
    """Result for a single test case in an evaluation run.

    Join of (run, case) carrying the generated answer, the retrieved evidence,
    and the computed metrics for that one question.
    """
    # 32-char string primary key.
    id = CharField(max_length=32, primary_key=True)
    # Owning run; logical FK to evaluation_runs.id.
    run_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_runs")
    # Test case this result answers; logical FK to evaluation_cases.id.
    case_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_cases")
    # The answer produced by the RAG pipeline for this case.
    generated_answer = TextField(null=False, help_text="generated answer")
    # JSON payload of the retrieved chunks (used to compute retrieval metrics).
    retrieved_chunks = JSONField(null=False, help_text="chunks that were retrieved")
    # Per-case metric values (e.g. precision/recall/F1/MRR — see PR description).
    metrics = JSONField(null=False, help_text="all computed metrics")
    # End-to-end latency for this case, in seconds.
    execution_time = FloatField(null=False, help_text="response time in seconds")
    # Optional token accounting; null when the LLM backend reports no usage.
    token_usage = JSONField(null=True, help_text="prompt/completion tokens")
    # Creation timestamp (epoch-style big integer, per file convention).
    create_time = BigIntegerField(null=False, help_text="creation timestamp")

    class Meta:
        db_table = "evaluation_results"
|
||||
|
||||
|
||||
def migrate_db():
|
||||
logging.disable(logging.ERROR)
|
||||
migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
|
||||
@@ -1293,4 +1357,43 @@ def migrate_db():
|
||||
migrate(migrator.add_column("llm_factories", "rank", IntegerField(default=0, index=False)))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# RAG Evaluation tables
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "id", CharField(max_length=32, primary_key=True)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "tenant_id", CharField(max_length=32, null=False, index=True)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "name", CharField(max_length=255, null=False, index=True)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "description", TextField(null=True)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "kb_ids", JSONField(null=False)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "created_by", CharField(max_length=32, null=False, index=True)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "create_time", BigIntegerField(null=False, index=True)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "update_time", BigIntegerField(null=False)))
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
migrate(migrator.add_column("evaluation_datasets", "status", IntegerField(null=False, default=1)))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
logging.disable(logging.NOTSET)
|
||||
|
||||
Reference in New Issue
Block a user