Feat: RAG evaluation (#11674)

### What problem does this PR solve?

Feature: This PR implements a comprehensive RAG evaluation framework to address issue #11656.

**Problem**: Developers using RAGFlow lack a systematic way to measure RAG accuracy and quality. They cannot objectively answer:

1. Are RAG results truly accurate?
2. How should configurations be adjusted to improve quality?
3. How can RAG performance be maintained and improved over time?

**Solution**: This PR adds a complete evaluation system with:

- **Dataset & test case management** - Create ground truth datasets with questions and expected answers
- **Automated evaluation** - Run the RAG pipeline on test cases and compute metrics
- **Comprehensive metrics** - Precision, recall, F1 score, MRR, and hit rate for retrieval quality
- **Smart recommendations** - Analyze results and suggest specific configuration improvements (e.g., "increase top_k", "enable reranking")
- **20+ REST API endpoints** - Full CRUD operations for datasets, test cases, and evaluation runs

**Impact**: Enables developers to objectively measure RAG quality, identify issues, and systematically improve their RAG systems through data-driven configuration tuning.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-12-08 20:42:30 +08:00 · 2025-12-03 04:00:58 -05:00
parent 3c50c7d3ac
commit 237a66913b
5 changed files with 2060 additions and 0 deletions
--- a/api/db/db_models.py
+++ b/api/db/db_models.py
@ -1113,6 +1113,70 @@ class SyncLogs(DataBaseModel):
        db_table = "sync_logs"


+class EvaluationDataset(DataBaseModel):
+    """Ground truth dataset for RAG evaluation.
+
+    Each row is a named dataset that carries a tenant ID, the ID of the
+    user who created it, and the knowledge base IDs it is evaluated
+    against. Rows are stored in the ``evaluation_datasets`` table.
+    """
+    # String primary key, at most 32 characters.
+    id = CharField(max_length=32, primary_key=True)
+    # Indexed so datasets can be looked up by tenant.
+    tenant_id = CharField(max_length=32, null=False, index=True, help_text="tenant ID")
+    name = CharField(max_length=255, null=False, index=True, help_text="dataset name")
+    # Optional free-text description.
+    description = TextField(null=True, help_text="dataset description")
+    # Required JSON value; presumably a list of knowledge base ID strings -- TODO confirm against callers.
+    kb_ids = JSONField(null=False, help_text="knowledge base IDs to evaluate against")
+    created_by = CharField(max_length=32, null=False, index=True, help_text="creator user ID")
+    # Timestamps are stored as big integers; the unit (seconds vs. milliseconds)
+    # is not established by this model -- TODO confirm.
+    create_time = BigIntegerField(null=False, index=True, help_text="creation timestamp")
+    update_time = BigIntegerField(null=False, help_text="last update timestamp")
+    # Validity flag: 1 = valid (the default), 0 = invalid.
+    status = IntegerField(null=False, default=1, help_text="1=valid, 0=invalid")
+
+    class Meta:
+        db_table = "evaluation_datasets"
+
+
+class EvaluationCase(DataBaseModel):
+    """Individual test case in an evaluation dataset.
+
+    A case is a question plus optional ground truth: a reference answer
+    and/or the document and chunk IDs expected to be relevant. Rows are
+    stored in the ``evaluation_cases`` table.
+    """
+    # String primary key, at most 32 characters.
+    id = CharField(max_length=32, primary_key=True)
+    # Reference to the owning dataset. Declared as a plain CharField, so no
+    # foreign-key constraint is defined by this model.
+    dataset_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_datasets")
+    # The only required content field of a case.
+    question = TextField(null=False, help_text="test question")
+    # All ground-truth fields below are nullable, i.e. optional per case.
+    reference_answer = TextField(null=True, help_text="optional ground truth answer")
+    relevant_doc_ids = JSONField(null=True, help_text="expected relevant document IDs")
+    relevant_chunk_ids = JSONField(null=True, help_text="expected relevant chunk IDs")
+    # Free-form JSON; the schema is not defined here.
+    metadata = JSONField(null=True, help_text="additional context/tags")
+    # Stored as a big integer; unit not established by this model -- TODO confirm.
+    create_time = BigIntegerField(null=False, help_text="creation timestamp")
+
+    class Meta:
+        db_table = "evaluation_cases"
+
+
+class EvaluationRun(DataBaseModel):
+    """A single evaluation run.
+
+    Ties one evaluation dataset to one dialog configuration and records
+    the run's status, a snapshot of the dialog config, and (once
+    available) aggregated metrics. Rows are stored in the
+    ``evaluation_runs`` table.
+    """
+    # String primary key, at most 32 characters.
+    id = CharField(max_length=32, primary_key=True)
+    # References below are plain CharFields; no foreign-key constraints are
+    # defined by this model.
+    dataset_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_datasets")
+    dialog_id = CharField(max_length=32, null=False, index=True, help_text="dialog configuration being evaluated")
+    name = CharField(max_length=255, null=False, help_text="run name")
+    # Required: copy of the dialog config taken at evaluation time.
+    config_snapshot = JSONField(null=False, help_text="dialog config at time of evaluation")
+    # Nullable; presumably filled in once results are aggregated -- TODO confirm.
+    metrics_summary = JSONField(null=True, help_text="aggregated metrics")
+    # Run state stored as a string; new rows default to "PENDING".
+    status = CharField(max_length=32, null=False, default="PENDING", help_text="PENDING/RUNNING/COMPLETED/FAILED")
+    created_by = CharField(max_length=32, null=False, index=True, help_text="user who started the run")
+    # Timestamps are big integers; unit not established by this model -- TODO confirm.
+    create_time = BigIntegerField(null=False, index=True, help_text="creation timestamp")
+    # Nullable because a run that has not finished has no completion time.
+    complete_time = BigIntegerField(null=True, help_text="completion timestamp")
+
+    class Meta:
+        db_table = "evaluation_runs"
+
+
+class EvaluationResult(DataBaseModel):
+    """Result for a single test case in an evaluation run.
+
+    Stores the generated answer, the retrieved chunks, the computed
+    metrics and timing for one (run, case) pair. Rows are stored in the
+    ``evaluation_results`` table.
+    """
+    # String primary key, at most 32 characters.
+    id = CharField(max_length=32, primary_key=True)
+    # References are plain CharFields; no foreign-key constraints are defined
+    # by this model.
+    run_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_runs")
+    case_id = CharField(max_length=32, null=False, index=True, help_text="FK to evaluation_cases")
+    # Required fields: a result row cannot be stored without an answer,
+    # the retrieved chunks, metrics and an execution time.
+    generated_answer = TextField(null=False, help_text="generated answer")
+    retrieved_chunks = JSONField(null=False, help_text="chunks that were retrieved")
+    # JSON value holding the per-case metrics; the key set is not defined here.
+    metrics = JSONField(null=False, help_text="all computed metrics")
+    execution_time = FloatField(null=False, help_text="response time in seconds")
+    # Optional token accounting.
+    token_usage = JSONField(null=True, help_text="prompt/completion tokens")
+    # Stored as a big integer; unit not established by this model -- TODO confirm.
+    create_time = BigIntegerField(null=False, help_text="creation timestamp")
+
+    class Meta:
+        db_table = "evaluation_results"
+
+
 def migrate_db():
    logging.disable(logging.ERROR)
    migrator = DatabaseMigrator[settings.DATABASE_TYPE.upper()].value(DB)
@ -1293,4 +1357,43 @@ def migrate_db():
        migrate(migrator.add_column("llm_factories", "rank", IntegerField(default=0, index=False)))
    except Exception:
        pass
+    
+    # RAG Evaluation tables
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "id", CharField(max_length=32, primary_key=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "tenant_id", CharField(max_length=32, null=False, index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "name", CharField(max_length=255, null=False, index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "description", TextField(null=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "kb_ids", JSONField(null=False)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "created_by", CharField(max_length=32, null=False, index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "create_time", BigIntegerField(null=False, index=True)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "update_time", BigIntegerField(null=False)))
+    except Exception:
+        pass
+    try:
+        migrate(migrator.add_column("evaluation_datasets", "status", IntegerField(null=False, default=1)))
+    except Exception:
+        pass
+    
    logging.disable(logging.NOTSET)