Feat: Benchmark CLI additions and documentation (#12536)

### What problem does this PR solve?

This PR adds a dedicated HTTP benchmark CLI for RAGFlow chat and retrieval endpoints so we can measure latency/QPS.

### Type of change

- [x] Documentation Update
- [x] Other (please describe): Adds a CLI benchmarking tool for chat/retrieval latency/QPS

---------

Co-authored-by: Liu An <asiro@qq.com>
2026-02-01 08:05:07 +08:00 · 2026-01-14 13:49:16 +08:00
parent a7671583b3
commit 5b22f94502
20 changed files with 1978 additions and 0 deletions
--- a/test/benchmark/report.py
+++ b/test/benchmark/report.py
@ -0,0 +1,105 @@
+from typing import Dict, List, Optional
+
+
+def _fmt_seconds(value: Optional[float]) -> str:
+    """Format a duration given in seconds with four decimals and an ``s`` suffix.
+
+    Returns ``"n/a"`` when *value* is ``None``.
+    """
+    return "n/a" if value is None else f"{value:.4f}s"
+
+
+def _fmt_ms(value: Optional[float]) -> str:
+    """Format a duration given in seconds as milliseconds with two decimals.
+
+    Returns ``"n/a"`` when *value* is ``None``.
+    """
+    if value is not None:
+        millis = value * 1000.0
+        return f"{millis:.2f}ms"
+    return "n/a"
+
+
+def _fmt_qps(qps: Optional[float]) -> str:
+    """Format a queries-per-second figure with two decimals.
+
+    ``None`` and values less than or equal to zero are rendered as ``"n/a"``.
+    """
+    unavailable = qps is None or qps <= 0
+    return "n/a" if unavailable else f"{qps:.2f}"
+
+
+def _calc_qps(total_duration_s: Optional[float], total_requests: int) -> Optional[float]:
+    """Return ``total_requests / total_duration_s``, or ``None`` if the duration is unusable.
+
+    A duration that is ``None``, zero or negative yields ``None``, so the
+    division is never attempted with a zero denominator.
+    """
+    unusable = total_duration_s is None or total_duration_s <= 0
+    return None if unusable else total_requests / total_duration_s
+
+
+def render_report(lines: List[str]) -> str:
+    """Join *lines* with newlines into the final report text.
+
+    Leading and trailing whitespace of the joined text is stripped, then
+    exactly one trailing newline is appended.
+    """
+    joined = "\n".join(lines)
+    return f"{joined.strip()}\n"
+
+
+# Statistic keys rendered on every latency line, in display order.
+_LATENCY_KEYS = ("avg", "min", "p50", "p90", "p95")
+
+
+def _latency_line(label: str, stats: Dict[str, Optional[float]]) -> str:
+    """Render one ``<label>: avg=..., min=..., p50=..., p90=..., p95=...`` line.
+
+    Values are passed through ``_fmt_ms``, i.e. treated as seconds and shown
+    in milliseconds. A key that is missing from *stats* is rendered as
+    ``n/a`` (the same as a ``None`` value) instead of raising ``KeyError``,
+    so an incomplete stats dict cannot abort the whole report.
+    """
+    rendered = ", ".join(f"{key}={_fmt_ms(stats.get(key))}" for key in _LATENCY_KEYS)
+    return f"{label}: {rendered}"
+
+
+def _throughput_lines(total_duration_s: Optional[float], iterations: int) -> List[str]:
+    """Return the ``Total Duration`` and ``QPS`` lines shared by both reports."""
+    return [
+        f"Total Duration: {_fmt_seconds(total_duration_s)}",
+        f"QPS (requests / total duration): {_fmt_qps(_calc_qps(total_duration_s, iterations))}",
+    ]
+
+
+def _errors_line(errors: List[str], max_errors: int) -> List[str]:
+    """Return the ``Errors:`` line as a list with zero or one element.
+
+    Only the first *max_errors* messages are included. Nothing is returned
+    when there are no errors or when *max_errors* is not positive.
+    """
+    if not errors or max_errors <= 0:
+        return []
+    return ["Errors: " + "; ".join(errors[:max_errors])]
+
+
+def chat_report(
+    *,
+    interface: str,
+    concurrency: int,
+    total_duration_s: Optional[float],
+    iterations: int,
+    success: int,
+    failure: int,
+    model: str,
+    total_stats: Dict[str, Optional[float]],
+    first_token_stats: Dict[str, Optional[float]],
+    errors: List[str],
+    created: Dict[str, str],
+    max_errors: int = 5,
+) -> str:
+    """Build the plain-text report for a chat benchmark run.
+
+    Args:
+        interface: Value printed on the ``Interface`` line.
+        concurrency: Value printed on the ``Concurrency`` line.
+        total_duration_s: Total duration in seconds. ``None`` is printed as
+            ``n/a``; ``None`` or a non-positive value makes QPS ``n/a``.
+        iterations: Request count; also the numerator of the QPS figure.
+        success: Value printed on the ``Success`` line.
+        failure: Value printed on the ``Failure`` line.
+        model: Value printed on the ``Model`` line.
+        total_stats: Total-latency statistics keyed by ``avg``, ``min``,
+            ``p50``, ``p90`` and ``p95``; missing or ``None`` entries are
+            printed as ``n/a``.
+        first_token_stats: First-token latency statistics with the same keys.
+        errors: Error messages; only the first *max_errors* are printed.
+        created: Extra ``key: value`` lines inserted after the header lines.
+        max_errors: Maximum number of error messages to print (default 5).
+
+    Returns:
+        The report text, terminated by a single newline.
+    """
+    lines = [
+        f"Interface: {interface}",
+        f"Concurrency: {concurrency}",
+        f"Iterations: {iterations}",
+        f"Success: {success}",
+        f"Failure: {failure}",
+        f"Model: {model}",
+    ]
+    lines.extend(f"{key}: {value}" for key, value in created.items())
+    lines.append(_latency_line("Latency (total)", total_stats))
+    lines.append(_latency_line("Latency (first token)", first_token_stats))
+    lines.extend(_throughput_lines(total_duration_s, iterations))
+    lines.extend(_errors_line(errors, max_errors))
+    return render_report(lines)
+
+
+def retrieval_report(
+    *,
+    interface: str,
+    concurrency: int,
+    total_duration_s: Optional[float],
+    iterations: int,
+    success: int,
+    failure: int,
+    stats: Dict[str, Optional[float]],
+    errors: List[str],
+    created: Dict[str, str],
+    max_errors: int = 5,
+) -> str:
+    """Build the plain-text report for a retrieval benchmark run.
+
+    Args:
+        interface: Value printed on the ``Interface`` line.
+        concurrency: Value printed on the ``Concurrency`` line.
+        total_duration_s: Total duration in seconds. ``None`` is printed as
+            ``n/a``; ``None`` or a non-positive value makes QPS ``n/a``.
+        iterations: Request count; also the numerator of the QPS figure.
+        success: Value printed on the ``Success`` line.
+        failure: Value printed on the ``Failure`` line.
+        stats: Latency statistics keyed by ``avg``, ``min``, ``p50``, ``p90``
+            and ``p95``; missing or ``None`` entries are printed as ``n/a``.
+        errors: Error messages; only the first *max_errors* are printed.
+        created: Extra ``key: value`` lines inserted after the header lines.
+        max_errors: Maximum number of error messages to print (default 5).
+
+    Returns:
+        The report text, terminated by a single newline.
+    """
+    lines = [
+        f"Interface: {interface}",
+        f"Concurrency: {concurrency}",
+        f"Iterations: {iterations}",
+        f"Success: {success}",
+        f"Failure: {failure}",
+    ]
+    lines.extend(f"{key}: {value}" for key, value in created.items())
+    lines.append(_latency_line("Latency", stats))
+    lines.extend(_throughput_lines(total_duration_s, iterations))
+    lines.extend(_errors_line(errors, max_errors))
+    return render_report(lines)