mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-01 08:05:07 +08:00
Feat: Benchmark CLI additions and documentation (#12536)
### What problem does this PR solve? This PR adds a dedicated HTTP benchmark CLI for RAGFlow chat and retrieval endpoints so we can measure latency/QPS. ### Type of change - [x] Documentation Update - [x] Other (please describe): Adds a CLI benchmarking tool for chat/retrieval latency/QPS --------- Co-authored-by: Liu An <asiro@qq.com>
This commit is contained in:
105
test/benchmark/report.py
Normal file
105
test/benchmark/report.py
Normal file
@ -0,0 +1,105 @@
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
def _fmt_seconds(value: Optional[float]) -> str:
|
||||
if value is None:
|
||||
return "n/a"
|
||||
return f"{value:.4f}s"
|
||||
|
||||
|
||||
def _fmt_ms(value: Optional[float]) -> str:
|
||||
if value is None:
|
||||
return "n/a"
|
||||
return f"{value * 1000.0:.2f}ms"
|
||||
|
||||
|
||||
def _fmt_qps(qps: Optional[float]) -> str:
|
||||
if qps is None or qps <= 0:
|
||||
return "n/a"
|
||||
return f"{qps:.2f}"
|
||||
|
||||
|
||||
def _calc_qps(total_duration_s: Optional[float], total_requests: int) -> Optional[float]:
|
||||
if total_duration_s is None or total_duration_s <= 0:
|
||||
return None
|
||||
return total_requests / total_duration_s
|
||||
|
||||
|
||||
def render_report(lines: List[str]) -> str:
|
||||
return "\n".join(lines).strip() + "\n"
|
||||
|
||||
|
||||
def chat_report(
|
||||
*,
|
||||
interface: str,
|
||||
concurrency: int,
|
||||
total_duration_s: Optional[float],
|
||||
iterations: int,
|
||||
success: int,
|
||||
failure: int,
|
||||
model: str,
|
||||
total_stats: Dict[str, Optional[float]],
|
||||
first_token_stats: Dict[str, Optional[float]],
|
||||
errors: List[str],
|
||||
created: Dict[str, str],
|
||||
) -> str:
|
||||
lines = [
|
||||
f"Interface: {interface}",
|
||||
f"Concurrency: {concurrency}",
|
||||
f"Iterations: {iterations}",
|
||||
f"Success: {success}",
|
||||
f"Failure: {failure}",
|
||||
f"Model: {model}",
|
||||
]
|
||||
for key, value in created.items():
|
||||
lines.append(f"{key}: {value}")
|
||||
lines.extend(
|
||||
[
|
||||
"Latency (total): "
|
||||
f"avg={_fmt_ms(total_stats['avg'])}, min={_fmt_ms(total_stats['min'])}, "
|
||||
f"p50={_fmt_ms(total_stats['p50'])}, p90={_fmt_ms(total_stats['p90'])}, p95={_fmt_ms(total_stats['p95'])}",
|
||||
"Latency (first token): "
|
||||
f"avg={_fmt_ms(first_token_stats['avg'])}, min={_fmt_ms(first_token_stats['min'])}, "
|
||||
f"p50={_fmt_ms(first_token_stats['p50'])}, p90={_fmt_ms(first_token_stats['p90'])}, p95={_fmt_ms(first_token_stats['p95'])}",
|
||||
f"Total Duration: {_fmt_seconds(total_duration_s)}",
|
||||
f"QPS (requests / total duration): {_fmt_qps(_calc_qps(total_duration_s, iterations))}",
|
||||
]
|
||||
)
|
||||
if errors:
|
||||
lines.append("Errors: " + "; ".join(errors[:5]))
|
||||
return render_report(lines)
|
||||
|
||||
|
||||
def retrieval_report(
|
||||
*,
|
||||
interface: str,
|
||||
concurrency: int,
|
||||
total_duration_s: Optional[float],
|
||||
iterations: int,
|
||||
success: int,
|
||||
failure: int,
|
||||
stats: Dict[str, Optional[float]],
|
||||
errors: List[str],
|
||||
created: Dict[str, str],
|
||||
) -> str:
|
||||
lines = [
|
||||
f"Interface: {interface}",
|
||||
f"Concurrency: {concurrency}",
|
||||
f"Iterations: {iterations}",
|
||||
f"Success: {success}",
|
||||
f"Failure: {failure}",
|
||||
]
|
||||
for key, value in created.items():
|
||||
lines.append(f"{key}: {value}")
|
||||
lines.extend(
|
||||
[
|
||||
"Latency: "
|
||||
f"avg={_fmt_ms(stats['avg'])}, min={_fmt_ms(stats['min'])}, "
|
||||
f"p50={_fmt_ms(stats['p50'])}, p90={_fmt_ms(stats['p90'])}, p95={_fmt_ms(stats['p95'])}",
|
||||
f"Total Duration: {_fmt_seconds(total_duration_s)}",
|
||||
f"QPS (requests / total duration): {_fmt_qps(_calc_qps(total_duration_s, iterations))}",
|
||||
]
|
||||
)
|
||||
if errors:
|
||||
lines.append("Errors: " + "; ".join(errors[:5]))
|
||||
return render_report(lines)
|
||||
Reference in New Issue
Block a user