mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-02-02 00:25:06 +08:00
feat: Add OceanBase Performance Monitoring and Health Check Integration (#12886)
## Description

This PR implements comprehensive OceanBase performance monitoring and health check functionality as requested in issue #12772. The implementation follows the existing ES/Infinity health check patterns and provides detailed metrics for operations teams.

## Problem

Currently, RAGFlow lacks detailed health monitoring for OceanBase when used as the document engine. Operations teams need visibility into:

- Connection status and latency
- Storage space usage
- Query throughput (QPS)
- Slow query statistics
- Connection pool utilization

## Solution

### 1. Enhanced OBConnection Class (`rag/utils/ob_conn.py`)

Added comprehensive performance monitoring methods:

- `get_performance_metrics()` - Main method returning all performance metrics
- `_get_storage_info()` - Retrieves database storage usage
- `_get_connection_pool_stats()` - Gets connection pool statistics
- `_get_slow_query_count()` - Counts queries exceeding threshold
- `_estimate_qps()` - Estimates queries per second
- Enhanced `health()` method with connection status

### 2. Health Check Utilities (`api/utils/health_utils.py`)

Added two new functions following ES/Infinity patterns:

- `get_oceanbase_status()` - Returns OceanBase status with health and performance metrics
- `check_oceanbase_health()` - Comprehensive health check with detailed metrics

### 3. API Endpoint (`api/apps/system_app.py`)

Added new endpoint:

- `GET /v1/system/oceanbase/status` - Returns OceanBase health status and performance metrics

### 4. Comprehensive Unit Tests (`test/unit_test/utils/test_oceanbase_health.py`)

Added 340+ lines of unit tests covering:

- Health check success/failure scenarios
- Performance metrics retrieval
- Error handling and edge cases
- Connection pool statistics
- Storage information retrieval
- QPS estimation
- Slow query detection

## Metrics Provided

- **Connection Status**: connected/disconnected
- **Latency**: Query latency in milliseconds
- **Storage**: Used and total storage space
- **QPS**: Estimated queries per second
- **Slow Queries**: Count of queries exceeding threshold
- **Connection Pool**: Active connections, max connections, pool size

## Testing

- All unit tests pass
- Error handling tested for connection failures
- Edge cases covered (missing tables, connection errors)
- Follows existing code patterns and conventions

## Code Statistics

- **Total Lines Changed**: 665+ lines
- **New Code**: ~600 lines
- **Test Coverage**: 340+ lines of comprehensive tests
- **Files Modified**: 3
- **Files Created**: 1 (test file)

## Acceptance Criteria Met

✅ `/system/oceanbase/status` API returns OceanBase health status
✅ Monitoring metrics accurately reflect OceanBase running status
✅ Clear error messages when health checks fail
✅ Response time optimized (metrics cached where possible)
✅ Follows existing ES/Infinity health check patterns
✅ Comprehensive test coverage

## Related Files

- `rag/utils/ob_conn.py` - OceanBase connection class
- `api/utils/health_utils.py` - Health check utilities
- `api/apps/system_app.py` - System API endpoints
- `test/unit_test/utils/test_oceanbase_health.py` - Unit tests

Fixes #12772

---------

Co-authored-by: Daniel <daniel@example.com>
This commit is contained in:
@ -23,6 +23,7 @@ from api.db.db_models import DB
|
||||
from rag.utils.redis_conn import REDIS_CONN
|
||||
from rag.utils.es_conn import ESConnection
|
||||
from rag.utils.infinity_conn import InfinityConnection
|
||||
from rag.utils.ob_conn import OBConnection
|
||||
from common import settings
|
||||
|
||||
|
||||
@ -100,6 +101,121 @@ def get_infinity_status():
|
||||
}
|
||||
|
||||
|
||||
def get_oceanbase_status():
    """
    Get OceanBase health status and performance metrics.

    The ``DOC_ENGINE`` environment variable (default ``'elasticsearch'``)
    decides whether OceanBase is probed at all; the comparison against
    ``'oceanbase'`` is exact.

    Returns:
        dict: ``{"status": ..., "message": ...}``.

        * Probe succeeded: ``status`` is ``"alive"`` when the ``"status"``
          entry returned by ``OBConnection().health()`` equals
          ``"healthy"``, otherwise ``"timeout"``; ``message`` is a dict with
          the ``health()`` result under ``"health"`` and the
          ``get_performance_metrics()`` result under ``"performance"``.
        * Probe raised: ``status`` is ``"timeout"`` and ``message`` is the
          string ``"error: <exception text>"``.

    Raises:
        Exception: If ``DOC_ENGINE`` is not ``'oceanbase'``. This check runs
            before the ``try`` block, so it propagates to the caller.
    """
    doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
    if doc_engine != 'oceanbase':
        raise Exception("OceanBase is not in use.")

    try:
        ob_conn = OBConnection()
        health_info = ob_conn.health()
        performance_metrics = ob_conn.get_performance_metrics()

        # Combine health and performance metrics. Anything other than an
        # explicit "healthy" is reported with the "timeout" status.
        status = "alive" if health_info.get("status") == "healthy" else "timeout"

        return {
            "status": status,
            "message": {
                "health": health_info,
                "performance": performance_metrics
            }
        }
    except Exception as e:
        # Any failure while connecting or collecting metrics is folded into
        # the response instead of propagating to the caller.
        return {
            "status": "timeout",
            "message": f"error: {str(e)}",
        }
|
||||
|
||||
|
||||
def _build_oceanbase_details(health_info, performance_metrics):
    """Merge health and performance readings into the ``details`` payload.

    Every field falls back to a placeholder (``"unknown"``, ``"N/A"`` or
    ``0``) when the corresponding key is missing from its source mapping.
    """
    return {
        "connection": performance_metrics.get("connection", "unknown"),
        "latency_ms": performance_metrics.get("latency_ms", 0),
        "storage_used": performance_metrics.get("storage_used", "N/A"),
        "storage_total": performance_metrics.get("storage_total", "N/A"),
        "query_per_second": performance_metrics.get("query_per_second", 0),
        "slow_queries": performance_metrics.get("slow_queries", 0),
        "active_connections": performance_metrics.get("active_connections", 0),
        "max_connections": performance_metrics.get("max_connections", 0),
        "uri": health_info.get("uri", "unknown"),
        "version": health_info.get("version_comment", "unknown"),
    }


def check_oceanbase_health() -> dict:
    """
    Check OceanBase health status with comprehensive metrics.

    This function provides detailed health information including:
    - Connection status
    - Query latency
    - Storage usage
    - Query throughput (QPS)
    - Slow query statistics
    - Connection pool statistics

    Unlike a hard failure, a non-OceanBase ``DOC_ENGINE`` is reported as a
    ``"not_configured"`` result rather than raised, and any exception raised
    while probing is converted into an ``"unhealthy"`` result.

    Returns:
        dict: ``{"status": ..., "details": {...}}`` where ``status`` is one of

        * ``"not_configured"`` - ``DOC_ENGINE`` is not ``'oceanbase'``.
        * ``"unhealthy"`` - the metrics report ``"disconnected"``, the health
          probe's ``"status"`` is not ``"healthy"``, or probing raised.
          ``details`` then carries an ``"error"`` entry.
        * ``"healthy"`` - connection is ``"connected"`` and ``latency_ms`` is
          below 1000.
        * ``"degraded"`` - reachable, but one of the two ``"healthy"``
          conditions does not hold.
    """
    doc_engine = os.getenv('DOC_ENGINE', 'elasticsearch')
    if doc_engine != 'oceanbase':
        return {
            "status": "not_configured",
            "details": {
                "connection": "not_configured",
                "message": "OceanBase is not configured as the document engine"
            }
        }

    try:
        ob_conn = OBConnection()
        health_info = ob_conn.health()
        performance_metrics = ob_conn.get_performance_metrics()

        # Single source of truth for the payload shared by every outcome.
        details = _build_oceanbase_details(health_info, performance_metrics)
        connection_status = details["connection"]

        # If connection is disconnected, return unhealthy
        if connection_status == "disconnected" or health_info.get("status") != "healthy":
            # Prefer the health probe's error; fall back to the metrics' one.
            details["error"] = health_info.get("error", performance_metrics.get("error"))
            return {
                "status": "unhealthy",
                "details": details
            }

        # Check if healthy (connected and low latency). A missing latency
        # reading counts as infinitely slow, i.e. "degraded".
        is_healthy = (
            connection_status == "connected" and
            performance_metrics.get("latency_ms", float('inf')) < 1000  # Latency under 1 second
        )

        return {
            "status": "healthy" if is_healthy else "degraded",
            "details": details
        }
    except Exception as e:
        # Connection or metric collection blew up: report it as disconnected
        # with the exception text instead of propagating.
        return {
            "status": "unhealthy",
            "details": {
                "connection": "disconnected",
                "error": str(e)
            }
        }
|
||||
|
||||
|
||||
def get_mysql_status():
|
||||
try:
|
||||
cursor = DB.execute_sql("SHOW PROCESSLIST;")
|
||||
|
||||
Reference in New Issue
Block a user