diff --git a/agent/tools/code_exec.py b/agent/tools/code_exec.py index 866a523ab..b94dc8d5e 100644 --- a/agent/tools/code_exec.py +++ b/agent/tools/code_exec.py @@ -157,7 +157,7 @@ class CodeExec(ToolBase, ABC): try: resp = requests.post(url=f"http://{settings.SANDBOX_HOST}:9385/run", json=code_req, timeout=os.environ.get("COMPONENT_EXEC_TIMEOUT", 10*60)) - logging.info(f"http://{settings.SANDBOX_HOST}:9385/run", code_req, resp.status_code) + logging.info(f"http://{settings.SANDBOX_HOST}:9385/run, code_req: {code_req}, resp.status_code {resp.status_code}:") if resp.status_code != 200: resp.raise_for_status() body = resp.json() diff --git a/api/apps/HEALTHCHECK_TESTING.md b/api/apps/HEALTHCHECK_TESTING.md new file mode 100644 index 000000000..a97a03c0e --- /dev/null +++ b/api/apps/HEALTHCHECK_TESTING.md @@ -0,0 +1,105 @@ +# 健康检查与 Kubernetes 探针简明说明 + +本文件说明:什么是 K8s 探针、如何用 `/v1/system/healthz` 做健康检查,以及下文用例中的关键词含义。 + +## 什么是 K8s 探针(Probe) +- 探针是 K8s 用来“探测”容器是否健康/可对外服务的机制。 +- 常见三类: + - livenessProbe:活性探针。失败时 K8s 会重启容器,用于“应用卡死/失去连接时自愈”。 + - readinessProbe:就绪探针。失败时 Endpoint 不会被加入 Service 负载均衡,用于“应用尚未准备好时不接流量”。 + - startupProbe:启动探针。给慢启动应用更长的初始化窗口,期间不执行 liveness/readiness。 +- 这些探针通常通过 HTTP GET 访问一个公开且轻量的健康端点(无需鉴权),以 HTTP 状态码判定结果:200=通过;5xx/超时=失败。 + +## 本项目健康端点 +- 已实现:`GET /v1/system/healthz`(无需认证)。 +- 语义: + - 200:关键依赖正常。 + - 500:任一关键依赖异常(当前判定为 DB 或 Chat)。 + - 响应体:JSON,最小字段 `status, db, chat`;并包含 `redis, doc_engine, storage` 等可观测项。失败项会在 `_meta` 中包含 `error/elapsed`。 +- 示例(DB 故障): +```json +{"status":"nok","chat":"ok","db":"nok"} +``` + +## 用例背景(Problem/use case) +- 现状:Ragflow 跑在 K8s,数据库是 AWS RDS Postgres,凭证由 Secret Manager 管理并每 7 天轮换。轮换后应用连接失效,需要手动重启 Pod 才能重新建立连接。 +- 目标:通过 K8s 探针自动化检测并重启异常 Pod,减少人工操作。 +- 需求:一个“无需鉴权”的公共健康端点,能在依赖异常时返回非 200(如 500)且提供 JSON 详情。 +- 现已满足:`/v1/system/healthz` 正是为此设计。 + +## 关键术语解释(对应你提供的描述) +- Ragflow instance:部署在 K8s 的 Ragflow 服务。 +- AWS RDS Postgres:托管的 PostgreSQL 数据库实例。 +- Secret Manager rotation:Secrets 定期轮换(每 7 天),会导致旧连接失效。 +- Probes(K8s 
探针):liveness/readiness,用于自动重启或摘除不健康实例。 +- Public endpoint without API key:无需 Authorization 的 HTTP 路由,便于探针直接访问。 +- Dependencies statuses:依赖健康状态(db、chat、redis、doc_engine、storage 等)。 +- HTTP 500 with JSON:当依赖异常时返回 500,并附带 JSON 说明哪个子系统失败。 + +## 快速测试 +- 正常(将 `YOUR_HOST` 替换为实际的服务地址与端口,下同): +```bash +curl -i http://YOUR_HOST/v1/system/healthz +``` +- 制造 DB 故障(docker-compose 示例): +```bash +docker compose stop db && curl -i http://YOUR_HOST/v1/system/healthz +``` +(预期 500,JSON 中 `db:"nok"`) + +## 更完整的测试清单 +### 1) 仅查看 HTTP 状态码 +```bash +curl -s -o /dev/null -w "%{http_code}\n" http://YOUR_HOST/v1/system/healthz +``` +期望:`200` 或 `500`。 + +### 2) Windows PowerShell +```powershell +# 状态码 +(Invoke-WebRequest -Uri "http://YOUR_HOST/v1/system/healthz" -Method GET -TimeoutSec 3 -ErrorAction SilentlyContinue).StatusCode +# 完整响应 +Invoke-RestMethod -Uri "http://YOUR_HOST/v1/system/healthz" -Method GET +``` + +### 3) 通过 kubectl 端口转发本地测试 +```bash +# 前端/网关暴露端口不同环境自行调整;将 YOUR_DEPLOYMENT / YOUR_NAMESPACE 替换为实际的 Deployment 名称与命名空间 +kubectl port-forward deploy/YOUR_DEPLOYMENT 8080:80 -n YOUR_NAMESPACE +curl -i http://127.0.0.1:8080/v1/system/healthz +``` + +### 4) 制造常见失败场景 +- DB 失败(推荐): +```bash +docker compose stop db +curl -i http://YOUR_HOST/v1/system/healthz # 预期 500 +``` +- Chat 失败(可选):将 `CHAT_CFG` 的 `factory` 置空并重启后端,再请求应为 500,且 `chat:"nok"`(当前检查只验证 `factory` 是否已配置,不会实际调用模型,因此仅把 `base_url` 改成无效值不会触发失败)。 +- Redis/存储/文档引擎:停用对应服务后再次请求,可在 JSON 中看到相应字段为 `"nok"`(不影响 200/500 判定)。 + +### 5) 浏览器验证 +- 直接打开 `http://YOUR_HOST/v1/system/healthz`,在 DevTools Network 查看 200/500;页面正文就是 JSON。 +- 反向代理注意:若有自定义 500 错页,需对 `/healthz` 关闭错误页拦截(如 `proxy_intercept_errors off;`)。 + +## K8s 探针示例 +```yaml +readinessProbe: + httpGet: + path: /v1/system/healthz + port: 80 + initialDelaySeconds: 5 + periodSeconds: 10 + timeoutSeconds: 2 + failureThreshold: 1 +livenessProbe: + httpGet: + path: /v1/system/healthz + port: 80 + initialDelaySeconds: 10 + periodSeconds: 10 + timeoutSeconds: 2 + failureThreshold: 3 +``` + +提示:如有反向代理(Nginx)自定义 500 错页,需对 `/healthz` 关闭错误页拦截,以便保留 JSON。 diff --git a/api/apps/system_app.py b/api/apps/system_app.py index c4a70bcac..df17e4b57 100644 --- a/api/apps/system_app.py +++ b/api/apps/system_app.py @@ -36,6 +36,8 @@ from 
def _ok_nok(ok: bool) -> str:
    """Translate a probe outcome into the status string used in the payload."""
    return "ok" if ok else "nok"


def _elapsed_ms(started: float) -> str:
    """Format the time since ``started`` (a ``timer()`` reading) in milliseconds."""
    return f"{(timer() - started) * 1000.0:.1f}"


def _timed(action) -> tuple[bool, dict]:
    """Run ``action`` and report ``(ok, meta)`` without ever raising.

    Args:
        action: Zero-argument callable returning ``(ok, extra)``. ``extra`` is
            a mapping merged into the metadata, or ``None`` for no extras.

    Returns:
        ``(ok, meta)``. ``meta`` always has ``"elapsed"`` (milliseconds, one
        decimal, as a string); keys from ``extra`` are merged after it and so
        take precedence. If ``action`` raises, or ``extra`` cannot be merged,
        the result is ``(False, {"elapsed": ..., "error": str(exc)})``.
    """
    started = timer()
    try:
        ok, extra = action()
        # Merge inside the try so a malformed ``extra`` is reported as a
        # failed probe instead of escaping to the caller.
        return bool(ok), {"elapsed": _elapsed_ms(started), **(extra or {})}
    except Exception as e:
        return False, {"elapsed": _elapsed_ms(started), "error": str(e)}


def check_db() -> tuple[bool, dict]:
    """Probe the database by executing ``SELECT 1``."""

    def probe():
        # lightweight probe; works for MySQL/Postgres
        DB.execute_sql("SELECT 1")
        return True, None

    return _timed(probe)


def check_redis() -> tuple[bool, dict]:
    """Probe Redis; healthy when ``REDIS_CONN.health()`` returns a truthy value."""

    def probe():
        return bool(REDIS_CONN.health()), None

    return _timed(probe)


def check_doc_engine() -> tuple[bool, dict]:
    """Probe the document store; healthy when its ``health()`` call returns.

    Whatever mapping ``health()`` returns is merged into the metadata.
    """

    def probe():
        # treat any successful call as ok
        # NOTE(review): the returned payload is not inspected -- confirm that
        # health() raises (rather than returning a "down" status) on failure.
        return True, settings.docStoreConn.health()

    return _timed(probe)


def check_storage() -> tuple[bool, dict]:
    """Probe object storage; healthy when ``STORAGE_IMPL.health()`` returns."""

    def probe():
        # NOTE(review): the return value is ignored -- confirm that health()
        # signals failure by raising.
        STORAGE_IMPL.health()
        return True, None

    return _timed(probe)


def check_chat() -> tuple[bool, dict]:
    """Check that a chat model is configured.

    This only verifies that ``settings.CHAT_CFG`` exists and has a truthy
    ``"factory"`` entry; it does not contact the model provider.
    """

    def probe():
        cfg = getattr(settings, "CHAT_CFG", None)
        return bool(cfg and cfg.get("factory")), None

    return _timed(probe)


def run_health_checks(critical: "dict | None" = None, optional: "dict | None" = None) -> tuple[dict, bool]:
    """Run dependency probes and build the health payload.

    Args:
        critical: Mapping of name -> zero-argument probe returning
            ``(ok, meta)``. Any failure here makes the overall status
            ``"nok"``. Defaults to ``db`` and ``chat``.
        optional: Same shape, but reported for observability only; failures
            do not affect the overall status. Defaults to ``redis``,
            ``doc_engine`` and ``storage``. A name that also appears in
            ``critical`` is skipped so the reported status cannot contradict
            the overall verdict.

    Returns:
        ``(result, all_ok)``. ``result`` maps each probe name to
        ``"ok"``/``"nok"``, carries ``"status"`` for the overall verdict, and,
        only when something failed, a ``"_meta"`` mapping of failed probe name
        to its metadata (``elapsed`` and, where available, ``error``).
        ``all_ok`` is ``True`` when every critical probe passed.

    NOTE(review): ``error`` values are raw exception text. The ``/healthz``
    route serving this payload has no login requirement, so confirm the
    messages cannot expose connection details (hosts, user names).
    """
    if critical is None:
        critical = {"db": check_db, "chat": check_chat}
    if optional is None:
        optional = {
            "redis": check_redis,
            "doc_engine": check_doc_engine,
            "storage": check_storage,
        }

    gating = set(critical)
    probes = list(critical.items())
    probes.extend((name, probe) for name, probe in optional.items() if name not in gating)

    result: dict = {}
    all_ok = True
    for name, probe in probes:
        # _timed doubles as the safety net: a probe that raises is reported
        # as "nok" with its error text instead of aborting the whole check.
        ok, meta = _timed(probe)
        result[name] = _ok_nok(ok)
        if not ok:
            result.setdefault("_meta", {})[name] = meta
            if name in gating:
                all_ok = False

    result["status"] = _ok_nok(all_ok)
    return result, all_ok
Markdown(int(parser_config.get("chunk_token_num", 128))) sections, tables = markdown_parser(filename, binary, separate_tables=False) - # Process images for each section - section_images = [] - for section_text, _ in sections: - images = markdown_parser.get_pictures(section_text) if section_text else None - if images: - # If multiple images found, combine them using concat_img - combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] - section_images.append(combined_image) - else: - section_images.append(None) + try: + vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT) + callback(0.2, "Visual model detected. Attempting to enhance figure extraction...") + except Exception: + vision_model = None + + if vision_model: + # Process images for each section + section_images = [] + for idx, (section_text, _) in enumerate(sections): + images = markdown_parser.get_pictures(section_text) if section_text else None + + if images: + # If multiple images found, combine them using concat_img + combined_image = reduce(concat_img, images) if len(images) > 1 else images[0] + section_images.append(combined_image) + markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs) + boosted_figures = markdown_vision_parser(callback=callback) + sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1]) + else: + section_images.append(None) + else: + logging.warning("No visual model detected. Skipping figure parsing enhancement.") res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.")