From 606f4e6c9e2792781f5712176927b575bedefc6a Mon Sep 17 00:00:00 2001 From: Liu An Date: Mon, 5 Jan 2026 10:02:42 +0800 Subject: [PATCH] Refa: improve TOC building with better error handling (#12427) ### What problem does this PR solve? Refactor TOC building logic to use enumerate instead of while loop, add comprehensive error handling for missing/invalid chunk_id values, and improve logging with more specific error messages. The changes make the code more robust against malformed TOC data while maintaining the same functionality for valid inputs. ### Type of change - [x] Refactoring --- rag/svr/task_executor.py | 32 +++++++++++++++++++++----------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 6dc2f929e..360d1c959 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -512,19 +512,29 @@ def build_TOC(task, docs, progress_callback): toc: list[dict] = asyncio.run( run_toc_from_text([d["content_with_weight"] for d in docs], chat_mdl, progress_callback)) logging.info("------------ T O C -------------\n" + json.dumps(toc, ensure_ascii=False, indent=' ')) - ii = 0 - while ii < len(toc): + for ii, item in enumerate(toc): try: - idx = int(toc[ii]["chunk_id"]) - del toc[ii]["chunk_id"] - toc[ii]["ids"] = [docs[idx]["id"]] - if ii == len(toc) - 1: - break - for jj in range(idx + 1, int(toc[ii + 1]["chunk_id"]) + 1): - toc[ii]["ids"].append(docs[jj]["id"]) + chunk_val = item.pop("chunk_id", None) + if chunk_val is None or str(chunk_val).strip() == "": + logging.warning(f"Index {ii}: chunk_id is missing or empty. Skipping.") + continue + curr_idx = int(chunk_val) + if curr_idx >= len(docs): + logging.error(f"Index {ii}: chunk_id {curr_idx} exceeds docs length {len(docs)}.") + continue + item["ids"] = [docs[curr_idx]["id"]] + if ii + 1 < len(toc): + next_chunk_val = toc[ii + 1].get("chunk_id", "") + if str(next_chunk_val).strip() != "": + next_idx = int(next_chunk_val) + for jj in range(curr_idx + 1, min(next_idx + 1, len(docs))): + item["ids"].append(docs[jj]["id"]) + else: + logging.warning(f"Index {ii + 1}: next chunk_id is empty, range fill skipped.") + except (ValueError, TypeError) as e: + logging.error(f"Index {ii}: Data conversion error - {e}") except Exception as e: - logging.exception(e) - ii += 1 + logging.exception(f"Index {ii}: Unexpected error - {e}") if toc: d = copy.deepcopy(docs[-1])