Feat: TOC retrieval (#10456)

### What problem does this PR solve? #10436 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-02-03 09:05:07 +08:00 · 2025-10-10 17:07:55 +08:00
parent 5d167cd772
commit 0d8791936e
12 changed files with 251 additions and 90 deletions
--- a/rag/svr/task_executor.py
+++ b/rag/svr/task_executor.py
@ -32,7 +32,7 @@ from api.utils.log_utils import init_root_logger, get_project_base_directory
 from graphrag.general.index import run_graphrag_for_kb
 from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
 from rag.flow.pipeline import Pipeline
-from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging
+from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging, run_toc_from_text
 import logging
 import os
 from datetime import datetime
@ -370,6 +370,38 @@ async def build_chunks(task, progress_callback):
                nursery.start_soon(doc_question_proposal, chat_mdl, d, task["parser_config"]["auto_questions"])
        progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))

+    if task["parser_config"].get("toc_extraction", True):
+        progress_callback(msg="Start to generate table of content ...")
+        chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
+        docs = sorted(docs, key=lambda d:(
+            d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
+            d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
+        ))
+        toc: list[dict] = await run_toc_from_text([d["content_with_weight"] for d in docs], chat_mdl)
+        logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent='  '))
+        ii = 0
+        while ii < len(toc):
+            try:
+                idx = int(toc[ii]["chunk_id"])
+                del toc[ii]["chunk_id"]
+                toc[ii]["ids"] = [docs[idx]["id"]]
+                if ii == len(toc) -1:
+                    break
+                for jj in range(idx+1, int(toc[ii+1]["chunk_id"])):
+                    toc[ii]["ids"].append(docs[jj]["id"])
+            except Exception as e:
+                logging.exception(e)
+            ii += 1
+
+        if toc:
+            d = copy.deepcopy(docs[-1])
+            d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
+            d["toc_kwd"] = "toc"
+            d["available_int"] = 0
+            d["page_num_int"] = 100000000
+            d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
+            docs.append(d)
+
    if task["kb_parser_config"].get("tag_kb_ids", []):
        progress_callback(msg="Start to tag for every chunk ...")
        kb_ids = task["kb_parser_config"]["tag_kb_ids"]