Feat: TOC retrieval (#10456)

### What problem does this PR solve?

#10436

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Kevin Hu
2025-10-10 17:07:55 +08:00
committed by GitHub
parent 5d167cd772
commit 0d8791936e
12 changed files with 251 additions and 90 deletions

View File

@ -57,6 +57,7 @@ class RetrievalParam(ToolParamBase):
self.empty_response = "" self.empty_response = ""
self.use_kg = False self.use_kg = False
self.cross_languages = [] self.cross_languages = []
self.toc_enhance = False
def check(self): def check(self):
self.check_decimal_float(self.similarity_threshold, "[Retrieval] Similarity threshold") self.check_decimal_float(self.similarity_threshold, "[Retrieval] Similarity threshold")
@ -134,6 +135,11 @@ class Retrieval(ToolBase, ABC):
rerank_mdl=rerank_mdl, rerank_mdl=rerank_mdl,
rank_feature=label_question(query, kbs), rank_feature=label_question(query, kbs),
) )
if self._param.toc_enhance:
chat_mdl = LLMBundle(self._canvas._tenant_id, LLMType.CHAT)
cks = settings.retriever.retrieval_by_toc(query, kbinfos["chunks"], [kb.tenant_id for kb in kbs], chat_mdl, self._param.top_n)
if cks:
kbinfos["chunks"] = cks
if self._param.use_kg: if self._param.use_kg:
ck = settings.kg_retriever.retrieval(query, ck = settings.kg_retriever.retrieval(query,
[kb.tenant_id for kb in kbs], [kb.tenant_id for kb in kbs],

View File

@ -466,6 +466,10 @@ def chat(dialog, messages, stream=True, **kwargs):
rerank_mdl=rerank_mdl, rerank_mdl=rerank_mdl,
rank_feature=label_question(" ".join(questions), kbs), rank_feature=label_question(" ".join(questions), kbs),
) )
if prompt_config.get("toc_enhance"):
cks = retriever.retrieval_by_toc(" ".join(questions), kbinfos["chunks"], tenant_ids, chat_mdl, dialog.top_n)
if cks:
kbinfos["chunks"] = cks
if prompt_config.get("tavily_api_key"): if prompt_config.get("tavily_api_key"):
tav = Tavily(prompt_config["tavily_api_key"]) tav = Tavily(prompt_config["tavily_api_key"])
tav_res = tav.retrieve_chunks(" ".join(questions)) tav_res = tav.retrieve_chunks(" ".join(questions))

View File

@ -51,9 +51,6 @@ from api import settings
from api.constants import REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC from api.constants import REQUEST_MAX_WAIT_SEC, REQUEST_WAIT_SEC
from api.db import ActiveEnum from api.db import ActiveEnum
from api.db.db_models import APIToken from api.db.db_models import APIToken
from api.db.services import UserService
from api.db.services.llm_service import LLMService
from api.db.services.tenant_llm_service import TenantLLMService
from api.utils.json import CustomJSONEncoder, json_dumps from api.utils.json import CustomJSONEncoder, json_dumps
from api.utils import get_uuid from api.utils import get_uuid
from rag.utils.mcp_tool_call_conn import MCPToolCallSession, close_multiple_mcp_toolcall_sessions from rag.utils.mcp_tool_call_conn import MCPToolCallSession, close_multiple_mcp_toolcall_sessions
@ -239,6 +236,7 @@ def not_allowed_parameters(*params):
def active_required(f): def active_required(f):
@wraps(f) @wraps(f)
def wrapper(*args, **kwargs): def wrapper(*args, **kwargs):
from api.db.services import UserService
user_id = current_user.id user_id = current_user.id
usr = UserService.filter_by_id(user_id) usr = UserService.filter_by_id(user_id)
# check is_active # check is_active
@ -544,6 +542,8 @@ def check_duplicate_ids(ids, id_type="item"):
def verify_embedding_availability(embd_id: str, tenant_id: str) -> tuple[bool, Response | None]: def verify_embedding_availability(embd_id: str, tenant_id: str) -> tuple[bool, Response | None]:
from api.db.services.llm_service import LLMService
from api.db.services.tenant_llm_service import TenantLLMService
""" """
Verifies availability of an embedding model for a specific tenant. Verifies availability of an embedding model for a specific tenant.

View File

@ -1048,7 +1048,7 @@ class RAGFlowPdfParser:
def parse_into_bboxes(self, fnm, callback=None, zoomin=3): def parse_into_bboxes(self, fnm, callback=None, zoomin=3):
start = timer() start = timer()
self.__images__(fnm, zoomin) self.__images__(fnm, zoomin, callback=callback)
if callback: if callback:
callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start)) callback(0.40, "OCR finished ({:.2f}s)".format(timer() - start))

View File

@ -92,10 +92,7 @@ def dict_has_keys_with_types(data: dict, expected_fields: list[tuple[str, type]]
def get_llm_cache(llmnm, txt, history, genconf): def get_llm_cache(llmnm, txt, history, genconf):
hasher = xxhash.xxh64() hasher = xxhash.xxh64()
hasher.update(str(llmnm).encode("utf-8")) hasher.update((str(llmnm)+str(txt)+str(history)+str(genconf)).encode("utf-8"))
hasher.update(str(txt).encode("utf-8"))
hasher.update(str(history).encode("utf-8"))
hasher.update(str(genconf).encode("utf-8"))
k = hasher.hexdigest() k = hasher.hexdigest()
bin = REDIS_CONN.get(k) bin = REDIS_CONN.get(k)
@ -106,11 +103,7 @@ def get_llm_cache(llmnm, txt, history, genconf):
def set_llm_cache(llmnm, txt, v, history, genconf): def set_llm_cache(llmnm, txt, v, history, genconf):
hasher = xxhash.xxh64() hasher = xxhash.xxh64()
hasher.update(str(llmnm).encode("utf-8")) hasher.update((str(llmnm)+str(txt)+str(history)+str(genconf)).encode("utf-8"))
hasher.update(str(txt).encode("utf-8"))
hasher.update(str(history).encode("utf-8"))
hasher.update(str(genconf).encode("utf-8"))
k = hasher.hexdigest() k = hasher.hexdigest()
REDIS_CONN.set(k, v.encode("utf-8"), 24 * 3600) REDIS_CONN.set(k, v.encode("utf-8"), 24 * 3600)

View File

@ -366,6 +366,7 @@ class Parser(ProcessBase):
email_content = {} email_content = {}
conf = self._param.setups["email"] conf = self._param.setups["email"]
self.set_output("output_format", conf["output_format"])
target_fields = conf["fields"] target_fields = conf["fields"]
_, ext = os.path.splitext(name) _, ext = os.path.splitext(name)
@ -442,8 +443,9 @@ class Parser(ProcessBase):
} }
# get body # get body
if "body" in target_fields: if "body" in target_fields:
email_content["text"] = msg.body # usually empty. try text_html instead email_content["text"] = msg.body[0] if isinstance(msg.body, list) and msg.body else msg.body
email_content["text_html"] = msg.htmlBody if not email_content["text"] and msg.htmlBody:
email_content["text"] = msg.htmlBody[0] if isinstance(msg.htmlBody, list) and msg.htmlBody else msg.htmlBody
# get attachments # get attachments
if "attachments" in target_fields: if "attachments" in target_fields:
attachments = [] attachments = []

View File

@ -132,8 +132,7 @@ class Base(ABC):
"tool_choice", "tool_choice",
"logprobs", "logprobs",
"top_logprobs", "top_logprobs",
"extra_headers", "extra_headers"
"enable_thinking"
} }
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf} gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}

View File

@ -13,12 +13,14 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# #
import json
import logging import logging
import re import re
import math import math
from collections import OrderedDict from collections import OrderedDict
from dataclasses import dataclass from dataclasses import dataclass
from rag.prompts.generator import relevant_chunks_with_toc
from rag.settings import TAG_FLD, PAGERANK_FLD from rag.settings import TAG_FLD, PAGERANK_FLD
from rag.utils import rmSpace, get_float from rag.utils import rmSpace, get_float
from rag.nlp import rag_tokenizer, query from rag.nlp import rag_tokenizer, query
@ -514,3 +516,63 @@ class Dealer:
tag_fea = sorted([(a, round(0.1*(c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs], tag_fea = sorted([(a, round(0.1*(c + 1) / (cnt + S) / max(1e-6, all_tags.get(a, 0.0001)))) for a, c in aggs],
key=lambda x: x[1] * -1)[:topn_tags] key=lambda x: x[1] * -1)[:topn_tags]
return {a.replace(".", "_"): max(1, c) for a, c in tag_fea} return {a.replace(".", "_"): max(1, c) for a, c in tag_fea}
def retrieval_by_toc(self, query:str, chunks:list[dict], tenant_ids:list[str], chat_mdl, topn: int=6):
    """Re-rank and extend retrieved chunks using a document's stored TOC.

    The document whose chunks have the highest summed ``similarity`` is
    picked, its TOC record(s) (stored with ``toc_kwd == "toc"``) are loaded
    from the data store, and ``relevant_chunks_with_toc`` is asked which
    chunk ids are relevant to ``query``. Chunks already present get their
    ``similarity`` boosted; chunks not yet present are fetched and appended.

    Note that ``chunks`` is modified in place (scores are incremented and
    new entries are appended to the list that was passed in).

    Args:
        query: The user question.
        chunks: Retrieved chunk dicts; each must provide ``doc_id``,
            ``kb_id``, ``chunk_id`` and ``similarity``.
        tenant_ids: Tenant ids used to derive the index names to search.
        chat_mdl: Chat model handed to ``relevant_chunks_with_toc``.
        topn: Maximum number of chunks to return.

    Returns:
        ``[]`` when ``chunks`` is empty; ``chunks`` itself when no TOC is
        stored or no TOC entry is judged relevant; otherwise the ``topn``
        chunks with the highest ``similarity``, best first.
    """
    if not chunks:
        return []

    idx_nms = [index_name(tid) for tid in tenant_ids]

    # Sum the similarity per document to find the dominant document.
    ranks, doc_id2kb_id = {}, {}
    for ck in chunks:
        ranks[ck["doc_id"]] = ranks.get(ck["doc_id"], 0) + ck["similarity"]
        doc_id2kb_id[ck["doc_id"]] = ck["kb_id"]
    # max() returns the first maximal entry, matching a stable descending sort.
    doc_id = max(ranks.items(), key=lambda x: x[1])[0]
    kb_ids = [doc_id2kb_id[doc_id]]

    es_res = self.dataStore.search(["content_with_weight"], [], {"doc_id": doc_id, "toc_kwd": "toc"}, [], OrderByExpr(), 0, 128, idx_nms,
                                   kb_ids)
    toc = []
    dict_chunks = self.dataStore.getFields(es_res, ["content_with_weight"])
    for doc in dict_chunks.values():
        try:
            toc.extend(json.loads(doc["content_with_weight"]))
        except Exception as e:
            # A malformed TOC record must not break retrieval; skip it.
            logging.exception(e)
    if not toc:
        return chunks

    ids = relevant_chunks_with_toc(query, toc, chat_mdl, topn*2)
    if not ids:
        return chunks

    # Length of the zero placeholder vector used when a fetched chunk
    # carries no "*_vec" field; updated once a real vector has been seen.
    vector_size = 1024
    id2idx = {ck["chunk_id"]: i for i, ck in enumerate(chunks)}
    for cid, sim in ids:
        if cid in id2idx:
            chunks[id2idx[cid]]["similarity"] += sim
            continue
        chunk = self.dataStore.get(cid, idx_nms, kb_ids)
        if not chunk:
            # The TOC may reference a chunk that is no longer stored;
            # skip it instead of failing the whole retrieval.
            logging.warning("TOC referenced chunk %s could not be loaded; skipped.", cid)
            continue
        d = {
            "chunk_id": cid,
            "content_ltks": chunk["content_ltks"],
            "content_with_weight": chunk["content_with_weight"],
            "doc_id": doc_id,
            "docnm_kwd": chunk.get("docnm_kwd", ""),
            "kb_id": chunk["kb_id"],
            "important_kwd": chunk.get("important_kwd", []),
            "image_id": chunk.get("img_id", ""),
            "similarity": sim,
            "vector_similarity": sim,
            "term_similarity": sim,
            "vector": [0.0] * vector_size,
            "positions": chunk.get("position_int", []),
            "doc_type_kwd": chunk.get("doc_type_kwd", "")
        }
        for k in chunk.keys():
            if k[-4:] == "_vec":
                d["vector"] = chunk[k]
                vector_size = len(chunk[k])
                break
        chunks.append(d)

    return sorted(chunks, key=lambda x: x["similarity"], reverse=True)[:topn]

View File

@ -1,4 +1,4 @@
You are given a JSON array of TOC items. Each item has at least {"title": string} and may include an existing structure. You are given a JSON array of TOC (table of contents) items. Each item has at least {"title": string} and may include an existing title hierarchical level.
Task Task
- For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc. - For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc.
@ -9,7 +9,7 @@ Task
Output Output
- Return a valid JSON array only (no extra text). - Return a valid JSON array only (no extra text).
- Each element must be {"structure": "1|2|3", "title": <original title string>}. - Each element must be {"level": "1|2|3", "title": <original title string>}.
- title must be the original title string. - title must be the original title string.
Examples Examples
@ -20,10 +20,10 @@ Input:
Output: Output:
[ [
{"structure":"1","title":"Chapter 1 Methods"}, {"level":"1","title":"Chapter 1 Methods"},
{"structure":"2","title":"Section 1 Definition"}, {"level":"2","title":"Section 1 Definition"},
{"structure":"2","title":"Section 2 Process"}, {"level":"2","title":"Section 2 Process"},
{"structure":"1","title":"Chapter 2 Experiment"} {"level":"1","title":"Chapter 2 Experiment"}
] ]
Example B (parts with chapters) Example B (parts with chapters)
@ -32,11 +32,11 @@ Input:
Output: Output:
[ [
{"structure":"1","title":"Part I Theory"}, {"level":"1","title":"Part I Theory"},
{"structure":"2","title":"Chapter 1 Basics"}, {"level":"2","title":"Chapter 1 Basics"},
{"structure":"2","title":"Chapter 2 Methods"}, {"level":"2","title":"Chapter 2 Methods"},
{"structure":"1","title":"Part II Applications"}, {"level":"1","title":"Part II Applications"},
{"structure":"2","title":"Chapter 3 Case Studies"} {"level":"2","title":"Chapter 3 Case Studies"}
] ]
Example C (plain headings) Example C (plain headings)
@ -45,9 +45,9 @@ Input:
Output: Output:
[ [
{"structure":"1","title":"Introduction"}, {"level":"1","title":"Introduction"},
{"structure":"2","title":"Background and Motivation"}, {"level":"2","title":"Background and Motivation"},
{"structure":"2","title":"Related Work"}, {"level":"2","title":"Related Work"},
{"structure":"1","title":"Methodology"}, {"level":"1","title":"Methodology"},
{"structure":"1","title":"Evaluation"} {"level":"1","title":"Evaluation"}
] ]

View File

@ -21,7 +21,9 @@ from copy import deepcopy
from typing import Tuple from typing import Tuple
import jinja2 import jinja2
import json_repair import json_repair
import trio
from api.utils import hash_str2int from api.utils import hash_str2int
from rag.nlp import is_chinese
from rag.prompts.template import load_prompt from rag.prompts.template import load_prompt
from rag.settings import TAG_FLD from rag.settings import TAG_FLD
from rag.utils import encoder, num_tokens_from_string from rag.utils import encoder, num_tokens_from_string
@ -440,11 +442,17 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None): def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
from graphrag.utils import get_llm_cache, set_llm_cache
cached = get_llm_cache(chat_mdl.llm_name, system_prompt, user_prompt, gen_conf)
if cached:
return json_repair.loads(cached)
_, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length) _, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
ans = chat_mdl.chat(msg[0]["content"], msg[1:],gen_conf=gen_conf) ans = chat_mdl.chat(msg[0]["content"], msg[1:],gen_conf=gen_conf)
ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL) ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
try: try:
return json_repair.loads(ans) res = json_repair.loads(ans)
set_llm_cache(chat_mdl.llm_name, system_prompt, ans, user_prompt, gen_conf)
return res
except Exception: except Exception:
logging.exception(f"Loading json failure: {ans}") logging.exception(f"Loading json failure: {ans}")
@ -651,29 +659,31 @@ def toc_transformer(toc_pages, chat_mdl):
TOC_LEVELS = load_prompt("assign_toc_levels") TOC_LEVELS = load_prompt("assign_toc_levels")
def assign_toc_levels(toc_secs, chat_mdl, gen_conf = {"temperature": 0.2}): def assign_toc_levels(toc_secs, chat_mdl, gen_conf = {"temperature": 0.2}):
print("\nBegin TOC level assignment...\n") if not toc_secs:
return []
ans = gen_json( return gen_json(
PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(), PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(),
str(toc_secs), str(toc_secs),
chat_mdl, chat_mdl,
gen_conf gen_conf
) )
return ans
TOC_FROM_TEXT_SYSTEM = load_prompt("toc_from_text_system") TOC_FROM_TEXT_SYSTEM = load_prompt("toc_from_text_system")
TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user") TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")
# Generate TOC from text chunks with text llms # Generate TOC from text chunks with text llms
def gen_toc_from_text(text, chat_mdl): async def gen_toc_from_text(txt_info: dict, chat_mdl):
ans = gen_json( try:
PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(), ans = gen_json(
PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text=text), PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
chat_mdl, PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text="\n".join([json.dumps(d, ensure_ascii=False) for d in txt_info["chunks"]])),
gen_conf={"temperature": 0.0, "top_p": 0.9, "enable_thinking": False, } chat_mdl,
) gen_conf={"temperature": 0.0, "top_p": 0.9}
return ans )
print(ans, "::::::::::::::::::::::::::::::::::::", flush=True)
txt_info["toc"] = ans if ans else []
except Exception as e:
logging.exception(e)
def split_chunks(chunks, max_length: int): def split_chunks(chunks, max_length: int):
@ -690,44 +700,91 @@ def split_chunks(chunks, max_length: int):
if batch_tokens + t > max_length: if batch_tokens + t > max_length:
result.append(batch) result.append(batch)
batch, batch_tokens = [], 0 batch, batch_tokens = [], 0
batch.append({"id": idx, "text": chunk}) batch.append({idx: chunk})
batch_tokens += t batch_tokens += t
if batch: if batch:
result.append(batch) result.append(batch)
return result return result
def run_toc_from_text(chunks, chat_mdl): async def run_toc_from_text(chunks, chat_mdl):
input_budget = int(chat_mdl.max_length * INPUT_UTILIZATION) - num_tokens_from_string( input_budget = int(chat_mdl.max_length * INPUT_UTILIZATION) - num_tokens_from_string(
TOC_FROM_TEXT_USER + TOC_FROM_TEXT_SYSTEM TOC_FROM_TEXT_USER + TOC_FROM_TEXT_SYSTEM
) )
input_budget = 2000 if input_budget > 2000 else input_budget input_budget = 1024 if input_budget > 1024 else input_budget
chunk_sections = split_chunks(chunks, input_budget) chunk_sections = split_chunks(chunks, input_budget)
res = [] res = []
for chunk in chunk_sections: chunks_res = []
ans = gen_toc_from_text(chunk, chat_mdl) async with trio.open_nursery() as nursery:
res.extend(ans) for i, chunk in enumerate(chunk_sections):
if not chunk:
continue
chunks_res.append({"chunks": chunk})
nursery.start_soon(gen_toc_from_text, chunks_res[-1], chat_mdl)
for chunk in chunks_res:
res.extend(chunk.get("toc", []))
print(res, ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
# Filter out entries with title == -1 # Filter out entries with title == -1
filtered = [x for x in res if x.get("title") and x.get("title") != "-1"] filtered = []
for x in res:
if not x.get("title") or x["title"] == "-1":
continue
if is_chinese(x["title"]) and len(x["title"]) > 12:
continue
if len(x["title"].split(" ")) > 12:
continue
if re.match(r"[0-9,.()/ -]+$", x["title"]):
continue
filtered.append(x)
print("\n\nFiltered TOC sections:\n", filtered) logging.info(f"\n\nFiltered TOC sections:\n{filtered}")
# Generate initial structure (structure/title) # Generate initial level (level/title)
raw_structure = [{"structure": "0", "title": x.get("title", "")} for x in filtered] raw_structure = [x.get("title", "") for x in filtered]
# Assign hierarchy levels using LLM # Assign hierarchy levels using LLM
toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9, "enable_thinking": False}) toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9})
# Merge structure and content (by index) # Merge structure and content (by index)
merged = [] merged = []
for _ , (toc_item, src_item) in enumerate(zip(toc_with_levels, filtered)): for _ , (toc_item, src_item) in enumerate(zip(toc_with_levels, filtered)):
merged.append({ merged.append({
"structure": toc_item.get("structure", "0"), "level": toc_item.get("level", "0"),
"title": toc_item.get("title", ""), "title": toc_item.get("title", ""),
"content": src_item.get("content", ""), "chunk_id": src_item.get("chunk_id", ""),
}) })
return merged return merged
TOC_RELEVANCE_SYSTEM = load_prompt("toc_relevance_system")
TOC_RELEVANCE_USER = load_prompt("toc_relevance_user")
def relevant_chunks_with_toc(query: str, toc:list[dict], chat_mdl, topn: int=6):
    """Ask the chat model which TOC entries are relevant and score their chunks.

    The ``level`` and ``title`` of every TOC entry are rendered into the TOC
    relevance prompt. The answer is paired with ``toc`` by position; an
    answer item whose ``score`` is below 1 is ignored. Each remaining score
    is divided by 5 and attributed to every chunk id listed under the
    entry's ``ids``; a chunk id referenced by several entries gets the mean.

    Args:
        query: The user question.
        toc: TOC entries; each must provide ``level`` and ``title`` and may
            provide ``ids`` (the chunk ids covered by the entry).
        chat_mdl: Chat model passed through to ``gen_json``.
        topn: Maximum number of ``(chunk_id, score)`` pairs to return.

    Returns:
        Up to ``topn`` ``(chunk_id, score)`` pairs with ``score >= 0.3``,
        highest score first. ``[]`` when ``toc`` is empty, the model gives
        no usable answer, or any error occurs (the error is logged).
    """
    if not toc:
        # Nothing to rank; avoid a pointless model call.
        return []

    import numpy as np
    try:
        toc_json = "[\n%s\n]\n" % "\n".join(
            [json.dumps({"level": d["level"], "title": d["title"]}, ensure_ascii=False) for d in toc]
        )
        ans = gen_json(
            PROMPT_JINJA_ENV.from_string(TOC_RELEVANCE_SYSTEM).render(),
            PROMPT_JINJA_ENV.from_string(TOC_RELEVANCE_USER).render(query=query, toc_json=toc_json),
            chat_mdl,
            gen_conf={"temperature": 0.0, "top_p": 0.9}
        )
        logging.debug("TOC relevance answer: %s", ans)
        if not ans:
            return []

        id2scores = {}
        for ti, sc in zip(toc, ans):
            if sc.get("score", -1) < 1:
                continue
            for cid in ti.get("ids", []):
                id2scores.setdefault(cid, []).append(sc["score"]/5.)

        scored = [(cid, np.mean(scs)) for cid, scs in id2scores.items()]
        scored = [(cid, sc) for cid, sc in scored if sc >= 0.3]
        # Rank before truncating so the cut keeps the best-scoring ids
        # rather than whichever ones happen to appear first in the TOC.
        scored.sort(key=lambda x: x[1], reverse=True)
        return scored[:topn]
    except Exception as e:
        logging.exception(e)
    return []

View File

@ -1,25 +1,25 @@
You are a robust Table-of-Contents (TOC) extractor. You are a robust Table-of-Contents (TOC) extractor.
GOAL GOAL
Given a dictionary of chunks {chunk_id: chunk_text}, extract TOC-like headings and return a strict JSON array of objects: Given a dictionary of chunks {"<chunk_ID>": chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
[ [
{"title": , "content": ""}, {"title": "", "chunk_id": ""},
... ...
] ]
FIELDS FIELDS
- "title": the heading text (clean, no page numbers or leader dots). - "title": the heading text (clean, no page numbers or leader dots).
- If any part of a chunk has no valid heading, output that part as {"title":"-1", ...}. - If any part of a chunk has no valid heading, output that part as {"title":"-1", ...}.
- "content": the chunk_id (string). - "chunk_id": the chunk ID (string).
- One chunk can yield multiple JSON objects in order (unmatched text + one or more headings). - One chunk can yield multiple JSON objects in order (unmatched text + one or more headings).
RULES RULES
1) Preserve input chunk order strictly. 1) Preserve input chunk order strictly.
2) If a chunk contains multiple headings, expand them in order: 2) If a chunk contains multiple headings, expand them in order:
- Pre-heading narrative → {"title":"-1","content":chunk_id} - Pre-heading narrative → {"title":"-1","chunk_id":"<chunk_ID>"}
- Then each heading → {"title":"...","content":chunk_id} - Then each heading → {"title":"...","chunk_id":"<chunk_ID>"}
3) Do not merge outputs across chunks; each object refers to exactly one chunk_id. 3) Do not merge outputs across chunks; each object refers to exactly one chunk ID.
4) "title" must be non-empty (or exactly "-1"). "content" must be a string (chunk_id). 4) "title" must be non-empty (or exactly "-1"). "chunk_id" must be a string (chunk ID).
5) When ambiguous, prefer "-1" unless the text strongly looks like a heading. 5) When ambiguous, prefer "-1" unless the text strongly looks like a heading.
HEADING DETECTION (cues, not hard rules) HEADING DETECTION (cues, not hard rules)
@ -51,63 +51,69 @@ EXAMPLES
Example 1 — No heading Example 1 — No heading
Input: Input:
{0: "Copyright page · Publication info (ISBN 123-456). All rights reserved."} [{"0": "Copyright page · Publication info (ISBN 123-456). All rights reserved."}, ...]
Output: Output:
[ [
{"title":"-1","content":"0"} {"title":"-1","chunk_id":"0"},
...
] ]
Example 2 — One heading Example 2 — One heading
Input: Input:
{1: "Chapter 1: General Provisions This chapter defines the overall rules…"} [{"1": "Chapter 1: General Provisions This chapter defines the overall rules…"}, ...]
Output: Output:
[ [
{"title":"Chapter 1: General Provisions","content":"1"} {"title":"Chapter 1: General Provisions","chunk_id":"1"},
...
] ]
Example 3 — Narrative + heading Example 3 — Narrative + heading
Input: Input:
{2: "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"} [{"2": "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"}, ...]
Output: Output:
[ [
{"title":"-1","content":"2"}, {"title":"Section 2: Definitions","chunk_id":"2"},
{"title":"Section 2: Definitions","content":"2"} ...
] ]
Example 4 — Multiple headings in one chunk Example 4 — Multiple headings in one chunk
Input: Input:
{3: "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"} [{"3": "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"}, ...]
Output: Output:
[ [
{"title":"Declarations and Commitments (I)","content":"3"}, {"title":"Declarations and Commitments","chunk_id":"3"},
{"title":"(II)","content":"3"}, {"title":"(I) Party B commits","chunk_id":"3"},
{"title":"Appendix A","content":"3"} {"title":"(II) Party C commits","chunk_id":"3"},
{"title":"Appendix A Data Specification","chunk_id":"3"},
...
] ]
Example 5 — Numbering styles Example 5 — Numbering styles
Input: Input:
{4: "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."} [{"4": "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."}, ...]
Output: Output:
[ [
{"title":"1. Scope","content":"4"}, {"title":"1. Scope","chunk_id":"4"},
{"title":"2) Definitions","content":"4"}, {"title":"2) Definitions","chunk_id":"4"},
{"title":"III) Methods","content":"4"} {"title":"III) Methods Overview","chunk_id":"4"},
...
] ]
Example 6 — Long list (NOT headings) Example 6 — Long list (NOT headings)
Input: Input:
{5: "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"} [{"5": "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"}, ...]
Output: Output:
[ [
{"title":"-1","content":"5"} {"title":"-1","chunk_id":"5"},
...
] ]
Example 7 — Mixed Chinese/English Example 7 — Mixed Chinese/English
Input: Input:
{6: "出版信息略This standard follows industry practices. Chapter 1: Overview 摘要… 第2节术语与缩略语"} [{"6": "出版信息略This standard follows industry practices. Chapter 1: Overview 摘要… 第2节术语与缩略语"}, ...]
Output: Output:
[ [
{"title":"-1","content":"6"}, {"title":"Chapter 1: Overview","chunk_id":"6"},
{"title":"Chapter 1: Overview","content":"6"}, {"title":"第2节术语与缩略语","chunk_id":"6"},
{"title":"第2节术语与缩略语","content":"6"} ...
] ]

View File

@ -32,7 +32,7 @@ from api.utils.log_utils import init_root_logger, get_project_base_directory
from graphrag.general.index import run_graphrag_for_kb from graphrag.general.index import run_graphrag_for_kb
from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache from graphrag.utils import get_llm_cache, set_llm_cache, get_tags_from_cache, set_tags_to_cache
from rag.flow.pipeline import Pipeline from rag.flow.pipeline import Pipeline
from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging from rag.prompts.generator import keyword_extraction, question_proposal, content_tagging, run_toc_from_text
import logging import logging
import os import os
from datetime import datetime from datetime import datetime
@ -370,6 +370,38 @@ async def build_chunks(task, progress_callback):
nursery.start_soon(doc_question_proposal, chat_mdl, d, task["parser_config"]["auto_questions"]) nursery.start_soon(doc_question_proposal, chat_mdl, d, task["parser_config"]["auto_questions"])
progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st)) progress_callback(msg="Question generation {} chunks completed in {:.2f}s".format(len(docs), timer() - st))
if task["parser_config"].get("toc_extraction", True):
progress_callback(msg="Start to generate table of content ...")
chat_mdl = LLMBundle(task["tenant_id"], LLMType.CHAT, llm_name=task["llm_id"], lang=task["language"])
docs = sorted(docs, key=lambda d:(
d.get("page_num_int", 0)[0] if isinstance(d.get("page_num_int", 0), list) else d.get("page_num_int", 0),
d.get("top_int", 0)[0] if isinstance(d.get("top_int", 0), list) else d.get("top_int", 0)
))
toc: list[dict] = await run_toc_from_text([d["content_with_weight"] for d in docs], chat_mdl)
logging.info("------------ T O C -------------\n"+json.dumps(toc, ensure_ascii=False, indent=' '))
ii = 0
while ii < len(toc):
try:
idx = int(toc[ii]["chunk_id"])
del toc[ii]["chunk_id"]
toc[ii]["ids"] = [docs[idx]["id"]]
if ii == len(toc) -1:
break
for jj in range(idx+1, int(toc[ii+1]["chunk_id"])):
toc[ii]["ids"].append(docs[jj]["id"])
except Exception as e:
logging.exception(e)
ii += 1
if toc:
d = copy.deepcopy(docs[-1])
d["content_with_weight"] = json.dumps(toc, ensure_ascii=False)
d["toc_kwd"] = "toc"
d["available_int"] = 0
d["page_num_int"] = 100000000
d["id"] = xxhash.xxh64((d["content_with_weight"] + str(d["doc_id"])).encode("utf-8", "surrogatepass")).hexdigest()
docs.append(d)
if task["kb_parser_config"].get("tag_kb_ids", []): if task["kb_parser_config"].get("tag_kb_ids", []):
progress_callback(msg="Start to tag for every chunk ...") progress_callback(msg="Start to tag for every chunk ...")
kb_ids = task["kb_parser_config"]["tag_kb_ids"] kb_ids = task["kb_parser_config"]["tag_kb_ids"]