mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: TOC retrieval (#10456)
### What problem does this PR solve? #10436 ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -1,4 +1,4 @@
|
||||
You are given a JSON array of TOC items. Each item has at least {"title": string} and may include an existing structure.
|
||||
You are given a JSON array of TOC (table of contents) items. Each item has at least {"title": string} and may include an existing hierarchical level for the title.
|
||||
|
||||
Task
|
||||
- For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc.
|
||||
@ -9,7 +9,7 @@ Task
|
||||
|
||||
Output
|
||||
- Return a valid JSON array only (no extra text).
|
||||
- Each element must be {"structure": "1|2|3", "title": <original title string>}.
|
||||
- Each element must be {"level": "1|2|3", "title": <original title string>}.
|
||||
- title must be the original title string.
|
||||
|
||||
Examples
|
||||
@ -20,10 +20,10 @@ Input:
|
||||
|
||||
Output:
|
||||
[
|
||||
{"structure":"1","title":"Chapter 1 Methods"},
|
||||
{"structure":"2","title":"Section 1 Definition"},
|
||||
{"structure":"2","title":"Section 2 Process"},
|
||||
{"structure":"1","title":"Chapter 2 Experiment"}
|
||||
{"level":"1","title":"Chapter 1 Methods"},
|
||||
{"level":"2","title":"Section 1 Definition"},
|
||||
{"level":"2","title":"Section 2 Process"},
|
||||
{"level":"1","title":"Chapter 2 Experiment"}
|
||||
]
|
||||
|
||||
Example B (parts with chapters)
|
||||
@ -32,11 +32,11 @@ Input:
|
||||
|
||||
Output:
|
||||
[
|
||||
{"structure":"1","title":"Part I Theory"},
|
||||
{"structure":"2","title":"Chapter 1 Basics"},
|
||||
{"structure":"2","title":"Chapter 2 Methods"},
|
||||
{"structure":"1","title":"Part II Applications"},
|
||||
{"structure":"2","title":"Chapter 3 Case Studies"}
|
||||
{"level":"1","title":"Part I Theory"},
|
||||
{"level":"2","title":"Chapter 1 Basics"},
|
||||
{"level":"2","title":"Chapter 2 Methods"},
|
||||
{"level":"1","title":"Part II Applications"},
|
||||
{"level":"2","title":"Chapter 3 Case Studies"}
|
||||
]
|
||||
|
||||
Example C (plain headings)
|
||||
@ -45,9 +45,9 @@ Input:
|
||||
|
||||
Output:
|
||||
[
|
||||
{"structure":"1","title":"Introduction"},
|
||||
{"structure":"2","title":"Background and Motivation"},
|
||||
{"structure":"2","title":"Related Work"},
|
||||
{"structure":"1","title":"Methodology"},
|
||||
{"structure":"1","title":"Evaluation"}
|
||||
{"level":"1","title":"Introduction"},
|
||||
{"level":"2","title":"Background and Motivation"},
|
||||
{"level":"2","title":"Related Work"},
|
||||
{"level":"1","title":"Methodology"},
|
||||
{"level":"1","title":"Evaluation"}
|
||||
]
|
||||
@ -21,7 +21,9 @@ from copy import deepcopy
|
||||
from typing import Tuple
|
||||
import jinja2
|
||||
import json_repair
|
||||
import trio
|
||||
from api.utils import hash_str2int
|
||||
from rag.nlp import is_chinese
|
||||
from rag.prompts.template import load_prompt
|
||||
from rag.settings import TAG_FLD
|
||||
from rag.utils import encoder, num_tokens_from_string
|
||||
@ -440,11 +442,17 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
|
||||
|
||||
|
||||
def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
    """Ask ``chat_mdl`` for a JSON answer and return it parsed.

    If the LLM cache already holds a raw answer for this model name, prompt
    pair and ``gen_conf``, that answer is parsed and returned without calling
    the model. Otherwise the prompts are fitted to ``chat_mdl.max_length``,
    the model is called, and the reply is cleaned (everything up to the last
    ``</think>`` tag and the Markdown JSON fences are removed) before being
    parsed with ``json_repair``. A reply that parses is written to the cache.

    Returns:
        The parsed value, or ``None`` when parsing or caching raises (the
        failure is logged with the cleaned reply).
    """
    from graphrag.utils import get_llm_cache, set_llm_cache

    cache_hit = get_llm_cache(chat_mdl.llm_name, system_prompt, user_prompt, gen_conf)
    if cache_hit:
        return json_repair.loads(cache_hit)

    _, messages = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
    # First message carries the system prompt; the rest is the chat history.
    reply = chat_mdl.chat(messages[0]["content"], messages[1:], gen_conf=gen_conf)
    reply = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", reply, flags=re.DOTALL)

    try:
        parsed = json_repair.loads(reply)
        set_llm_cache(chat_mdl.llm_name, system_prompt, reply, user_prompt, gen_conf)
        return parsed
    except Exception:
        logging.exception(f"Loading json failure: {reply}")
|
||||
|
||||
@ -651,29 +659,31 @@ def toc_transformer(toc_pages, chat_mdl):
|
||||
|
||||
# Prompt text loaded under the name "assign_toc_levels"; assign_toc_levels() renders it and passes it to gen_json() as the system prompt.
TOC_LEVELS = load_prompt("assign_toc_levels")
|
||||
def assign_toc_levels(toc_secs, chat_mdl, gen_conf=None):
    """Ask the chat model to assign a hierarchy level to each TOC title.

    Args:
        toc_secs: The TOC entries to label; sent to the model as ``str(toc_secs)``.
        chat_mdl: Chat model handed through to ``gen_json``.
        gen_conf: Generation settings for the model call. When omitted,
            ``{"temperature": 0.2}`` is used.

    Returns:
        ``[]`` when ``toc_secs`` is empty (no model call is made); otherwise
        whatever ``gen_json`` returns for the rendered ``TOC_LEVELS`` prompt.
    """
    if not toc_secs:
        return []
    # Build the default per call: a dict literal in the signature would be one
    # shared object that any downstream mutation would change for every caller.
    if gen_conf is None:
        gen_conf = {"temperature": 0.2}
    return gen_json(
        PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(),
        str(toc_secs),
        chat_mdl,
        gen_conf,
    )
|
||||
|
||||
|
||||
# Prompt text loaded under the name "toc_from_text_system"; rendered as the system prompt by gen_toc_from_text().
TOC_FROM_TEXT_SYSTEM = load_prompt("toc_from_text_system")
|
||||
# Prompt text loaded under the name "toc_from_text_user"; gen_toc_from_text() renders it with a `text` variable as the user prompt.
TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")
|
||||
# Generate TOC from text chunks with text llms
|
||||
async def gen_toc_from_text(txt_info: dict, chat_mdl):
    """Extract TOC entries for one batch of chunks and store them on ``txt_info``.

    Args:
        txt_info: Dict with a ``"chunks"`` list. Each chunk is serialised with
            ``json.dumps`` (one per line) and sent as the ``text`` variable of
            the user prompt. On success the result is written back to
            ``txt_info["toc"]``.
        chat_mdl: Chat model handed through to ``gen_json``.

    Returns:
        None. The result is delivered through ``txt_info["toc"]``. Any
        exception is logged and swallowed, leaving ``"toc"`` unset.
    """
    # NOTE(review): gen_json() is called without ``await``, so this coroutine
    # never yields while the model call runs; confirm whether tasks started
    # from a nursery are expected to overlap.
    try:
        chunk_lines = "\n".join(json.dumps(d, ensure_ascii=False) for d in txt_info["chunks"])
        ans = gen_json(
            PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
            PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text=chunk_lines),
            chat_mdl,
            gen_conf={"temperature": 0.0, "top_p": 0.9}
        )
        logging.debug("TOC extracted from text batch: %s", ans)
        # The prompt asks for a JSON array. Accept a lone object by wrapping
        # it and drop anything else, so "toc" is always a list.
        if isinstance(ans, dict):
            ans = [ans]
        txt_info["toc"] = ans if isinstance(ans, list) else []
    except Exception as e:
        logging.exception(e)
|
||||
|
||||
|
||||
def split_chunks(chunks, max_length: int):
|
||||
@ -690,44 +700,91 @@ def split_chunks(chunks, max_length: int):
|
||||
if batch_tokens + t > max_length:
|
||||
result.append(batch)
|
||||
batch, batch_tokens = [], 0
|
||||
batch.append({"id": idx, "text": chunk})
|
||||
batch.append({idx: chunk})
|
||||
batch_tokens += t
|
||||
if batch:
|
||||
result.append(batch)
|
||||
return result
|
||||
|
||||
|
||||
def _is_toc_title_candidate(title) -> bool:
    """Return True when ``title`` passes the heading filters used by run_toc_from_text."""
    # Missing, non-string and the "-1" (no heading) marker are never headings.
    if not isinstance(title, str) or not title or title == "-1":
        return False
    # Overlong text is treated as body text rather than a heading.
    if is_chinese(title) and len(title) > 12:
        return False
    if len(title.split(" ")) > 12:
        return False
    # Only digits and punctuation, e.g. page numbers or dates.
    if re.match(r"[0-9,.()/ -]+$", title):
        return False
    return True


async def run_toc_from_text(chunks, chat_mdl):
    """Build a levelled table of contents from text chunks using the chat model.

    The chunks are split into batches within a token budget (capped at 1024),
    one ``gen_toc_from_text`` task is started per non-empty batch inside a trio
    nursery, the extracted entries are filtered down to plausible headings, and
    ``assign_toc_levels`` is asked for a level per remaining title.

    Args:
        chunks: The text chunks, passed to ``split_chunks``.
        chat_mdl: Chat model; ``max_length`` is read for the token budget.

    Returns:
        A list of dicts with ``"level"``, ``"title"``, ``"content"`` and
        ``"chunk_id"`` keys, pairing level results with filtered entries by
        position. Empty when nothing usable was extracted.
    """
    input_budget = int(chat_mdl.max_length * INPUT_UTILIZATION) - num_tokens_from_string(
        TOC_FROM_TEXT_USER + TOC_FROM_TEXT_SYSTEM
    )
    input_budget = min(input_budget, 1024)
    chunk_sections = split_chunks(chunks, input_budget)

    # One job dict per non-empty batch; each task writes its result into
    # job["toc"], so list order (and therefore chunk order) is preserved.
    jobs = [{"chunks": section} for section in chunk_sections if section]
    async with trio.open_nursery() as nursery:
        for job in jobs:
            nursery.start_soon(gen_toc_from_text, job, chat_mdl)

    res = []
    for job in jobs:
        res.extend(job.get("toc", []))
    logging.debug("Raw TOC entries from text: %s", res)

    # Filter out entries with title == -1 and other non-heading entries.
    filtered = [
        x for x in res
        if isinstance(x, dict) and _is_toc_title_candidate(x.get("title"))
    ]

    logging.info(f"\n\nFiltered TOC sections:\n{filtered}")

    # Generate initial level (level/title)
    raw_structure = [x["title"] for x in filtered]

    # Assign hierarchy levels using LLM
    toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9})
    if not isinstance(toc_with_levels, list):
        # A failed or malformed model answer must not crash the merge below.
        toc_with_levels = []
    if len(toc_with_levels) != len(filtered):
        logging.warning(
            "TOC level assignment returned %d items for %d titles; merging by position up to the shorter length.",
            len(toc_with_levels), len(filtered),
        )

    # Merge structure and content (by index)
    merged = []
    for toc_item, src_item in zip(toc_with_levels, filtered):
        if not isinstance(toc_item, dict):
            toc_item = {}
        merged.append({
            "level": toc_item.get("level", "0"),
            # Fall back to the extracted title if the level answer omitted it.
            "title": toc_item.get("title", src_item["title"]),
            "content": src_item.get("content", ""),
            "chunk_id": src_item.get("chunk_id", ""),
        })

    return merged
|
||||
|
||||
|
||||
# Prompt text loaded under the name "toc_relevance_system"; rendered as the system prompt by relevant_chunks_with_toc().
TOC_RELEVANCE_SYSTEM = load_prompt("toc_relevance_system")
|
||||
# Prompt text loaded under the name "toc_relevance_user"; relevant_chunks_with_toc() renders it with `query` and `toc_json` variables as the user prompt.
TOC_RELEVANCE_USER = load_prompt("toc_relevance_user")
|
||||
def relevant_chunks_with_toc(query: str, toc:list[dict], chat_mdl, topn: int=6):
    """Score TOC entries against ``query`` with the chat model and return the best chunk ids.

    Each TOC entry's ``"level"`` and ``"title"`` are sent to the model; the
    answer is paired with ``toc`` by position. An entry scored below 1 is
    ignored. Every id listed under an entry's ``"ids"`` receives that entry's
    score divided by 5, and an id's final score is the mean over all entries
    that reference it.

    Args:
        query: The user question.
        toc: TOC entries; ``"level"`` and ``"title"`` are required, ``"ids"``
            is optional.
        chat_mdl: Chat model handed through to ``gen_json``.
        topn: Maximum number of results.

    Returns:
        Up to ``topn`` ``(id, score)`` tuples with score >= 0.3, highest score
        first. ``[]`` when ``toc`` is empty or when anything fails (the
        exception is logged).
    """
    import numpy as np

    if not toc:
        # Nothing to score; skip the model call.
        return []
    try:
        toc_json = "[\n%s\n]\n" % "\n".join(
            json.dumps({"level": d["level"], "title": d["title"]}, ensure_ascii=False) for d in toc
        )
        ans = gen_json(
            PROMPT_JINJA_ENV.from_string(TOC_RELEVANCE_SYSTEM).render(),
            PROMPT_JINJA_ENV.from_string(TOC_RELEVANCE_USER).render(query=query, toc_json=toc_json),
            chat_mdl,
            gen_conf={"temperature": 0.0, "top_p": 0.9}
        )
        logging.debug("TOC relevance scores: %s", ans)

        id2scores = {}
        for toc_item, scored in zip(toc, ans):
            score = scored.get("score", -1)
            if score < 1:
                continue
            for chunk_id in toc_item.get("ids", []):
                id2scores.setdefault(chunk_id, []).append(score / 5.)

        ranked = [(chunk_id, np.mean(scores)) for chunk_id, scores in id2scores.items()]
        ranked = [(chunk_id, sc) for chunk_id, sc in ranked if sc >= 0.3]
        # Sort before truncating so the cut keeps the highest-scoring ids;
        # the sort is stable, so ties stay in TOC order.
        ranked.sort(key=lambda pair: pair[1], reverse=True)
        return ranked[:topn]
    except Exception as e:
        logging.exception(e)
    return []
|
||||
|
||||
@ -1,25 +1,25 @@
|
||||
You are a robust Table-of-Contents (TOC) extractor.
|
||||
|
||||
GOAL
|
||||
Given a dictionary of chunks {chunk_id: chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
|
||||
Given a dictionary of chunks {"<chunk_ID>": chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
|
||||
[
|
||||
{"title": , "content": ""},
|
||||
{"title": "", "chunk_id": ""},
|
||||
...
|
||||
]
|
||||
|
||||
FIELDS
|
||||
- "title": the heading text (clean, no page numbers or leader dots).
|
||||
- If any part of a chunk has no valid heading, output that part as {"title":"-1", ...}.
|
||||
- "content": the chunk_id (string).
|
||||
- "chunk_id": the chunk ID (string).
|
||||
- One chunk can yield multiple JSON objects in order (unmatched text + one or more headings).
|
||||
|
||||
RULES
|
||||
1) Preserve input chunk order strictly.
|
||||
2) If a chunk contains multiple headings, expand them in order:
|
||||
- Pre-heading narrative → {"title":"-1","content":chunk_id}
|
||||
- Then each heading → {"title":"...","content":chunk_id}
|
||||
3) Do not merge outputs across chunks; each object refers to exactly one chunk_id.
|
||||
4) "title" must be non-empty (or exactly "-1"). "content" must be a string (chunk_id).
|
||||
- Pre-heading narrative → {"title":"-1","chunk_id":"<chunk_ID>"}
|
||||
- Then each heading → {"title":"...","chunk_id":"<chunk_ID>"}
|
||||
3) Do not merge outputs across chunks; each object refers to exactly one chunk ID.
|
||||
4) "title" must be non-empty (or exactly "-1"). "chunk_id" must be a string (chunk ID).
|
||||
5) When ambiguous, prefer "-1" unless the text strongly looks like a heading.
|
||||
|
||||
HEADING DETECTION (cues, not hard rules)
|
||||
@ -51,63 +51,69 @@ EXAMPLES
|
||||
|
||||
Example 1 — No heading
|
||||
Input:
|
||||
{0: "Copyright page · Publication info (ISBN 123-456). All rights reserved."}
|
||||
[{"0": "Copyright page · Publication info (ISBN 123-456). All rights reserved."}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"-1","content":"0"}
|
||||
{"title":"-1","chunk_id":"0"},
|
||||
...
|
||||
]
|
||||
|
||||
Example 2 — One heading
|
||||
Input:
|
||||
{1: "Chapter 1: General Provisions This chapter defines the overall rules…"}
|
||||
[{"1": "Chapter 1: General Provisions This chapter defines the overall rules…"}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"Chapter 1: General Provisions","content":"1"}
|
||||
{"title":"Chapter 1: General Provisions","chunk_id":"1"},
|
||||
...
|
||||
]
|
||||
|
||||
Example 3 — Narrative + heading
|
||||
Input:
|
||||
{2: "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"}
|
||||
[{"2": "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"-1","content":"2"},
|
||||
{"title":"Section 2: Definitions","content":"2"}
|
||||
{"title":"Section 2: Definitions","chunk_id":"2"},
|
||||
...
|
||||
]
|
||||
|
||||
Example 4 — Multiple headings in one chunk
|
||||
Input:
|
||||
{3: "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"}
|
||||
[{"3": "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"Declarations and Commitments (I)","content":"3"},
|
||||
{"title":"(II)","content":"3"},
|
||||
{"title":"Appendix A","content":"3"}
|
||||
{"title":"Declarations and Commitments","chunk_id":"3"},
|
||||
{"title":"(I) Party B commits","chunk_id":"3"},
|
||||
{"title":"(II) Party C commits","chunk_id":"3"},
|
||||
{"title":"Appendix A Data Specification","chunk_id":"3"},
|
||||
...
|
||||
]
|
||||
|
||||
Example 5 — Numbering styles
|
||||
Input:
|
||||
{4: "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."}
|
||||
[{"4": "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"1. Scope","content":"4"},
|
||||
{"title":"2) Definitions","content":"4"},
|
||||
{"title":"III) Methods","content":"4"}
|
||||
{"title":"1. Scope","chunk_id":"4"},
|
||||
{"title":"2) Definitions","chunk_id":"4"},
|
||||
{"title":"III) Methods Overview","chunk_id":"4"},
|
||||
...
|
||||
]
|
||||
|
||||
Example 6 — Long list (NOT headings)
|
||||
Input:
|
||||
{5: "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"}
|
||||
[{"5": "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"-1","content":"5"}
|
||||
{"title":"-1","chunk_id":"5"},
|
||||
...
|
||||
]
|
||||
|
||||
Example 7 — Mixed Chinese/English
|
||||
Input:
|
||||
{6: "(出版信息略)This standard follows industry practices. Chapter 1: Overview 摘要… 第2节:术语与缩略语"}
|
||||
[{"6": "(出版信息略)This standard follows industry practices. Chapter 1: Overview 摘要… 第2节:术语与缩略语"}, ...]
|
||||
Output:
|
||||
[
|
||||
{"title":"-1","content":"6"},
|
||||
{"title":"Chapter 1: Overview","content":"6"},
|
||||
{"title":"第2节:术语与缩略语","content":"6"}
|
||||
{"title":"Chapter 1: Overview","chunk_id":"6"},
|
||||
{"title":"第2节:术语与缩略语","chunk_id":"6"},
|
||||
...
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user