mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 12:32:30 +08:00
Feat: Adds a new feature that enables the LLM to extract a structured table of contents (TOC) directly from plain text. (#10428)
### What problem does this PR solve? **Adds a new feature that enables the LLM to extract a structured table of contents (TOC) directly from plain text.** _This implementation prioritizes efficiency over reasoning — the model runs in a strictly deterministic mode (thinking disabled) to minimize latency. As a result, overall performance may be less optimal, but the extraction speed and consistency are guaranteed._ ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -133,6 +133,7 @@ class Base(ABC):
|
|||||||
"logprobs",
|
"logprobs",
|
||||||
"top_logprobs",
|
"top_logprobs",
|
||||||
"extra_headers",
|
"extra_headers",
|
||||||
|
"enable_thinking"
|
||||||
}
|
}
|
||||||
|
|
||||||
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
|
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
|
||||||
|
|||||||
53
rag/prompts/assign_toc_levels.md
Normal file
53
rag/prompts/assign_toc_levels.md
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
You are given a JSON array of TOC items. Each item has at least {"title": string} and may include an existing structure.
|
||||||
|
|
||||||
|
Task
|
||||||
|
- For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc.
|
||||||
|
- Multiple items may share the same depth (e.g., many 1s, many 2s).
|
||||||
|
- Do not use dotted numbering (no 1.1/1.2). Use a single digit string per item indicating its depth only.
|
||||||
|
- Preserve the original item order exactly. Do not insert, delete, or reorder.
|
||||||
|
- Decide levels yourself to keep a coherent hierarchy. Keep peers at the same depth.
|
||||||
|
|
||||||
|
Output
|
||||||
|
- Return a valid JSON array only (no extra text).
|
||||||
|
- Each element must be {"structure": "1|2|3", "title": <original title string>}.
|
||||||
|
- title must be the original title string.
|
||||||
|
|
||||||
|
Examples
|
||||||
|
|
||||||
|
Example A (chapters with sections)
|
||||||
|
Input:
|
||||||
|
["Chapter 1 Methods", "Section 1 Definition", "Section 2 Process", "Chapter 2 Experiment"]
|
||||||
|
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"structure":"1","title":"Chapter 1 Methods"},
|
||||||
|
{"structure":"2","title":"Section 1 Definition"},
|
||||||
|
{"structure":"2","title":"Section 2 Process"},
|
||||||
|
{"structure":"1","title":"Chapter 2 Experiment"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example B (parts with chapters)
|
||||||
|
Input:
|
||||||
|
["Part I Theory", "Chapter 1 Basics", "Chapter 2 Methods", "Part II Applications", "Chapter 3 Case Studies"]
|
||||||
|
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"structure":"1","title":"Part I Theory"},
|
||||||
|
{"structure":"2","title":"Chapter 1 Basics"},
|
||||||
|
{"structure":"2","title":"Chapter 2 Methods"},
|
||||||
|
{"structure":"1","title":"Part II Applications"},
|
||||||
|
{"structure":"2","title":"Chapter 3 Case Studies"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example C (plain headings)
|
||||||
|
Input:
|
||||||
|
["Introduction", "Background and Motivation", "Related Work", "Methodology", "Evaluation"]
|
||||||
|
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"structure":"1","title":"Introduction"},
|
||||||
|
{"structure":"2","title":"Background and Motivation"},
|
||||||
|
{"structure":"2","title":"Related Work"},
|
||||||
|
{"structure":"1","title":"Methodology"},
|
||||||
|
{"structure":"1","title":"Evaluation"}
|
||||||
|
]
|
||||||
@ -29,7 +29,7 @@ from rag.utils import encoder, num_tokens_from_string
|
|||||||
|
|
||||||
STOP_TOKEN="<|STOP|>"
|
STOP_TOKEN="<|STOP|>"
|
||||||
COMPLETE_TASK="complete_task"
|
COMPLETE_TASK="complete_task"
|
||||||
|
INPUT_UTILIZATION = 0.5
|
||||||
|
|
||||||
def get_value(d, k1, k2):
    """Look up ``k1`` in ``d``, falling back to ``k2`` (then ``None``) when ``k1`` is absent."""
    fallback = d.get(k2)
    return d.get(k1, fallback)
@ -439,9 +439,9 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
|
|||||||
return []
|
return []
|
||||||
|
|
||||||
|
|
||||||
def gen_json(system_prompt:str, user_prompt:str, chat_mdl):
|
def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None):
|
||||||
_, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
|
_, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
|
||||||
ans = chat_mdl.chat(msg[0]["content"], msg[1:])
|
ans = chat_mdl.chat(msg[0]["content"], msg[1:],gen_conf=gen_conf)
|
||||||
ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
|
ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
|
||||||
try:
|
try:
|
||||||
return json_repair.loads(ans)
|
return json_repair.loads(ans)
|
||||||
@ -649,4 +649,85 @@ def toc_transformer(toc_pages, chat_mdl):
|
|||||||
return last_complete
|
return last_complete
|
||||||
|
|
||||||
|
|
||||||
|
TOC_LEVELS = load_prompt("assign_toc_levels")


def assign_toc_levels(toc_secs, chat_mdl, gen_conf=None):
    """Ask the LLM to assign a depth label ("1", "2", ...) to each TOC item.

    Args:
        toc_secs: list of TOC entries (each at least ``{"title": str}``); its
            ``str()`` form is sent verbatim as the user prompt.
        chat_mdl: chat model; forwarded to ``gen_json`` which calls ``.chat()``.
        gen_conf: optional generation config forwarded to the model.
            Defaults to ``{"temperature": 0.2}``.

    Returns:
        The JSON parsed from the model's answer — expected to be a list of
        ``{"structure": "1|2|3", "title": <original title>}`` dicts.
    """
    # Use a None sentinel instead of a mutable dict default: a dict literal
    # in the signature would be shared across every call to this function.
    if gen_conf is None:
        gen_conf = {"temperature": 0.2}
    print("\nBegin TOC level assignment...\n")
    ans = gen_json(
        PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(),
        str(toc_secs),
        chat_mdl,
        gen_conf,
    )
    return ans
||||||
|
|
||||||
|
|
||||||
|
TOC_FROM_TEXT_SYSTEM = load_prompt("toc_from_text_system")
TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")


# Generate TOC from text chunks with text llms
def gen_toc_from_text(text, chat_mdl):
    """Extract TOC-like headings from plain text via the chat model.

    Runs the model deterministically (temperature 0, thinking disabled) so
    extraction is fast and repeatable; returns the parsed JSON answer.
    """
    system_prompt = PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render()
    user_prompt = PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text=text)
    deterministic_conf = {"temperature": 0.0, "top_p": 0.9, "enable_thinking": False, }
    return gen_json(system_prompt, user_prompt, chat_mdl, gen_conf=deterministic_conf)
||||||
|
|
||||||
|
|
||||||
|
def split_chunks(chunks, max_length: int):
    """
    Pack chunks into batches according to max_length, returning a list of
    batches, each [{"id": idx, "text": chunk_text}, ...].

    A single chunk is never split: a chunk whose token count alone exceeds
    max_length simply occupies a batch of its own.

    Args:
        chunks: sequence of text chunks.
        max_length: token budget per batch, measured with
            num_tokens_from_string.

    Returns:
        list[list[dict]]: batches of {"id": original index, "text": chunk}.
    """
    result = []
    batch, batch_tokens = [], 0

    for idx, chunk in enumerate(chunks):
        t = num_tokens_from_string(chunk)
        # Flush before the batch would overflow. The `batch` guard fixes a
        # bug where an oversized chunk arriving on an empty batch caused an
        # empty list to be appended to the result.
        if batch and batch_tokens + t > max_length:
            result.append(batch)
            batch, batch_tokens = [], 0
        batch.append({"id": idx, "text": chunk})
        batch_tokens += t
    if batch:
        result.append(batch)
    return result
||||||
|
|
||||||
|
|
||||||
|
def run_toc_from_text(chunks, chat_mdl):
    """Build a leveled table of contents from plain-text chunks.

    Pipeline: batch the chunks to fit the model's context window, extract
    candidate headings per batch, drop non-heading entries (title == "-1"),
    ask the LLM to assign hierarchy levels, then merge levels and source
    chunk ids back together by position.

    Args:
        chunks: sequence of plain-text chunks.
        chat_mdl: chat model (provides .max_length and .chat()).

    Returns:
        list[dict]: [{"structure": depth str, "title": str, "content": chunk id}].
    """
    # Reserve INPUT_UTILIZATION of the context window for input, minus the
    # static prompt overhead.
    input_budget = int(chat_mdl.max_length * INPUT_UTILIZATION) - num_tokens_from_string(
        TOC_FROM_TEXT_USER + TOC_FROM_TEXT_SYSTEM
    )

    # Cap the per-batch budget at 2000 tokens.
    input_budget = min(input_budget, 2000)
    chunk_sections = split_chunks(chunks, input_budget)
    res = []

    for chunk in chunk_sections:
        ans = gen_toc_from_text(chunk, chat_mdl)
        # json_repair may yield a non-list on a malformed answer; skip it
        # rather than corrupt the accumulated entries.
        if isinstance(ans, list):
            res.extend(ans)

    # Filter out entries the extractor marked as non-headings (title == "-1").
    filtered = [x for x in res if x.get("title") and x.get("title") != "-1"]

    print("\n\nFiltered TOC sections:\n", filtered)

    # Seed every entry at depth "0"; the LLM overwrites with real levels.
    raw_structure = [{"structure": "0", "title": x.get("title", "")} for x in filtered]

    # Assign hierarchy levels using LLM (deterministic, thinking disabled).
    toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9, "enable_thinking": False})

    # Merge structure and content by index (zip also truncates safely if the
    # LLM returned fewer items than it was given).
    merged = []
    for toc_item, src_item in zip(toc_with_levels, filtered):
        merged.append({
            "structure": toc_item.get("structure", "0"),
            "title": toc_item.get("title", ""),
            "content": src_item.get("content", ""),
        })

    return merged
||||||
113
rag/prompts/toc_from_text_system.md
Normal file
113
rag/prompts/toc_from_text_system.md
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
You are a robust Table-of-Contents (TOC) extractor.
|
||||||
|
|
||||||
|
GOAL
|
||||||
|
Given a dictionary of chunks {chunk_id: chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
|
||||||
|
[
|
||||||
|
{"title": , "content": ""},
|
||||||
|
...
|
||||||
|
]
|
||||||
|
|
||||||
|
FIELDS
|
||||||
|
- "title": the heading text (clean, no page numbers or leader dots).
|
||||||
|
- If any part of a chunk has no valid heading, output that part as {"title":"-1", ...}.
|
||||||
|
- "content": the chunk_id (string).
|
||||||
|
- One chunk can yield multiple JSON objects in order (unmatched text + one or more headings).
|
||||||
|
|
||||||
|
RULES
|
||||||
|
1) Preserve input chunk order strictly.
|
||||||
|
2) If a chunk contains multiple headings, expand them in order:
|
||||||
|
- Pre-heading narrative → {"title":"-1","content":chunk_id}
|
||||||
|
- Then each heading → {"title":"...","content":chunk_id}
|
||||||
|
3) Do not merge outputs across chunks; each object refers to exactly one chunk_id.
|
||||||
|
4) "title" must be non-empty (or exactly "-1"). "content" must be a string (chunk_id).
|
||||||
|
5) When ambiguous, prefer "-1" unless the text strongly looks like a heading.
|
||||||
|
|
||||||
|
HEADING DETECTION (cues, not hard rules)
|
||||||
|
- Appears near line start, short isolated phrase, often followed by content.
|
||||||
|
- May contain separators: — —— - : : · •
|
||||||
|
- Numbering styles:
|
||||||
|
• 第[一二三四五六七八九十百]+(篇|章|节|条)
|
||||||
|
• [((]?[一二三四五六七八九十]+[))]?
|
||||||
|
• [((]?[①②③④⑤⑥⑦⑧⑨⑩][))]?
|
||||||
|
• ^\d+(\.\d+)*[)..]?\s*
|
||||||
|
• ^[IVXLCDM]+[).]
|
||||||
|
• ^[A-Z][).]
|
||||||
|
- Canonical section cues (general only):
|
||||||
|
Common heading indicators include words such as:
|
||||||
|
"Overview", "Introduction", "Background", "Purpose", "Scope", "Definition",
|
||||||
|
"Method", "Procedure", "Result", "Discussion", "Summary", "Conclusion",
|
||||||
|
"Appendix", "Reference", "Annex", "Acknowledgment", "Disclaimer".
|
||||||
|
These are soft cues, not strict requirements.
|
||||||
|
- Length restriction:
|
||||||
|
• Chinese heading: ≤25 characters
|
||||||
|
• English heading: ≤80 characters
|
||||||
|
- Exclude long narrative sentences, continuous prose, or bullet-style lists → output as "-1".
|
||||||
|
|
||||||
|
OUTPUT FORMAT
|
||||||
|
- Return ONLY a valid JSON array of {"title","content"} objects.
|
||||||
|
- No reasoning or commentary.
|
||||||
|
|
||||||
|
EXAMPLES
|
||||||
|
|
||||||
|
Example 1 — No heading
|
||||||
|
Input:
|
||||||
|
{0: "Copyright page · Publication info (ISBN 123-456). All rights reserved."}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"-1","content":"0"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example 2 — One heading
|
||||||
|
Input:
|
||||||
|
{1: "Chapter 1: General Provisions This chapter defines the overall rules…"}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"Chapter 1: General Provisions","content":"1"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example 3 — Narrative + heading
|
||||||
|
Input:
|
||||||
|
{2: "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"-1","content":"2"},
|
||||||
|
{"title":"Section 2: Definitions","content":"2"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example 4 — Multiple headings in one chunk
|
||||||
|
Input:
|
||||||
|
{3: "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"Declarations and Commitments (I)","content":"3"},
|
||||||
|
{"title":"(II)","content":"3"},
|
||||||
|
{"title":"Appendix A","content":"3"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example 5 — Numbering styles
|
||||||
|
Input:
|
||||||
|
{4: "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"1. Scope","content":"4"},
|
||||||
|
{"title":"2) Definitions","content":"4"},
|
||||||
|
{"title":"III) Methods","content":"4"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example 6 — Long list (NOT headings)
|
||||||
|
Input:
|
||||||
|
{5: "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"-1","content":"5"}
|
||||||
|
]
|
||||||
|
|
||||||
|
Example 7 — Mixed Chinese/English
|
||||||
|
Input:
|
||||||
|
{6: "(出版信息略)This standard follows industry practices. Chapter 1: Overview 摘要… 第2节:术语与缩略语"}
|
||||||
|
Output:
|
||||||
|
[
|
||||||
|
{"title":"-1","content":"6"},
|
||||||
|
{"title":"Chapter 1: Overview","content":"6"},
|
||||||
|
{"title":"第2节:术语与缩略语","content":"6"}
|
||||||
|
]
|
||||||
8
rag/prompts/toc_from_text_user.md
Normal file
8
rag/prompts/toc_from_text_user.md
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
OUTPUT FORMAT
|
||||||
|
- Return ONLY the JSON array.
|
||||||
|
- Use double quotes.
|
||||||
|
- No extra commentary.
|
||||||
|
- Keep language of "title" the same as the input.
|
||||||
|
|
||||||
|
INPUT
|
||||||
|
{{text}}
|
||||||
Reference in New Issue
Block a user