diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py
index e43362ac2..81e1a3459 100644
--- a/rag/llm/chat_model.py
+++ b/rag/llm/chat_model.py
@@ -133,6 +133,7 @@ class Base(ABC):
             "logprobs",
             "top_logprobs",
             "extra_headers",
+            "enable_thinking"
         }
         gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
 
diff --git a/rag/prompts/assign_toc_levels.md b/rag/prompts/assign_toc_levels.md
new file mode 100644
index 000000000..fff0cd8b3
--- /dev/null
+++ b/rag/prompts/assign_toc_levels.md
@@ -0,0 +1,53 @@
+You are given a JSON array of TOC items. Each item has at least {"title": string} and may include an existing structure.
+
+Task
+- For each item, assign a depth label using Arabic numerals only: top-level = 1, second-level = 2, third-level = 3, etc.
+- Multiple items may share the same depth (e.g., many 1s, many 2s).
+- Do not use dotted numbering (no 1.1/1.2). Use a single digit string per item indicating its depth only.
+- Preserve the original item order exactly. Do not insert, delete, or reorder.
+- Decide levels yourself to keep a coherent hierarchy. Keep peers at the same depth.
+
+Output
+- Return a valid JSON array only (no extra text).
+- Each element must be {"structure": "1|2|3", "title": "<original title>"}.
+- title must be the original title string.
+ +Examples + +Example A (chapters with sections) +Input: +["Chapter 1 Methods", "Section 1 Definition", "Section 2 Process", "Chapter 2 Experiment"] + +Output: +[ + {"structure":"1","title":"Chapter 1 Methods"}, + {"structure":"2","title":"Section 1 Definition"}, + {"structure":"2","title":"Section 2 Process"}, + {"structure":"1","title":"Chapter 2 Experiment"} +] + +Example B (parts with chapters) +Input: +["Part I Theory", "Chapter 1 Basics", "Chapter 2 Methods", "Part II Applications", "Chapter 3 Case Studies"] + +Output: +[ + {"structure":"1","title":"Part I Theory"}, + {"structure":"2","title":"Chapter 1 Basics"}, + {"structure":"2","title":"Chapter 2 Methods"}, + {"structure":"1","title":"Part II Applications"}, + {"structure":"2","title":"Chapter 3 Case Studies"} +] + +Example C (plain headings) +Input: +["Introduction", "Background and Motivation", "Related Work", "Methodology", "Evaluation"] + +Output: +[ + {"structure":"1","title":"Introduction"}, + {"structure":"2","title":"Background and Motivation"}, + {"structure":"2","title":"Related Work"}, + {"structure":"1","title":"Methodology"}, + {"structure":"1","title":"Evaluation"} +] \ No newline at end of file diff --git a/rag/prompts/generator.py b/rag/prompts/generator.py index 89c9c5c1e..fa812c1ff 100644 --- a/rag/prompts/generator.py +++ b/rag/prompts/generator.py @@ -29,7 +29,7 @@ from rag.utils import encoder, num_tokens_from_string STOP_TOKEN="<|STOP|>" COMPLETE_TASK="complete_task" - +INPUT_UTILIZATION = 0.5 def get_value(d, k1, k2): return d.get(k1, d.get(k2)) @@ -439,9 +439,9 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list: return [] -def gen_json(system_prompt:str, user_prompt:str, chat_mdl): +def gen_json(system_prompt:str, user_prompt:str, chat_mdl, gen_conf = None): _, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length) - ans = chat_mdl.chat(msg[0]["content"], msg[1:]) + ans = chat_mdl.chat(msg[0]["content"], msg[1:],gen_conf=gen_conf) ans 
= re.sub(r"(^.*|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
     try:
         return json_repair.loads(ans)
@@ -649,4 +649,85 @@ def toc_transformer(toc_pages, chat_mdl):
     return last_complete
 
 
 
+TOC_LEVELS = load_prompt("assign_toc_levels")
+def assign_toc_levels(toc_secs, chat_mdl, gen_conf=None):
+    print("\nBegin TOC level assignment...\n")
+    ans = gen_json(
+        PROMPT_JINJA_ENV.from_string(TOC_LEVELS).render(),
+        str(toc_secs),
+        chat_mdl,
+        gen_conf if gen_conf is not None else {"temperature": 0.2}
+    )
+
+    return ans
+
+
+TOC_FROM_TEXT_SYSTEM = load_prompt("toc_from_text_system")
+TOC_FROM_TEXT_USER = load_prompt("toc_from_text_user")
+# Generate TOC from text chunks with text llms
+def gen_toc_from_text(text, chat_mdl):
+    ans = gen_json(
+        PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_SYSTEM).render(),
+        PROMPT_JINJA_ENV.from_string(TOC_FROM_TEXT_USER).render(text=text),
+        chat_mdl,
+        gen_conf={"temperature": 0.0, "top_p": 0.9, "enable_thinking": False, }
+    )
+    return ans
+
+
+def split_chunks(chunks, max_length: int):
+    """
+    Pack chunks into batches by max_length, returning a list of batches, each a list of {"id": idx, "text": chunk_text}.
+    Do not split a single chunk, even if it exceeds max_length.
+    """
+
+    result = []
+    batch, batch_tokens = [], 0
+
+    for idx, chunk in enumerate(chunks):
+        t = num_tokens_from_string(chunk)
+        if batch and batch_tokens + t > max_length:
+            result.append(batch)
+            batch, batch_tokens = [], 0
+        batch.append({"id": idx, "text": chunk})
+        batch_tokens += t
+    if batch:
+        result.append(batch)
+    return result
+
+
+def run_toc_from_text(chunks, chat_mdl):
+    input_budget = int(chat_mdl.max_length * INPUT_UTILIZATION) - num_tokens_from_string(
+        TOC_FROM_TEXT_USER + TOC_FROM_TEXT_SYSTEM
+    )
+
+    input_budget = 2000 if input_budget > 2000 else input_budget
+    chunk_sections = split_chunks(chunks, input_budget)
+    res = []
+
+    for chunk in chunk_sections:
+        ans = gen_toc_from_text(chunk, chat_mdl)
+        res.extend(ans)
+
+    # Filter out entries with title == -1
+    filtered = [x for x in res if x.get("title") and x.get("title") != "-1"]
+
+    print("\n\nFiltered TOC sections:\n", filtered)
+
+    # Generate initial structure (structure/title)
+    raw_structure = [{"structure": "0", "title": x.get("title", "")} for x in filtered]
+
+    # Assign hierarchy levels using LLM
+    toc_with_levels = assign_toc_levels(raw_structure, chat_mdl, {"temperature": 0.0, "top_p": 0.9, "enable_thinking": False})
+
+    # Merge structure and content (by index)
+    merged = []
+    for _ , (toc_item, src_item) in enumerate(zip(toc_with_levels, filtered)):
+        merged.append({
+            "structure": toc_item.get("structure", "0"),
+            "title": toc_item.get("title", ""),
+            "content": src_item.get("content", ""),
+        })
+
+    return merged
\ No newline at end of file
diff --git a/rag/prompts/toc_from_text_system.md b/rag/prompts/toc_from_text_system.md
new file mode 100644
index 000000000..f982df47d
--- /dev/null
+++ b/rag/prompts/toc_from_text_system.md
@@ -0,0 +1,113 @@
+You are a robust Table-of-Contents (TOC) extractor.
+
+GOAL
+Given a dictionary of chunks {chunk_id: chunk_text}, extract TOC-like headings and return a strict JSON array of objects:
+[
+  {"title": "<heading text>", "content": "<chunk_id>"},
+  ...
+] + +FIELDS +- "title": the heading text (clean, no page numbers or leader dots). + - If any part of a chunk has no valid heading, output that part as {"title":"-1", ...}. +- "content": the chunk_id (string). + - One chunk can yield multiple JSON objects in order (unmatched text + one or more headings). + +RULES +1) Preserve input chunk order strictly. +2) If a chunk contains multiple headings, expand them in order: + - Pre-heading narrative → {"title":"-1","content":chunk_id} + - Then each heading → {"title":"...","content":chunk_id} +3) Do not merge outputs across chunks; each object refers to exactly one chunk_id. +4) "title" must be non-empty (or exactly "-1"). "content" must be a string (chunk_id). +5) When ambiguous, prefer "-1" unless the text strongly looks like a heading. + +HEADING DETECTION (cues, not hard rules) +- Appears near line start, short isolated phrase, often followed by content. +- May contain separators: — —— - : : · • +- Numbering styles: + • 第[一二三四五六七八九十百]+(篇|章|节|条) + • [((]?[一二三四五六七八九十]+[))]? + • [((]?[①②③④⑤⑥⑦⑧⑨⑩][))]? + • ^\d+(\.\d+)*[)..]?\s* + • ^[IVXLCDM]+[).] + • ^[A-Z][).] +- Canonical section cues (general only): + Common heading indicators include words such as: + "Overview", "Introduction", "Background", "Purpose", "Scope", "Definition", + "Method", "Procedure", "Result", "Discussion", "Summary", "Conclusion", + "Appendix", "Reference", "Annex", "Acknowledgment", "Disclaimer". + These are soft cues, not strict requirements. +- Length restriction: + • Chinese heading: ≤25 characters + • English heading: ≤80 characters +- Exclude long narrative sentences, continuous prose, or bullet-style lists → output as "-1". + +OUTPUT FORMAT +- Return ONLY a valid JSON array of {"title","content"} objects. +- No reasoning or commentary. + +EXAMPLES + +Example 1 — No heading +Input: +{0: "Copyright page · Publication info (ISBN 123-456). 
All rights reserved."} +Output: +[ + {"title":"-1","content":"0"} +] + +Example 2 — One heading +Input: +{1: "Chapter 1: General Provisions This chapter defines the overall rules…"} +Output: +[ + {"title":"Chapter 1: General Provisions","content":"1"} +] + +Example 3 — Narrative + heading +Input: +{2: "This paragraph introduces the background and goals. Section 2: Definitions Key terms are explained…"} +Output: +[ + {"title":"-1","content":"2"}, + {"title":"Section 2: Definitions","content":"2"} +] + +Example 4 — Multiple headings in one chunk +Input: +{3: "Declarations and Commitments (I) Party B commits… (II) Party C commits… Appendix A Data Specification"} +Output: +[ + {"title":"Declarations and Commitments (I)","content":"3"}, + {"title":"(II)","content":"3"}, + {"title":"Appendix A","content":"3"} +] + +Example 5 — Numbering styles +Input: +{4: "1. Scope: Defines boundaries. 2) Definitions: Terms used. III) Methods Overview."} +Output: +[ + {"title":"1. Scope","content":"4"}, + {"title":"2) Definitions","content":"4"}, + {"title":"III) Methods","content":"4"} +] + +Example 6 — Long list (NOT headings) +Input: +{5: "Item list: apples, bananas, strawberries, blueberries, mangos, peaches"} +Output: +[ + {"title":"-1","content":"5"} +] + +Example 7 — Mixed Chinese/English +Input: +{6: "(出版信息略)This standard follows industry practices. Chapter 1: Overview 摘要… 第2节:术语与缩略语"} +Output: +[ + {"title":"-1","content":"6"}, + {"title":"Chapter 1: Overview","content":"6"}, + {"title":"第2节:术语与缩略语","content":"6"} +] diff --git a/rag/prompts/toc_from_text_user.md b/rag/prompts/toc_from_text_user.md new file mode 100644 index 000000000..952d8eff2 --- /dev/null +++ b/rag/prompts/toc_from_text_user.md @@ -0,0 +1,8 @@ +OUTPUT FORMAT +- Return ONLY the JSON array. +- Use double quotes. +- No extra commentary. +- Keep language of "title" the same as the input. + +INPUT +{{text}}