Feat: Use data pipeline to visualize the parsing configuration of the knowledge base (#10423)

### What problem does this PR solve? #9869 ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Signed-off-by: dependabot[bot] <support@github.com> Signed-off-by: jinhai <haijin.chn@gmail.com> Signed-off-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: chanx <1243304602@qq.com> Co-authored-by: balibabu <cike8899@users.noreply.github.com> Co-authored-by: Lynn <lynn_inf@hotmail.com> Co-authored-by: 纷繁下的无奈 <zhileihuang@126.com> Co-authored-by: huangzl <huangzl@shinemo.com> Co-authored-by: writinwaters <93570324+writinwaters@users.noreply.github.com> Co-authored-by: Wilmer <33392318@qq.com> Co-authored-by: Adrian Weidig <adrianweidig@gmx.net> Co-authored-by: Zhichang Yu <yuzhichang@gmail.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Co-authored-by: Yongteng Lei <yongtengrey@outlook.com> Co-authored-by: Liu An <asiro@qq.com> Co-authored-by: buua436 <66937541+buua436@users.noreply.github.com> Co-authored-by: BadwomanCraZY <511528396@qq.com> Co-authored-by: cucusenok <31804608+cucusenok@users.noreply.github.com> Co-authored-by: Russell Valentine <russ@coldstonelabs.org> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Billy Bao <newyorkupperbay@gmail.com> Co-authored-by: Zhedong Cen <cenzhedong2@126.com> Co-authored-by: TensorNull <129579691+TensorNull@users.noreply.github.com> Co-authored-by: TensorNull <tensor.null@gmail.com> Co-authored-by: TeslaZY <TeslaZY@outlook.com> Co-authored-by: Ajay <160579663+aybanda@users.noreply.github.com> Co-authored-by: AB <aj@Ajays-MacBook-Air.local> Co-authored-by: 天海蒼灆 <huangaoqin@tecpie.com> Co-authored-by: He Wang <wanghechn@qq.com> Co-authored-by: Atsushi Hatakeyama <atu729@icloud.com> Co-authored-by: Jin Hai <haijin.chn@gmail.com> Co-authored-by: Mohamed Mathari <155896313+melmathari@users.noreply.github.com> Co-authored-by: Mohamed Mathari <nocodeventure@Mac-mini-van-Mohamed.fritz.box> Co-authored-by: Stephen 
Hu <stephenhu@seismic.com> Co-authored-by: Shaun Zhang <zhangwfjh@users.noreply.github.com> Co-authored-by: zhimeng123 <60221886+zhimeng123@users.noreply.github.com> Co-authored-by: mxc <mxc@example.com> Co-authored-by: Dominik Novotný <50611433+SgtMarmite@users.noreply.github.com> Co-authored-by: EVGENY M <168018528+rjohny55@users.noreply.github.com> Co-authored-by: mcoder6425 <mcoder64@gmail.com> Co-authored-by: lemsn <lemsn@msn.com> Co-authored-by: lemsn <lemsn@126.com> Co-authored-by: Adrian Gora <47756404+adagora@users.noreply.github.com> Co-authored-by: Womsxd <45663319+Womsxd@users.noreply.github.com> Co-authored-by: FatMii <39074672+FatMii@users.noreply.github.com>
2026-02-02 16:45:08 +08:00 · 2025-10-09 12:36:19 +08:00
parent ef0aecea3b
commit cbf04ee470
490 changed files with 10630 additions and 30688 deletions
--- a/rag/prompts/generator.py
+++ b/rag/prompts/generator.py
@ -436,4 +436,217 @@ def gen_meta_filter(chat_mdl, meta_data:dict, query: str) -> list:
        return ans
    except Exception:
        logging.exception(f"Loading json failure: {ans}")
-    return []
+    return []
+
+
+def gen_json(system_prompt:str, user_prompt:str, chat_mdl):
+    _, msg = message_fit_in(form_message(system_prompt, user_prompt), chat_mdl.max_length)
+    ans = chat_mdl.chat(msg[0]["content"], msg[1:])
+    ans = re.sub(r"(^.*</think>|```json\n|```\n*$)", "", ans, flags=re.DOTALL)
+    try:
+        return json_repair.loads(ans)
+    except Exception:
+        logging.exception(f"Loading json failure: {ans}")
+
+
+TOC_DETECTION = load_prompt("toc_detection")
+def detect_table_of_contents(page_1024:list[str], chat_mdl):
+    toc_secs = []
+    for i, sec in enumerate(page_1024[:22]):
+        ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_DETECTION).render(page_txt=sec), "Only JSON please.", chat_mdl)
+        if toc_secs and not ans["exists"]:
+            break
+        toc_secs.append(sec)
+    return toc_secs
+
+
+TOC_EXTRACTION = load_prompt("toc_extraction")
+TOC_EXTRACTION_CONTINUE = load_prompt("toc_extraction_continue")
+def extract_table_of_contents(toc_pages, chat_mdl):
+    if not toc_pages:
+        return []
+
+    return gen_json(PROMPT_JINJA_ENV.from_string(TOC_EXTRACTION).render(toc_page="\n".join(toc_pages)), "Only JSON please.", chat_mdl)
+
+
+def toc_index_extractor(toc:list[dict], content:str, chat_mdl):
+    """Ask the chat model to attach a ``physical_index`` to TOC entries.
+
+    The fixed instructions below are concatenated with the JSON dump of
+    ``toc`` and with ``content``, and the result is sent as the system prompt
+    through ``gen_json`` (user prompt: "Only JSON please.").
+
+    Args:
+        toc: Table-of-contents entries; serialised with ``json.dumps``.
+        content: Document page text appended verbatim to the prompt.
+        chat_mdl: Chat model handed to ``gen_json``.
+
+    Returns:
+        Whatever ``gen_json`` returns for the assembled prompt.
+    """
+    # NOTE(review): the prompt names `<physical_index_X>` twice; the second
+    # occurrence was presumably meant to be a closing tag - confirm.
+    tob_extractor_prompt = """
+    You are given a table of contents in a json format and several pages of a document, your job is to add the physical_index to the table of contents in the json format.
+
+    The provided pages contains tags like <physical_index_X> and <physical_index_X> to indicate the physical location of the page X.
+
+    The structure variable is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
+
+    The response should be in the following JSON format: 
+    [
+        {
+            "structure": <structure index, "x.x.x" or None> (string),
+            "title": <title of the section>,
+            "physical_index": "<physical_index_X>" (keep the format)
+        },
+        ...
+    ]
+
+    Only add the physical_index to the sections that are in the provided pages.
+    If the title of the section are not in the provided pages, do not add the physical_index to it.
+    Directly return the final JSON structure. Do not output anything else."""
+
+    # Instructions first, then the TOC as pretty-printed JSON, then the raw pages.
+    prompt = tob_extractor_prompt + '\nTable of contents:\n' + json.dumps(toc, ensure_ascii=False, indent=2) + '\nDocument pages:\n' + content
+    return gen_json(prompt, "Only JSON please.", chat_mdl)
+
+
+TOC_INDEX = load_prompt("toc_index")
+def table_of_contents_index(toc_arr: list[dict], sections: list[str], chat_mdl):
+    if not toc_arr or not sections:
+        return []
+
+    toc_map = {}
+    for i, it in enumerate(toc_arr):
+        k1 = (it["structure"]+it["title"]).replace(" ", "")
+        k2 = it["title"].strip()
+        if k1 not in toc_map:
+            toc_map[k1] = []
+        if k2 not in toc_map:
+            toc_map[k2] = []
+        toc_map[k1].append(i)
+        toc_map[k2].append(i)
+
+    for it in toc_arr:
+        it["indices"] = []
+    for i, sec in enumerate(sections):
+        sec = sec.strip()
+        if sec.replace(" ", "") in toc_map:
+            for j in toc_map[sec.replace(" ", "")]:
+                toc_arr[j]["indices"].append(i)
+
+    all_pathes = []
+    def dfs(start, path):
+        nonlocal all_pathes
+        if start >= len(toc_arr):
+            if path:
+                all_pathes.append(path)
+            return
+        if not toc_arr[start]["indices"]:
+            dfs(start+1, path)
+            return
+        added = False
+        for j in toc_arr[start]["indices"]:
+            if path and j < path[-1][0]:
+                continue
+            _path = deepcopy(path)
+            _path.append((j, start))
+            added = True
+            dfs(start+1, _path)
+        if not added and path:
+            all_pathes.append(path)
+
+    dfs(0, [])
+    path = max(all_pathes, key=lambda x:len(x))
+    for it in toc_arr:
+        it["indices"] = []
+    for j, i in path:
+        toc_arr[i]["indices"] = [j]
+    print(json.dumps(toc_arr, ensure_ascii=False, indent=2))
+
+    i = 0
+    while i < len(toc_arr):
+        it  = toc_arr[i]
+        if it["indices"]:
+            i += 1
+            continue
+
+        if i>0 and toc_arr[i-1]["indices"]:
+            st_i = toc_arr[i-1]["indices"][-1]
+        else:
+            st_i = 0
+        e = i + 1
+        while e <len(toc_arr) and not toc_arr[e]["indices"]:
+            e += 1
+        if e >= len(toc_arr):
+            e = len(sections)
+        else:
+            e = toc_arr[e]["indices"][0]
+
+        for j in range(st_i, min(e+1, len(sections))):
+            ans = gen_json(PROMPT_JINJA_ENV.from_string(TOC_INDEX).render(
+                structure=it["structure"],
+                title=it["title"],
+                text=sections[j]), "Only JSON please.", chat_mdl)
+            if ans["exist"] == "yes":
+                it["indices"].append(j)
+                break
+
+        i += 1
+
+    return toc_arr
+
+
+def check_if_toc_transformation_is_complete(content, toc, chat_mdl):
+    prompt = """
+    You are given a raw table of contents and a  table of contents.
+    Your job is to check if the  table of contents is complete.
+
+    Reply format:
+    {{
+        "thinking": <why do you think the cleaned table of contents is complete or not>
+        "completed": "yes" or "no"
+    }}
+    Directly return the final JSON structure. Do not output anything else."""
+
+    prompt = prompt + '\n Raw Table of contents:\n' + content + '\n Cleaned Table of contents:\n' + toc
+    response = gen_json(prompt, "Only JSON please.", chat_mdl)
+    return response['completed']
+
+
+def toc_transformer(toc_pages, chat_mdl):
+    init_prompt = """
+    You are given a table of contents, You job is to transform the whole table of content into a JSON format included table_of_contents.
+
+    The `structure` is the numeric system which represents the index of the hierarchy section in the table of contents. For example, the first section has structure index 1, the first subsection has structure index 1.1, the second subsection has structure index 1.2, etc.
+    The `title` is a short phrase or a several-words term.
+    
+    The response should be in the following JSON format: 
+    [
+        {
+            "structure": <structure index, "x.x.x" or None> (string),
+            "title": <title of the section>
+        },
+        ...
+    ],
+    You should transform the full table of contents in one go.
+    Directly return the final JSON structure, do not output anything else. """
+
+    toc_content = "\n".join(toc_pages)
+    prompt = init_prompt + '\n Given table of contents\n:' + toc_content
+    def clean_toc(arr):
+        for a in arr:
+            a["title"] = re.sub(r"[.·….]{2,}", "", a["title"])
+    last_complete = gen_json(prompt, "Only JSON please.", chat_mdl)
+    if_complete = check_if_toc_transformation_is_complete(toc_content, json.dumps(last_complete, ensure_ascii=False, indent=2), chat_mdl)
+    clean_toc(last_complete)
+    if if_complete == "yes":
+        return last_complete
+
+    while not (if_complete == "yes"):
+        prompt = f"""
+        Your task is to continue the table of contents json structure, directly output the remaining part of the json structure.
+        The response should be in the following JSON format: 
+
+        The raw table of contents json structure is:
+        {toc_content}
+
+        The incomplete transformed table of contents json structure is:
+        {json.dumps(last_complete[-24:], ensure_ascii=False, indent=2)}
+
+        Please continue the json structure, directly output the remaining part of the json structure."""
+        new_complete = gen_json(prompt, "Only JSON please.", chat_mdl)
+        if not new_complete or str(last_complete).find(str(new_complete)) >= 0:
+            break
+        clean_toc(new_complete)
+        last_complete.extend(new_complete)
+        if_complete = check_if_toc_transformation_is_complete(toc_content, json.dumps(last_complete, ensure_ascii=False, indent=2), chat_mdl)
+
+    return last_complete
+
+
+
--- a/rag/prompts/toc_detection.md
+++ b/rag/prompts/toc_detection.md
@ -0,0 +1,29 @@
+You are an AI assistant designed to analyze text content and detect whether a table of contents (TOC) list exists on the given page. Follow these steps:  
+
+1. **Analyze the Input**: Carefully review the provided text content.  
+2. **Identify Key Features**: Look for common indicators of a TOC, such as:  
+   - Section titles or headings paired with page numbers.
+   - Patterns like repeated formatting (e.g., bold/italicized text, dots/dashes between titles and numbers).  
+   - Phrases like "Table of Contents," "Contents," or similar headings.  
+   - Logical grouping of topics/subtopics with sequential page references.  
+3. **Discern Negative Features**:
+   - The text contains no numbers, or the numbers present are clearly not page references (e.g., dates, statistical figures, phone numbers, version numbers).
+   - The text consists of full, descriptive sentences and paragraphs that form a narrative, present arguments, or explain concepts, rather than succinctly listing topics.
+   - Contains citations with authors, publication years, journal titles, and page ranges (e.g., "Smith, J. (2020). Journal Title, 10(2), 45-67.").
+   - Lists keywords or terms followed by multiple page numbers, often in alphabetical order.
+   - Comprises terms followed by their definitions or explanations.
+   - Labeled with headers like "Appendix A," "Appendix B," etc.
+   - Contains expressive language thanking individuals or organizations for their support or contributions.
+4. **Evaluate Evidence**: Weigh the presence/absence of these features to determine if the content resembles a TOC.
+5. **Output Format**: Provide your response in the following JSON structure:  
+   ```json  
+   {  
+     "reasoning": "Step-by-step explanation of your analysis based on the features identified." ,
+     "exists": true/false
+   }  
+   ```  
+6. **DO NOT** output anything else except JSON structure.
+
+**Input text Content ( Text-Only Extraction ):**  
+{{ page_txt }} 
+
--- a/rag/prompts/toc_extraction.md
+++ b/rag/prompts/toc_extraction.md
@ -0,0 +1,53 @@
+You are an expert parser and data formatter. Your task is to analyze the provided table of contents (TOC) text and convert it into a valid JSON array of objects.
+
+**Instructions:**
+1.  Analyze each line of the input TOC.
+2.  For each line, extract the following two pieces of information:
+    *   `structure`: The hierarchical index/numbering (e.g., "1", "2.1", "3.2.5", "A.1"). If a line has no visible numbering or structure indicator (like a main "Chapter" title), use `null`.
+    *   `title`: The textual title of the section or chapter. This should be the main descriptive text, clean and without the page number.
+3.  Output **only** a valid JSON array. Do not include any other text, explanations, or markdown code block fences (like ```json) in your response.
+
+**JSON Format:**
+The output must be a list of objects following this exact schema:
+```json
+[
+    {
+        "structure": <structure index, "x.x.x" or None> (string),
+        "title": <title of the section>
+    },
+    ...
+]
+```
+
+**Input Example:**
+```
+Contents
+1 Introduction to the System ... 1
+1.1 Overview .... 2
+1.2 Key Features .... 5
+2 Installation Guide ....8
+2.1 Prerequisites ........ 9
+2.2 Step-by-Step Process ........ 12
+Appendix A: Specifications ..... 45
+References ... 47
+```
+
+**Expected Output For The Example:**
+```json
+[
+    {"structure": null, "title": "Contents"},
+    {"structure": "1", "title": "Introduction to the System"},
+    {"structure": "1.1", "title": "Overview"},
+    {"structure": "1.2", "title": "Key Features"},
+    {"structure": "2", "title": "Installation Guide"},
+    {"structure": "2.1", "title": "Prerequisites"},
+    {"structure": "2.2", "title": "Step-by-Step Process"},
+    {"structure": "A", "title": "Specifications"},
+    {"structure": null, "title": "References"}
+]
+```
+
+**Now, process the following TOC input:**
+```
+{{ toc_page }}
+```
--- a/rag/prompts/toc_extraction_continue.md
+++ b/rag/prompts/toc_extraction_continue.md
@ -0,0 +1,60 @@
+You are an expert parser and data formatter, currently in the process of building a JSON array from a multi-page table of contents (TOC). Your task is to analyze the new page of content and **append** the new entries to the existing JSON array.
+
+**Instructions:**
+1.  You will be given two inputs:
+    *   `current_page_text`: The text content from the new page of the TOC.
+    *   `existing_json`: The valid JSON array you have generated from the previous pages.
+2.  Analyze each line of the `current_page_text` input.
+3.  For each new line, extract the following three pieces of information:
+    *   `structure`: The hierarchical index/numbering (e.g., "1", "2.1", "3.2.5"). Use `null` if none exists.
+    *   `title`: The clean textual title of the section or chapter.
+    *   `page`: The page number on which the section starts. Extract only the number. Use `null` if not present.
+4.  **Append these new entries** to the `existing_json` array. Do not modify, reorder, or delete any of the existing entries.
+5.  Output **only** the complete, updated JSON array. Do not include any other text, explanations, or markdown code block fences (like ```json).
+
+**JSON Format:**
+The output must be a valid JSON array following this schema:
+```json
+[
+    {
+        "structure": <string or null>,
+        "title": <string>,
+        "page": <number or null>
+    },
+    ...
+]
+```
+
+**Input Example:**
+`current_page_text`:
+```
+3.2 Advanced Configuration ........... 25
+3.3 Troubleshooting .................. 28
+4 User Management .................... 30
+```
+
+`existing_json`:
+```json
+[
+    {"structure": "1", "title": "Introduction", "page": 1},
+    {"structure": "2", "title": "Installation", "page": 5},
+    {"structure": "3", "title": "Configuration", "page": 12},
+    {"structure": "3.1", "title": "Basic Setup", "page": 15}
+]
+```
+
+**Expected Output For The Example:**
+```json
+[
+    {"structure": "3.2", "title": "Advanced Configuration", "page": 25},
+    {"structure": "3.3", "title": "Troubleshooting", "page": 28},
+    {"structure": "4", "title": "User Management", "page": 30}
+]
+```
+
+**Now, process the following inputs:**
+`current_page_text`:
+{{ toc_page }}
+
+`existing_json`:
+{{ toc_json }}
--- a/rag/prompts/toc_index.md
+++ b/rag/prompts/toc_index.md
@ -0,0 +1,20 @@
+You are an expert analyst tasked with matching text content to the title.
+
+**Instructions:**
+1. Analyze the given title with its numeric structure index and the provided text.
+2. Determine whether the title is mentioned as a section title in the given text.
+3. Provide a concise, step-by-step reasoning for your decision.
+4. Output **only** the complete JSON object. Do not include any other text, explanations, or markdown code block fences (like ```json).
+
+**Output Format:**
+Your output must be a valid JSON object with the following keys:
+{
+"reasoning": "Step-by-step explanation of your analysis.",
+"exist": "<yes or no>"
+}
+
+** The title: **
+{{ structure }} {{ title }}
+
+** Given text: **
+{{ text }}