Fix: prevent context loss caused by separating Markdown tables from the original text (#8844)

### What problem does this PR solve?

Fix context loss caused by separating Markdown tables from the original text. Related issues: #6871, #8804.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-01-31 23:55:06 +08:00 · 2025-07-15 13:03:01 +08:00
parent c08ed28f09
commit 51a8604dcb
2 changed files with 73 additions and 44 deletions
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@ -17,13 +17,33 @@
 import re
 from markdown import markdown
 class RAGFlowMarkdownParser:
    def __init__(self, chunk_token_num=128):
        """Create a parser, storing ``chunk_token_num`` coerced to ``int``."""
        self.chunk_token_num = int(chunk_token_num)
-    def extract_tables_and_remainder(self, markdown_text):
+    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        tables = []
-        remainder = markdown_text
+        working_text = markdown_text
        def replace_tables_with_rendered_html(pattern, table_list, render=True):
            """Collect every match of ``pattern`` and rewrite the text around it.

            Reads ``working_text`` and ``separate_tables`` from the enclosing
            scope. Each matched table is appended to ``table_list`` in its raw
            form. Despite the name, the match is only replaced by rendered HTML
            when ``separate_tables`` is false; otherwise it is removed from the
            text.

            Args:
                pattern: Compiled regex whose matches are treated as tables.
                table_list: List that receives the raw text of every match.
                render: When true (and ``separate_tables`` is false), convert
                    the raw table with ``markdown`` and its tables extension;
                    when false, re-insert the raw match unchanged.

            Returns:
                The rewritten text; ``working_text`` itself is not reassigned
                here, so the caller must store the result.
            """
            new_text = ""
            # End offset of the previous match; text between matches is copied verbatim.
            last_end = 0
            for match in pattern.finditer(working_text):
                raw_table = match.group()
                # The raw table is recorded in both modes, not only when it is removed.
                table_list.append(raw_table)
                if separate_tables:
                    # Skip this match (i.e., remove it), leaving "\n\n" in its place
                    new_text += working_text[last_end:match.start()] + "\n\n"
                else:
                    # Replace with rendered HTML (or the raw table when render is false)
                    html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
                    new_text += working_text[last_end:match.start()] + html_table + "\n\n"
                last_end = match.end()
            # Append whatever follows the final match (the whole text if nothing matched).
            new_text += working_text[last_end:]
            return new_text
        if "|" in markdown_text: # for optimize performance
            # Standard Markdown table
            border_table_pattern = re.compile(
@ -33,9 +53,7 @@ class RAGFlowMarkdownParser:
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
            ''', re.VERBOSE)
-            border_tables = border_table_pattern.findall(markdown_text)
+            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
            tables.extend(border_tables)
            remainder = border_table_pattern.sub('', remainder)
            # Borderless Markdown table
            no_border_table_pattern = re.compile(
@ -45,11 +63,9 @@ class RAGFlowMarkdownParser:
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                ''', re.VERBOSE)
-            no_border_tables = no_border_table_pattern.findall(remainder)
+            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
            tables.extend(no_border_tables)
            remainder = no_border_table_pattern.sub('', remainder)
-        if "<table>" in remainder.lower(): # for optimize performance
+        if "<table>" in working_text.lower(): # for optimize performance
            #HTML table extraction - handle possible html/body wrapper tags
            html_table_pattern = re.compile(
            r'''
@ -70,8 +86,21 @@ class RAGFlowMarkdownParser:
            ''',
            re.VERBOSE | re.DOTALL | re.IGNORECASE
            )
-            html_tables = html_table_pattern.findall(remainder)
+            def replace_html_tables():
-            tables.extend(html_tables)
+                nonlocal working_text
-            remainder = html_table_pattern.sub('', remainder)
+                new_text = ""
                last_end = 0
                for match in html_table_pattern.finditer(working_text):
                    raw_table = match.group()
                    tables.append(raw_table)
                    if separate_tables:
                        new_text += working_text[last_end:match.start()] + "\n\n"
                    else:
                        new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
                    last_end = match.end()
                new_text += working_text[last_end:]
                working_text = new_text
-        return remainder, tables
+            replace_html_tables()
        return working_text, tables
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@ -323,14 +323,14 @@ class Markdown(MarkdownParser):
        return images if images else None
-    def __call__(self, filename, binary=None):
+    def __call__(self, filename, binary=None, separate_tables=True):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()
-        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
+        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
        sections = []
        tbls = []
        for sec in remainder.split("\n"):
@ -465,7 +465,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary)
+        sections, tables = markdown_parser(filename, binary, separate_tables=False)
        # Process images for each section
        section_images = []