Feat: advanced markdown parsing (#9607)

### What problem does this PR solve?

Use AST-based parsing to handle markdown more accurately, preventing block
elements (headers, code blocks, lists, blockquotes) from being cut off
mid-element during chunking. Fixes #9564.

<img width="1746" height="993" alt="image"
src="https://github.com/user-attachments/assets/4aaf4bf6-5714-4d48-a9cf-864f59633f7f"
/>

<img width="1739" height="982" alt="image"
src="https://github.com/user-attachments/assets/dc00233f-7a55-434f-bbb7-74ce7f57a6cf"
/>

<img width="559" height="100" alt="image"
src="https://github.com/user-attachments/assets/4a556b5b-d9c6-4544-a486-8ac342bd504e"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-08-21 09:36:18 +08:00
committed by GitHub
parent 4080f6a54a
commit 382458ace7
3 changed files with 196 additions and 28 deletions

View File

@ -14,13 +14,15 @@
# limitations under the License. # limitations under the License.
# #
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
from .docx_parser import RAGFlowDocxParser as DocxParser from .docx_parser import RAGFlowDocxParser as DocxParser
from .excel_parser import RAGFlowExcelParser as ExcelParser from .excel_parser import RAGFlowExcelParser as ExcelParser
from .ppt_parser import RAGFlowPptParser as PptParser
from .html_parser import RAGFlowHtmlParser as HtmlParser from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser from .json_parser import RAGFlowJsonParser as JsonParser
from .markdown_parser import MarkdownElementExtractor
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
from .pdf_parser import PlainParser
from .pdf_parser import RAGFlowPdfParser as PdfParser
from .ppt_parser import RAGFlowPptParser as PptParser
from .txt_parser import RAGFlowTxtParser as TxtParser from .txt_parser import RAGFlowTxtParser as TxtParser
__all__ = [ __all__ = [
@ -33,4 +35,6 @@ __all__ = [
"JsonParser", "JsonParser",
"MarkdownParser", "MarkdownParser",
"TxtParser", "TxtParser",
"MarkdownElementExtractor",
] ]

View File

@ -17,8 +17,10 @@
import re import re
import mistune
from markdown import markdown from markdown import markdown
class RAGFlowMarkdownParser: class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128): def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num) self.chunk_token_num = int(chunk_token_num)
@ -35,11 +37,11 @@ class RAGFlowMarkdownParser:
table_list.append(raw_table) table_list.append(raw_table)
if separate_tables: if separate_tables:
# Skip this match (i.e., remove it) # Skip this match (i.e., remove it)
new_text += working_text[last_end:match.start()] + "\n\n" new_text += working_text[last_end : match.start()] + "\n\n"
else: else:
# Replace with rendered HTML # Replace with rendered HTML
html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
new_text += working_text[last_end:match.start()] + html_table + "\n\n" new_text += working_text[last_end : match.start()] + html_table + "\n\n"
last_end = match.end() last_end = match.end()
new_text += working_text[last_end:] new_text += working_text[last_end:]
return new_text return new_text
@ -47,28 +49,32 @@ class RAGFlowMarkdownParser:
if "|" in markdown_text: # for optimize performance if "|" in markdown_text: # for optimize performance
# Standard Markdown table # Standard Markdown table
border_table_pattern = re.compile( border_table_pattern = re.compile(
r''' r"""
(?:\n|^) (?:\n|^)
(?:\|.*?\|.*?\|.*?\n) (?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+ (?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE) """,
re.VERBOSE,
)
working_text = replace_tables_with_rendered_html(border_table_pattern, tables) working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
# Borderless Markdown table # Borderless Markdown table
no_border_table_pattern = re.compile( no_border_table_pattern = re.compile(
r''' r"""
(?:\n|^) (?:\n|^)
(?:\S.*?\|.*?\n) (?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n) (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+ (?:\S.*?\|.*?\n)+
''', re.VERBOSE) """,
re.VERBOSE,
)
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables) working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
if "<table>" in working_text.lower(): # for optimize performance if "<table>" in working_text.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags # HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile( html_table_pattern = re.compile(
r''' r"""
(?:\n|^) (?:\n|^)
\s* \s*
(?: (?:
@ -83,9 +89,10 @@ class RAGFlowMarkdownParser:
) )
\s* \s*
(?=\n|$) (?=\n|$)
''', """,
re.VERBOSE | re.DOTALL | re.IGNORECASE re.VERBOSE | re.DOTALL | re.IGNORECASE,
) )
def replace_html_tables(): def replace_html_tables():
nonlocal working_text nonlocal working_text
new_text = "" new_text = ""
@ -94,9 +101,9 @@ class RAGFlowMarkdownParser:
raw_table = match.group() raw_table = match.group()
tables.append(raw_table) tables.append(raw_table)
if separate_tables: if separate_tables:
new_text += working_text[last_end:match.start()] + "\n\n" new_text += working_text[last_end : match.start()] + "\n\n"
else: else:
new_text += working_text[last_end:match.start()] + raw_table + "\n\n" new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
last_end = match.end() last_end = match.end()
new_text += working_text[last_end:] new_text += working_text[last_end:]
working_text = new_text working_text = new_text
@ -104,3 +111,163 @@ class RAGFlowMarkdownParser:
replace_html_tables() replace_html_tables()
return working_text, tables return working_text, tables
class MarkdownElementExtractor:
    """Split markdown text into self-contained block elements.

    Elements (headers, fenced code blocks, list blocks, blockquotes and
    plain text paragraphs) are extracted whole, so a downstream chunker
    never cuts one in half. Extraction itself is line-based; an mistune
    AST parse is kept on the instance for compatibility.
    """

    # Compiled once at class level: these patterns are matched against
    # (nearly) every line of the document inside the extraction loops.
    _HEADER_RE = re.compile(r"^#{1,6}\s+.*$")
    _ULIST_RE = re.compile(r"^\s*[-*+]\s+.*$")
    _OLIST_RE = re.compile(r"^\s*\d+\.\s+.*$")
    _ULIST_CONT_RE = re.compile(r"^\s{2,}[-*+]\s+.*$")
    _OLIST_CONT_RE = re.compile(r"^\s{2,}\d+\.\s+.*$")
    _INDENT_TEXT_RE = re.compile(r"^\s+\w+.*$")

    def __init__(self, markdown_content):
        """Store *markdown_content* and pre-split it into lines.

        Args:
            markdown_content: the full markdown document as one string.
        """
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")
        # NOTE(review): ast_parser/ast_nodes are not read by any method in
        # this class — kept only so the public attributes stay available
        # to external callers; confirm before removing.
        self.ast_parser = mistune.create_markdown(renderer="ast")
        self.ast_nodes = self.ast_parser(markdown_content)

    def _is_block_start(self, line):
        """Return True if *line* begins a non-text block element."""
        stripped = line.strip()
        return bool(
            self._HEADER_RE.match(line)
            or stripped.startswith("```")
            or self._ULIST_RE.match(line)
            or self._OLIST_RE.match(line)
            or stripped.startswith(">")
        )

    def extract_elements(self):
        """Extract individual elements (headers, code blocks, lists, etc.).

        Returns:
            list[str]: non-blank element strings in document order.
        """
        sections = []
        i = 0
        total = len(self.lines)
        while i < total:
            line = self.lines[i]
            if self._HEADER_RE.match(line):
                element = self._extract_header(i)
            elif line.strip().startswith("```"):
                element = self._extract_code_block(i)
            elif self._ULIST_RE.match(line) or self._OLIST_RE.match(line):
                element = self._extract_list_block(i)
            elif line.strip().startswith(">"):
                element = self._extract_blockquote(i)
            elif line.strip():
                # paragraphs and inline elements until the next block element
                element = self._extract_text_block(i)
            else:
                i += 1  # blank line between elements
                continue
            sections.append(element["content"])
            i = element["end_line"] + 1
        # drop elements that are pure whitespace
        return [section for section in sections if section.strip()]

    def _extract_header(self, start_pos):
        """A header is always a single line."""
        return {
            "type": "header",
            "content": self.lines[start_pos],
            "start_line": start_pos,
            "end_line": start_pos,
        }

    def _extract_code_block(self, start_pos):
        """Collect a fenced code block, including both ``` fence lines.

        If the closing fence is missing, the block runs to end of file.
        """
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]
        for i in range(start_pos + 1, len(self.lines)):
            content_lines.append(self.lines[i])
            end_pos = i
            if self.lines[i].strip().startswith("```"):
                break  # closing fence found
        return {
            "type": "code_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_list_block(self, start_pos):
        """Collect consecutive list items plus their continuations.

        After the first line, blank lines, indented nested items and
        indented continuation text all stay inside the block.
        """
        end_pos = start_pos
        content_lines = []
        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            is_item = self._ULIST_RE.match(line) or self._OLIST_RE.match(line)
            is_continuation = i > start_pos and (
                not line.strip()
                or self._ULIST_CONT_RE.match(line)
                or self._OLIST_CONT_RE.match(line)
                or self._INDENT_TEXT_RE.match(line)
            )
            if is_item or is_continuation:
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break
        return {
            "type": "list_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_blockquote(self, start_pos):
        """Collect consecutive '>' lines; interior blank lines are kept."""
        end_pos = start_pos
        content_lines = []
        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            if line.strip().startswith(">") or (i > start_pos and not line.strip()):
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break
        return {
            "type": "blockquote",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element.

        A blank line ends the block only when it directly precedes a
        block element; otherwise it is kept inside the paragraph run.
        """
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]
        i = start_pos + 1
        while i < len(self.lines):
            line = self.lines[i]
            if self._is_block_start(line):
                break
            if not line.strip() and i + 1 < len(self.lines) and self._is_block_start(self.lines[i + 1]):
                break  # blank line right before a block element ends the paragraph
            content_lines.append(line)
            end_pos = i
            i += 1
        return {
            "type": "text_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

View File

@ -30,7 +30,7 @@ from tika import parser
from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@ -350,17 +350,14 @@ class Markdown(MarkdownParser):
else: else:
with open(filename, "r") as f: with open(filename, "r") as f:
txt = f.read() txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables) remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
sections = []
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements()
sections = [(element, "") for element in element_sections]
tbls = [] tbls = []
for sec in remainder.split("\n"):
if sec.strip().find("#") == 0:
sections.append((sec, ""))
elif sections and sections[-1][0].strip().find("#") == 0:
sec_, _ = sections.pop(-1)
sections.append((sec_ + "\n" + sec, ""))
else:
sections.append((sec, ""))
for table in tables: for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls return sections, tbls