Feat: advanced markdown parsing (#9607)

### What problem does this PR solve?

Use AST-based parsing to handle markdown more accurately, preventing block
elements (headers, code blocks, lists, blockquotes) from being cut off
mid-element during chunking. Fixes #9564.

<img width="1746" height="993" alt="image"
src="https://github.com/user-attachments/assets/4aaf4bf6-5714-4d48-a9cf-864f59633f7f"
/>

<img width="1739" height="982" alt="image"
src="https://github.com/user-attachments/assets/dc00233f-7a55-434f-bbb7-74ce7f57a6cf"
/>

<img width="559" height="100" alt="image"
src="https://github.com/user-attachments/assets/4a556b5b-d9c6-4544-a486-8ac342bd504e"
/>


### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Yongteng Lei
2025-08-21 09:36:18 +08:00
committed by GitHub
parent 4080f6a54a
commit 382458ace7
3 changed files with 196 additions and 28 deletions

View File

@ -14,13 +14,15 @@
# limitations under the License. # limitations under the License.
# #
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
from .docx_parser import RAGFlowDocxParser as DocxParser from .docx_parser import RAGFlowDocxParser as DocxParser
from .excel_parser import RAGFlowExcelParser as ExcelParser from .excel_parser import RAGFlowExcelParser as ExcelParser
from .ppt_parser import RAGFlowPptParser as PptParser
from .html_parser import RAGFlowHtmlParser as HtmlParser from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser from .json_parser import RAGFlowJsonParser as JsonParser
from .markdown_parser import MarkdownElementExtractor
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
from .pdf_parser import PlainParser
from .pdf_parser import RAGFlowPdfParser as PdfParser
from .ppt_parser import RAGFlowPptParser as PptParser
from .txt_parser import RAGFlowTxtParser as TxtParser from .txt_parser import RAGFlowTxtParser as TxtParser
__all__ = [ __all__ = [
@ -33,4 +35,6 @@ __all__ = [
"JsonParser", "JsonParser",
"MarkdownParser", "MarkdownParser",
"TxtParser", "TxtParser",
"MarkdownElementExtractor",
] ]

View File

@ -17,8 +17,10 @@
import re import re
import mistune
from markdown import markdown from markdown import markdown
class RAGFlowMarkdownParser: class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128): def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num) self.chunk_token_num = int(chunk_token_num)
@ -35,11 +37,11 @@ class RAGFlowMarkdownParser:
table_list.append(raw_table) table_list.append(raw_table)
if separate_tables: if separate_tables:
# Skip this match (i.e., remove it) # Skip this match (i.e., remove it)
new_text += working_text[last_end:match.start()] + "\n\n" new_text += working_text[last_end : match.start()] + "\n\n"
else: else:
# Replace with rendered HTML # Replace with rendered HTML
html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
new_text += working_text[last_end:match.start()] + html_table + "\n\n" new_text += working_text[last_end : match.start()] + html_table + "\n\n"
last_end = match.end() last_end = match.end()
new_text += working_text[last_end:] new_text += working_text[last_end:]
return new_text return new_text
@ -47,28 +49,32 @@ class RAGFlowMarkdownParser:
if "|" in markdown_text: # for optimize performance if "|" in markdown_text: # for optimize performance
# Standard Markdown table # Standard Markdown table
border_table_pattern = re.compile( border_table_pattern = re.compile(
r''' r"""
(?:\n|^) (?:\n|^)
(?:\|.*?\|.*?\|.*?\n) (?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+ (?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE) """,
re.VERBOSE,
)
working_text = replace_tables_with_rendered_html(border_table_pattern, tables) working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
# Borderless Markdown table # Borderless Markdown table
no_border_table_pattern = re.compile( no_border_table_pattern = re.compile(
r''' r"""
(?:\n|^) (?:\n|^)
(?:\S.*?\|.*?\n) (?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n) (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+ (?:\S.*?\|.*?\n)+
''', re.VERBOSE) """,
re.VERBOSE,
)
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables) working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
if "<table>" in working_text.lower(): # for optimize performance if "<table>" in working_text.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags # HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile( html_table_pattern = re.compile(
r''' r"""
(?:\n|^) (?:\n|^)
\s* \s*
(?: (?:
@ -83,9 +89,10 @@ class RAGFlowMarkdownParser:
) )
\s* \s*
(?=\n|$) (?=\n|$)
''', """,
re.VERBOSE | re.DOTALL | re.IGNORECASE re.VERBOSE | re.DOTALL | re.IGNORECASE,
) )
def replace_html_tables(): def replace_html_tables():
nonlocal working_text nonlocal working_text
new_text = "" new_text = ""
@ -94,9 +101,9 @@ class RAGFlowMarkdownParser:
raw_table = match.group() raw_table = match.group()
tables.append(raw_table) tables.append(raw_table)
if separate_tables: if separate_tables:
new_text += working_text[last_end:match.start()] + "\n\n" new_text += working_text[last_end : match.start()] + "\n\n"
else: else:
new_text += working_text[last_end:match.start()] + raw_table + "\n\n" new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
last_end = match.end() last_end = match.end()
new_text += working_text[last_end:] new_text += working_text[last_end:]
working_text = new_text working_text = new_text
@ -104,3 +111,163 @@ class RAGFlowMarkdownParser:
replace_html_tables() replace_html_tables()
return working_text, tables return working_text, tables
class MarkdownElementExtractor:
    """Split markdown text into self-contained block elements.

    Elements (headers, fenced code blocks, list blocks, blockquotes and
    plain text paragraphs) are extracted whole, so a downstream chunker
    never cuts one in half. Extraction itself is line-based; an mistune
    AST parse is kept on the instance for compatibility.
    """

    # Compiled once at class level: these patterns are matched against
    # (nearly) every line of the document inside the extraction loops.
    _HEADER_RE = re.compile(r"^#{1,6}\s+.*$")
    _ULIST_RE = re.compile(r"^\s*[-*+]\s+.*$")
    _OLIST_RE = re.compile(r"^\s*\d+\.\s+.*$")
    _ULIST_CONT_RE = re.compile(r"^\s{2,}[-*+]\s+.*$")
    _OLIST_CONT_RE = re.compile(r"^\s{2,}\d+\.\s+.*$")
    _INDENT_TEXT_RE = re.compile(r"^\s+\w+.*$")

    def __init__(self, markdown_content):
        """Store *markdown_content* and pre-split it into lines.

        Args:
            markdown_content: the full markdown document as one string.
        """
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")
        # NOTE(review): ast_parser/ast_nodes are not read by any method in
        # this class — kept only so the public attributes stay available
        # to external callers; confirm before removing.
        self.ast_parser = mistune.create_markdown(renderer="ast")
        self.ast_nodes = self.ast_parser(markdown_content)

    def _is_block_start(self, line):
        """Return True if *line* begins a non-text block element."""
        stripped = line.strip()
        return bool(
            self._HEADER_RE.match(line)
            or stripped.startswith("```")
            or self._ULIST_RE.match(line)
            or self._OLIST_RE.match(line)
            or stripped.startswith(">")
        )

    def extract_elements(self):
        """Extract individual elements (headers, code blocks, lists, etc.).

        Returns:
            list[str]: non-blank element strings in document order.
        """
        sections = []
        i = 0
        total = len(self.lines)
        while i < total:
            line = self.lines[i]
            if self._HEADER_RE.match(line):
                element = self._extract_header(i)
            elif line.strip().startswith("```"):
                element = self._extract_code_block(i)
            elif self._ULIST_RE.match(line) or self._OLIST_RE.match(line):
                element = self._extract_list_block(i)
            elif line.strip().startswith(">"):
                element = self._extract_blockquote(i)
            elif line.strip():
                # paragraphs and inline elements until the next block element
                element = self._extract_text_block(i)
            else:
                i += 1  # blank line between elements
                continue
            sections.append(element["content"])
            i = element["end_line"] + 1
        # drop elements that are pure whitespace
        return [section for section in sections if section.strip()]

    def _extract_header(self, start_pos):
        """A header is always a single line."""
        return {
            "type": "header",
            "content": self.lines[start_pos],
            "start_line": start_pos,
            "end_line": start_pos,
        }

    def _extract_code_block(self, start_pos):
        """Collect a fenced code block, including both ``` fence lines.

        If the closing fence is missing, the block runs to end of file.
        """
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]
        for i in range(start_pos + 1, len(self.lines)):
            content_lines.append(self.lines[i])
            end_pos = i
            if self.lines[i].strip().startswith("```"):
                break  # closing fence found
        return {
            "type": "code_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_list_block(self, start_pos):
        """Collect consecutive list items plus their continuations.

        After the first line, blank lines, indented nested items and
        indented continuation text all stay inside the block.
        """
        end_pos = start_pos
        content_lines = []
        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            is_item = self._ULIST_RE.match(line) or self._OLIST_RE.match(line)
            is_continuation = i > start_pos and (
                not line.strip()
                or self._ULIST_CONT_RE.match(line)
                or self._OLIST_CONT_RE.match(line)
                or self._INDENT_TEXT_RE.match(line)
            )
            if is_item or is_continuation:
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break
        return {
            "type": "list_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_blockquote(self, start_pos):
        """Collect consecutive '>' lines; interior blank lines are kept."""
        end_pos = start_pos
        content_lines = []
        i = start_pos
        while i < len(self.lines):
            line = self.lines[i]
            if line.strip().startswith(">") or (i > start_pos and not line.strip()):
                content_lines.append(line)
                end_pos = i
                i += 1
            else:
                break
        return {
            "type": "blockquote",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element.

        A blank line ends the block only when it directly precedes a
        block element; otherwise it is kept inside the paragraph run.
        """
        end_pos = start_pos
        content_lines = [self.lines[start_pos]]
        i = start_pos + 1
        while i < len(self.lines):
            line = self.lines[i]
            if self._is_block_start(line):
                break
            if not line.strip() and i + 1 < len(self.lines) and self._is_block_start(self.lines[i + 1]):
                break  # blank line right before a block element ends the paragraph
            content_lines.append(line)
            end_pos = i
            i += 1
        return {
            "type": "text_block",
            "content": "\n".join(content_lines),
            "start_line": start_pos,
            "end_line": end_pos,
        }

View File

@ -30,7 +30,7 @@ from tika import parser
from api.db import LLMType from api.db import LLMType
from api.db.services.llm_service import LLMBundle from api.db.services.llm_service import LLMBundle
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@ -350,17 +350,14 @@ class Markdown(MarkdownParser):
else: else:
with open(filename, "r") as f: with open(filename, "r") as f:
txt = f.read() txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables) remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
sections = []
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements()
sections = [(element, "") for element in element_sections]
tbls = [] tbls = []
for sec in remainder.split("\n"):
if sec.strip().find("#") == 0:
sections.append((sec, ""))
elif sections and sections[-1][0].strip().find("#") == 0:
sec_, _ = sections.pop(-1)
sections.append((sec_ + "\n" + sec, ""))
else:
sections.append((sec, ""))
for table in tables: for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls return sections, tbls