mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-08 20:42:30 +08:00
Feat: advanced markdown parsing (#9607)
### What problem does this PR solve? This PR uses AST-based parsing to handle markdown more accurately, preventing components from being cut off during chunking. #9564 <img width="1746" height="993" alt="image" src="https://github.com/user-attachments/assets/4aaf4bf6-5714-4d48-a9cf-864f59633f7f" /> <img width="1739" height="982" alt="image" src="https://github.com/user-attachments/assets/dc00233f-7a55-434f-bbb7-74ce7f57a6cf" /> <img width="559" height="100" alt="image" src="https://github.com/user-attachments/assets/4a556b5b-d9c6-4544-a486-8ac342bd504e" /> ### Type of change - [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -14,13 +14,15 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
#
|
#
|
||||||
|
|
||||||
from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
|
|
||||||
from .docx_parser import RAGFlowDocxParser as DocxParser
|
from .docx_parser import RAGFlowDocxParser as DocxParser
|
||||||
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
from .excel_parser import RAGFlowExcelParser as ExcelParser
|
||||||
from .ppt_parser import RAGFlowPptParser as PptParser
|
|
||||||
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
from .html_parser import RAGFlowHtmlParser as HtmlParser
|
||||||
from .json_parser import RAGFlowJsonParser as JsonParser
|
from .json_parser import RAGFlowJsonParser as JsonParser
|
||||||
|
from .markdown_parser import MarkdownElementExtractor
|
||||||
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
|
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
|
||||||
|
from .pdf_parser import PlainParser
|
||||||
|
from .pdf_parser import RAGFlowPdfParser as PdfParser
|
||||||
|
from .ppt_parser import RAGFlowPptParser as PptParser
|
||||||
from .txt_parser import RAGFlowTxtParser as TxtParser
|
from .txt_parser import RAGFlowTxtParser as TxtParser
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
@ -33,4 +35,6 @@ __all__ = [
|
|||||||
"JsonParser",
|
"JsonParser",
|
||||||
"MarkdownParser",
|
"MarkdownParser",
|
||||||
"TxtParser",
|
"TxtParser",
|
||||||
|
"MarkdownElementExtractor",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|||||||
@ -17,8 +17,10 @@
|
|||||||
|
|
||||||
import re
|
import re
|
||||||
|
|
||||||
|
import mistune
|
||||||
from markdown import markdown
|
from markdown import markdown
|
||||||
|
|
||||||
|
|
||||||
class RAGFlowMarkdownParser:
    """Parser for markdown documents.

    `chunk_token_num` bounds the size of produced chunks; it is stored as an
    int so string-valued configuration also works.
    """

    def __init__(self, chunk_token_num=128):
        # Coerce up front: config values may arrive as strings.
        self.chunk_token_num = int(chunk_token_num)
|
||||||
@ -38,7 +40,7 @@ class RAGFlowMarkdownParser:
|
|||||||
new_text += working_text[last_end : match.start()] + "\n\n"
|
new_text += working_text[last_end : match.start()] + "\n\n"
|
||||||
else:
|
else:
|
||||||
# Replace with rendered HTML
|
# Replace with rendered HTML
|
||||||
html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
|
html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
|
||||||
new_text += working_text[last_end : match.start()] + html_table + "\n\n"
|
new_text += working_text[last_end : match.start()] + html_table + "\n\n"
|
||||||
last_end = match.end()
|
last_end = match.end()
|
||||||
new_text += working_text[last_end:]
|
new_text += working_text[last_end:]
|
||||||
@ -47,28 +49,32 @@ class RAGFlowMarkdownParser:
|
|||||||
if "|" in markdown_text: # for optimize performance
|
if "|" in markdown_text: # for optimize performance
|
||||||
# Standard Markdown table
|
# Standard Markdown table
|
||||||
border_table_pattern = re.compile(
|
border_table_pattern = re.compile(
|
||||||
r'''
|
r"""
|
||||||
(?:\n|^)
|
(?:\n|^)
|
||||||
(?:\|.*?\|.*?\|.*?\n)
|
(?:\|.*?\|.*?\|.*?\n)
|
||||||
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
|
||||||
(?:\|.*?\|.*?\|.*?\n)+
|
(?:\|.*?\|.*?\|.*?\n)+
|
||||||
''', re.VERBOSE)
|
""",
|
||||||
|
re.VERBOSE,
|
||||||
|
)
|
||||||
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
|
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
|
||||||
|
|
||||||
# Borderless Markdown table
|
# Borderless Markdown table
|
||||||
no_border_table_pattern = re.compile(
|
no_border_table_pattern = re.compile(
|
||||||
r'''
|
r"""
|
||||||
(?:\n|^)
|
(?:\n|^)
|
||||||
(?:\S.*?\|.*?\n)
|
(?:\S.*?\|.*?\n)
|
||||||
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
|
||||||
(?:\S.*?\|.*?\n)+
|
(?:\S.*?\|.*?\n)+
|
||||||
''', re.VERBOSE)
|
""",
|
||||||
|
re.VERBOSE,
|
||||||
|
)
|
||||||
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
|
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
|
||||||
|
|
||||||
if "<table>" in working_text.lower(): # for optimize performance
|
if "<table>" in working_text.lower(): # for optimize performance
|
||||||
# HTML table extraction - handle possible html/body wrapper tags
|
# HTML table extraction - handle possible html/body wrapper tags
|
||||||
html_table_pattern = re.compile(
|
html_table_pattern = re.compile(
|
||||||
r'''
|
r"""
|
||||||
(?:\n|^)
|
(?:\n|^)
|
||||||
\s*
|
\s*
|
||||||
(?:
|
(?:
|
||||||
@ -83,9 +89,10 @@ class RAGFlowMarkdownParser:
|
|||||||
)
|
)
|
||||||
\s*
|
\s*
|
||||||
(?=\n|$)
|
(?=\n|$)
|
||||||
''',
|
""",
|
||||||
re.VERBOSE | re.DOTALL | re.IGNORECASE
|
re.VERBOSE | re.DOTALL | re.IGNORECASE,
|
||||||
)
|
)
|
||||||
|
|
||||||
def replace_html_tables():
|
def replace_html_tables():
|
||||||
nonlocal working_text
|
nonlocal working_text
|
||||||
new_text = ""
|
new_text = ""
|
||||||
@ -104,3 +111,163 @@ class RAGFlowMarkdownParser:
|
|||||||
replace_html_tables()
|
replace_html_tables()
|
||||||
|
|
||||||
return working_text, tables
|
return working_text, tables
|
||||||
|
|
||||||
|
|
||||||
|
class MarkdownElementExtractor:
    """Split markdown text into block-level elements.

    Scans the document line by line and groups lines into headers, fenced
    code blocks, list blocks, blockquotes, and plain text blocks, so that
    downstream chunking does not cut a block element in half.
    """

    def __init__(self, markdown_content):
        # Keep both the raw text and a line-indexed view for the scanners.
        self.markdown_content = markdown_content
        self.lines = markdown_content.split("\n")
        # AST built with mistune; retained on the instance for callers that
        # want structured access (the line scanners below do not use it).
        self.ast_parser = mistune.create_markdown(renderer="ast")
        self.ast_nodes = self.ast_parser(markdown_content)

    def extract_elements(self):
        """Extract individual elements (headers, code blocks, lists, etc.)"""
        pieces = []
        cursor = 0
        total = len(self.lines)
        while cursor < total:
            current = self.lines[cursor]
            if self._is_heading(current):
                block = self._extract_header(cursor)
            elif self._is_fence(current):
                block = self._extract_code_block(cursor)
            elif self._is_list_item(current):
                block = self._extract_list_block(cursor)
            elif self._is_quote(current):
                block = self._extract_blockquote(cursor)
            elif current.strip():
                # Any other non-blank line opens a plain text block.
                block = self._extract_text_block(cursor)
            else:
                # Blank line between blocks: just advance.
                cursor += 1
                continue
            pieces.append(block["content"])
            cursor = block["end_line"] + 1
        # Drop whitespace-only fragments.
        return [piece for piece in pieces if piece.strip()]

    # ---- line classifiers -------------------------------------------------

    @staticmethod
    def _is_heading(line):
        # ATX heading: one to six '#' followed by whitespace.
        return bool(re.match(r"^#{1,6}\s+.*$", line))

    @staticmethod
    def _is_fence(line):
        # Fenced-code delimiter (```), possibly indented.
        return line.strip().startswith("```")

    @staticmethod
    def _is_list_item(line):
        # Bullet (-, *, +) or ordered ("1.") list item, possibly indented.
        return bool(re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line))

    @staticmethod
    def _is_quote(line):
        return line.strip().startswith(">")

    def _is_block_start(self, line):
        # True when the line opens any non-text block element.
        return self._is_heading(line) or self._is_fence(line) or self._is_list_item(line) or self._is_quote(line)

    # ---- block extractors -------------------------------------------------

    def _extract_header(self, start_pos):
        # A header is always exactly one line.
        return {
            "type": "header",
            "content": self.lines[start_pos],
            "start_line": start_pos,
            "end_line": start_pos,
        }

    def _extract_code_block(self, start_pos):
        # Collect from the opening fence up to and including the closing
        # fence (or to end of document if the block is never closed).
        gathered = [self.lines[start_pos]]
        last = start_pos
        for idx in range(start_pos + 1, len(self.lines)):
            gathered.append(self.lines[idx])
            last = idx
            if self._is_fence(self.lines[idx]):
                break
        return {
            "type": "code_block",
            "content": "\n".join(gathered),
            "start_line": start_pos,
            "end_line": last,
        }

    def _extract_list_block(self, start_pos):
        # Consume consecutive list items plus blank lines and indented
        # continuation text that belong to the same list.
        gathered = []
        last = start_pos
        idx = start_pos
        while idx < len(self.lines):
            candidate = self.lines[idx]
            # After the first line, blank lines and indented wrapped text
            # also continue the list.
            is_continuation = idx > start_pos and (
                not candidate.strip() or bool(re.match(r"^\s+\w+.*$", candidate))
            )
            if not (self._is_list_item(candidate) or is_continuation):
                break
            gathered.append(candidate)
            last = idx
            idx += 1
        return {
            "type": "list_block",
            "content": "\n".join(gathered),
            "start_line": start_pos,
            "end_line": last,
        }

    def _extract_blockquote(self, start_pos):
        # A quote block is a run of '>' lines; interior blank lines are kept.
        gathered = []
        last = start_pos
        idx = start_pos
        while idx < len(self.lines):
            candidate = self.lines[idx]
            if not (self._is_quote(candidate) or (idx > start_pos and not candidate.strip())):
                break
            gathered.append(candidate)
            last = idx
            idx += 1
        return {
            "type": "blockquote",
            "content": "\n".join(gathered),
            "start_line": start_pos,
            "end_line": last,
        }

    def _extract_text_block(self, start_pos):
        """Extract a text block (paragraphs, inline elements) until next block element"""
        gathered = [self.lines[start_pos]]
        last = start_pos
        total = len(self.lines)
        idx = start_pos + 1
        while idx < total:
            candidate = self.lines[idx]
            if self._is_block_start(candidate):
                break
            if not candidate.strip() and idx + 1 < total and self._is_block_start(self.lines[idx + 1]):
                # A blank line immediately before a new block element ends
                # the paragraph; other blank lines stay inside it.
                break
            gathered.append(candidate)
            last = idx
            idx += 1
        return {
            "type": "text_block",
            "content": "\n".join(gathered),
            "start_line": start_pos,
            "end_line": last,
        }
||||||
|
|||||||
@ -30,7 +30,7 @@ from tika import parser
|
|||||||
|
|
||||||
from api.db import LLMType
|
from api.db import LLMType
|
||||||
from api.db.services.llm_service import LLMBundle
|
from api.db.services.llm_service import LLMBundle
|
||||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
|
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
|
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
|
||||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
|
||||||
@ -350,17 +350,14 @@ class Markdown(MarkdownParser):
|
|||||||
else:
|
else:
|
||||||
with open(filename, "r") as f:
|
with open(filename, "r") as f:
|
||||||
txt = f.read()
|
txt = f.read()
|
||||||
|
|
||||||
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
|
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
|
||||||
sections = []
|
|
||||||
|
extractor = MarkdownElementExtractor(txt)
|
||||||
|
element_sections = extractor.extract_elements()
|
||||||
|
sections = [(element, "") for element in element_sections]
|
||||||
|
|
||||||
tbls = []
|
tbls = []
|
||||||
for sec in remainder.split("\n"):
|
|
||||||
if sec.strip().find("#") == 0:
|
|
||||||
sections.append((sec, ""))
|
|
||||||
elif sections and sections[-1][0].strip().find("#") == 0:
|
|
||||||
sec_, _ = sections.pop(-1)
|
|
||||||
sections.append((sec_ + "\n" + sec, ""))
|
|
||||||
else:
|
|
||||||
sections.append((sec, ""))
|
|
||||||
for table in tables:
|
for table in tables:
|
||||||
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
||||||
return sections, tbls
|
return sections, tbls
|
||||||
|
|||||||
Reference in New Issue
Block a user