diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index 1597ed081..809a56edf 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -14,13 +14,15 @@
# limitations under the License.
#
-from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
from .docx_parser import RAGFlowDocxParser as DocxParser
from .excel_parser import RAGFlowExcelParser as ExcelParser
-from .ppt_parser import RAGFlowPptParser as PptParser
from .html_parser import RAGFlowHtmlParser as HtmlParser
from .json_parser import RAGFlowJsonParser as JsonParser
+from .markdown_parser import MarkdownElementExtractor
from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .pdf_parser import PlainParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser
+from .ppt_parser import RAGFlowPptParser as PptParser
from .txt_parser import RAGFlowTxtParser as TxtParser
__all__ = [
@@ -33,4 +35,6 @@ __all__ = [
"JsonParser",
"MarkdownParser",
"TxtParser",
-]
\ No newline at end of file
+ "MarkdownElementExtractor",
+]
+
diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index 1db8557a6..0f39b2443 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -17,8 +17,10 @@
import re
+import mistune
from markdown import markdown
+
class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num)
@@ -35,40 +37,44 @@ class RAGFlowMarkdownParser:
table_list.append(raw_table)
if separate_tables:
# Skip this match (i.e., remove it)
- new_text += working_text[last_end:match.start()] + "\n\n"
+ new_text += working_text[last_end : match.start()] + "\n\n"
else:
# Replace with rendered HTML
- html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
- new_text += working_text[last_end:match.start()] + html_table + "\n\n"
+ html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
+ new_text += working_text[last_end : match.start()] + html_table + "\n\n"
last_end = match.end()
new_text += working_text[last_end:]
return new_text
- if "|" in markdown_text: # for optimize performance
+ if "|" in markdown_text: # for optimize performance
# Standard Markdown table
border_table_pattern = re.compile(
- r'''
+ r"""
(?:\n|^)
(?:\|.*?\|.*?\|.*?\n)
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+
- ''', re.VERBOSE)
+ """,
+ re.VERBOSE,
+ )
working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
# Borderless Markdown table
no_border_table_pattern = re.compile(
- r'''
+ r"""
(?:\n|^)
(?:\S.*?\|.*?\n)
(?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+
- ''', re.VERBOSE)
+ """,
+ re.VERBOSE,
+ )
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
-        if "<table>" in working_text.lower(): # for optimize performance
-            #HTML table extraction - handle possible html/body wrapper tags
+        if "<table>" in working_text.lower():  # for optimize performance
+            # HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
- r'''
+ r"""
(?:\n|^)
\s*
(?:
@@ -83,9 +89,10 @@ class RAGFlowMarkdownParser:
)
\s*
(?=\n|$)
- ''',
- re.VERBOSE | re.DOTALL | re.IGNORECASE
+ """,
+ re.VERBOSE | re.DOTALL | re.IGNORECASE,
)
+
def replace_html_tables():
nonlocal working_text
new_text = ""
@@ -94,9 +101,9 @@ class RAGFlowMarkdownParser:
raw_table = match.group()
tables.append(raw_table)
if separate_tables:
- new_text += working_text[last_end:match.start()] + "\n\n"
+ new_text += working_text[last_end : match.start()] + "\n\n"
else:
- new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
+ new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
last_end = match.end()
new_text += working_text[last_end:]
working_text = new_text
@@ -104,3 +111,163 @@ class RAGFlowMarkdownParser:
replace_html_tables()
return working_text, tables
+
+
+class MarkdownElementExtractor:
+ def __init__(self, markdown_content):
+ self.markdown_content = markdown_content
+ self.lines = markdown_content.split("\n")
+ self.ast_parser = mistune.create_markdown(renderer="ast")
+ self.ast_nodes = self.ast_parser(markdown_content)
+
+ def extract_elements(self):
+ """Extract individual elements (headers, code blocks, lists, etc.)"""
+ sections = []
+
+ i = 0
+ while i < len(self.lines):
+ line = self.lines[i]
+
+ if re.match(r"^#{1,6}\s+.*$", line):
+ # header
+ element = self._extract_header(i)
+ sections.append(element["content"])
+ i = element["end_line"] + 1
+ elif line.strip().startswith("```"):
+ # code block
+ element = self._extract_code_block(i)
+ sections.append(element["content"])
+ i = element["end_line"] + 1
+ elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
+ # list block
+ element = self._extract_list_block(i)
+ sections.append(element["content"])
+ i = element["end_line"] + 1
+ elif line.strip().startswith(">"):
+ # blockquote
+ element = self._extract_blockquote(i)
+ sections.append(element["content"])
+ i = element["end_line"] + 1
+ elif line.strip():
+ # text block (paragraphs and inline elements until next block element)
+ element = self._extract_text_block(i)
+ sections.append(element["content"])
+ i = element["end_line"] + 1
+ else:
+ i += 1
+
+ sections = [section for section in sections if section.strip()]
+ return sections
+
+ def _extract_header(self, start_pos):
+ return {
+ "type": "header",
+ "content": self.lines[start_pos],
+ "start_line": start_pos,
+ "end_line": start_pos,
+ }
+
+ def _extract_code_block(self, start_pos):
+ end_pos = start_pos
+ content_lines = [self.lines[start_pos]]
+
+ # Find the end of the code block
+ for i in range(start_pos + 1, len(self.lines)):
+ content_lines.append(self.lines[i])
+ end_pos = i
+ if self.lines[i].strip().startswith("```"):
+ break
+
+ return {
+ "type": "code_block",
+ "content": "\n".join(content_lines),
+ "start_line": start_pos,
+ "end_line": end_pos,
+ }
+
+ def _extract_list_block(self, start_pos):
+ end_pos = start_pos
+ content_lines = []
+
+ i = start_pos
+ while i < len(self.lines):
+ line = self.lines[i]
+ # check if this line is a list item or continuation of a list
+ if (
+ re.match(r"^\s*[-*+]\s+.*$", line)
+ or re.match(r"^\s*\d+\.\s+.*$", line)
+ or (i > start_pos and not line.strip())
+ or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
+ or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
+ or (i > start_pos and re.match(r"^\s+\w+.*$", line))
+ ):
+ content_lines.append(line)
+ end_pos = i
+ i += 1
+ else:
+ break
+
+ return {
+ "type": "list_block",
+ "content": "\n".join(content_lines),
+ "start_line": start_pos,
+ "end_line": end_pos,
+ }
+
+ def _extract_blockquote(self, start_pos):
+ end_pos = start_pos
+ content_lines = []
+
+ i = start_pos
+ while i < len(self.lines):
+ line = self.lines[i]
+ if line.strip().startswith(">") or (i > start_pos and not line.strip()):
+ content_lines.append(line)
+ end_pos = i
+ i += 1
+ else:
+ break
+
+ return {
+ "type": "blockquote",
+ "content": "\n".join(content_lines),
+ "start_line": start_pos,
+ "end_line": end_pos,
+ }
+
+ def _extract_text_block(self, start_pos):
+ """Extract a text block (paragraphs, inline elements) until next block element"""
+ end_pos = start_pos
+ content_lines = [self.lines[start_pos]]
+
+ i = start_pos + 1
+ while i < len(self.lines):
+ line = self.lines[i]
+ # stop if we encounter a block element
+ if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
+ break
+ elif not line.strip():
+ # check if the next line is a block element
+ if i + 1 < len(self.lines) and (
+ re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
+ or self.lines[i + 1].strip().startswith("```")
+ or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
+ or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
+ or self.lines[i + 1].strip().startswith(">")
+ ):
+ break
+ else:
+ content_lines.append(line)
+ end_pos = i
+ i += 1
+ else:
+ content_lines.append(line)
+ end_pos = i
+ i += 1
+
+ return {
+ "type": "text_block",
+ "content": "\n".join(content_lines),
+ "start_line": start_pos,
+ "end_line": end_pos,
+ }
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 8c0708173..a3291b1d6 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -30,7 +30,7 @@ from tika import parser
from api.db import LLMType
from api.db.services.llm_service import LLMBundle
-from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
+from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@@ -350,17 +350,14 @@ class Markdown(MarkdownParser):
else:
with open(filename, "r") as f:
txt = f.read()
+
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
- sections = []
+
+ extractor = MarkdownElementExtractor(txt)
+ element_sections = extractor.extract_elements()
+ sections = [(element, "") for element in element_sections]
+
tbls = []
- for sec in remainder.split("\n"):
- if sec.strip().find("#") == 0:
- sections.append((sec, ""))
- elif sections and sections[-1][0].strip().find("#") == 0:
- sec_, _ = sections.pop(-1)
- sections.append((sec_ + "\n" + sec, ""))
- else:
- sections.append((sec, ""))
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
return sections, tbls