diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py
index 1597ed081..809a56edf 100644
--- a/deepdoc/parser/__init__.py
+++ b/deepdoc/parser/__init__.py
@@ -14,13 +14,15 @@
 # limitations under the License.
 #
 
-from .pdf_parser import RAGFlowPdfParser as PdfParser, PlainParser
 from .docx_parser import RAGFlowDocxParser as DocxParser
 from .excel_parser import RAGFlowExcelParser as ExcelParser
-from .ppt_parser import RAGFlowPptParser as PptParser
 from .html_parser import RAGFlowHtmlParser as HtmlParser
 from .json_parser import RAGFlowJsonParser as JsonParser
+from .markdown_parser import MarkdownElementExtractor
 from .markdown_parser import RAGFlowMarkdownParser as MarkdownParser
+from .pdf_parser import PlainParser
+from .pdf_parser import RAGFlowPdfParser as PdfParser
+from .ppt_parser import RAGFlowPptParser as PptParser
 from .txt_parser import RAGFlowTxtParser as TxtParser
 
 __all__ = [
@@ -33,4 +35,6 @@ __all__ = [
     "JsonParser",
     "MarkdownParser",
     "TxtParser",
-]
\ No newline at end of file
+    "MarkdownElementExtractor",
+]
+
diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index 1db8557a6..0f39b2443 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -17,8 +17,10 @@
 
 import re
 
+import mistune
 from markdown import markdown
 
+
 class RAGFlowMarkdownParser:
     def __init__(self, chunk_token_num=128):
         self.chunk_token_num = int(chunk_token_num)
@@ -35,40 +37,44 @@ class RAGFlowMarkdownParser:
                 table_list.append(raw_table)
                 if separate_tables:
                     # Skip this match (i.e., remove it)
-                    new_text += working_text[last_end:match.start()] + "\n\n"
+                    new_text += working_text[last_end : match.start()] + "\n\n"
                 else:
                     # Replace with rendered HTML
-                    html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
-                    new_text += working_text[last_end:match.start()] + html_table + "\n\n"
+                    html_table = markdown(raw_table, extensions=["markdown.extensions.tables"]) if render else raw_table
+                    new_text += working_text[last_end : match.start()] + html_table + "\n\n"
                 last_end = match.end()
             new_text += working_text[last_end:]
             return new_text
 
-        if "|" in markdown_text: # for optimize performance
+        if "|" in markdown_text:  # for optimize performance
             # Standard Markdown table
             border_table_pattern = re.compile(
-                r'''
+                r"""
                 (?:\n|^)
                 (?:\|.*?\|.*?\|.*?\n)
                 (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                 (?:\|.*?\|.*?\|.*?\n)+
-            ''', re.VERBOSE)
+            """,
+                re.VERBOSE,
+            )
             working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
 
             # Borderless Markdown table
             no_border_table_pattern = re.compile(
-                r'''
+                r"""
                 (?:\n|^)
                 (?:\S.*?\|.*?\n)
                 (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                 (?:\S.*?\|.*?\n)+
-            ''', re.VERBOSE)
+            """,
+                re.VERBOSE,
+            )
             working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
 
-        if "<table>" in working_text.lower(): # for optimize performance
-            #HTML table extraction - handle possible html/body wrapper tags
+        if "<table>" in working_text.lower():  # for optimize performance
+            # HTML table extraction - handle possible html/body wrapper tags
             html_table_pattern = re.compile(
-                r'''
+                r"""
                 (?:\n|^)
                 \s*
                 (?:
@@ -83,9 +89,10 @@
                 )
                 \s*
                 (?=\n|$)
-            ''',
-            re.VERBOSE | re.DOTALL | re.IGNORECASE
+            """,
+                re.VERBOSE | re.DOTALL | re.IGNORECASE,
             )
+
             def replace_html_tables():
                 nonlocal working_text
                 new_text = ""
@@ -94,9 +101,9 @@
                     raw_table = match.group()
                     tables.append(raw_table)
                     if separate_tables:
-                        new_text += working_text[last_end:match.start()] + "\n\n"
+                        new_text += working_text[last_end : match.start()] + "\n\n"
                     else:
-                        new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
+                        new_text += working_text[last_end : match.start()] + raw_table + "\n\n"
                     last_end = match.end()
                 new_text += working_text[last_end:]
                 working_text = new_text
@@ -104,3 +111,163 @@
             replace_html_tables()
 
         return working_text, tables
+
+
+class MarkdownElementExtractor:
+    def __init__(self, markdown_content):
+        self.markdown_content = markdown_content
+        self.lines = markdown_content.split("\n")
+        self.ast_parser = mistune.create_markdown(renderer="ast")
+        self.ast_nodes = self.ast_parser(markdown_content)
+
+    def extract_elements(self):
+        """Extract individual elements (headers, code blocks, lists, etc.)"""
+        sections = []
+
+        i = 0
+        while i < len(self.lines):
+            line = self.lines[i]
+
+            if re.match(r"^#{1,6}\s+.*$", line):
+                # header
+                element = self._extract_header(i)
+                sections.append(element["content"])
+                i = element["end_line"] + 1
+            elif line.strip().startswith("```"):
+                # code block
+                element = self._extract_code_block(i)
+                sections.append(element["content"])
+                i = element["end_line"] + 1
+            elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
+                # list block
+                element = self._extract_list_block(i)
+                sections.append(element["content"])
+                i = element["end_line"] + 1
+            elif line.strip().startswith(">"):
+                # blockquote
+                element = self._extract_blockquote(i)
+                sections.append(element["content"])
+                i = element["end_line"] + 1
+            elif line.strip():
+                # text block (paragraphs and inline elements until next block element)
+                element = self._extract_text_block(i)
+                sections.append(element["content"])
+                i = element["end_line"] + 1
+            else:
+                i += 1
+
+        sections = [section for section in sections if section.strip()]
+        return sections
+
+    def _extract_header(self, start_pos):
+        return {
+            "type": "header",
+            "content": self.lines[start_pos],
+            "start_line": start_pos,
+            "end_line": start_pos,
+        }
+
+    def _extract_code_block(self, start_pos):
+        end_pos = start_pos
+        content_lines = [self.lines[start_pos]]
+
+        # Find the end of the code block
+        for i in range(start_pos + 1, len(self.lines)):
+            content_lines.append(self.lines[i])
+            end_pos = i
+            if self.lines[i].strip().startswith("```"):
+                break
+
+        return {
+            "type": "code_block",
+            "content": "\n".join(content_lines),
+            "start_line": start_pos,
+            "end_line": end_pos,
+        }
+
+    def _extract_list_block(self, start_pos):
+        end_pos = start_pos
+        content_lines = []
+
+        i = start_pos
+        while i < len(self.lines):
+            line = self.lines[i]
+            # check if this line is a list item or continuation of a list
+            if (
+                re.match(r"^\s*[-*+]\s+.*$", line)
+                or re.match(r"^\s*\d+\.\s+.*$", line)
+                or (i > start_pos and not line.strip())
+                or (i > start_pos and re.match(r"^\s{2,}[-*+]\s+.*$", line))
+                or (i > start_pos and re.match(r"^\s{2,}\d+\.\s+.*$", line))
+                or (i > start_pos and re.match(r"^\s+\w+.*$", line))
+            ):
+                content_lines.append(line)
+                end_pos = i
+                i += 1
+            else:
+                break
+
+        return {
+            "type": "list_block",
+            "content": "\n".join(content_lines),
+            "start_line": start_pos,
+            "end_line": end_pos,
+        }
+
+    def _extract_blockquote(self, start_pos):
+        end_pos = start_pos
+        content_lines = []
+
+        i = start_pos
+        while i < len(self.lines):
+            line = self.lines[i]
+            if line.strip().startswith(">") or (i > start_pos and not line.strip()):
+                content_lines.append(line)
+                end_pos = i
+                i += 1
+            else:
+                break
+
+        return {
+            "type": "blockquote",
+            "content": "\n".join(content_lines),
+            "start_line": start_pos,
+            "end_line": end_pos,
+        }
+
+    def _extract_text_block(self, start_pos):
+        """Extract a text block (paragraphs, inline elements) until next block element"""
+        end_pos = start_pos
+        content_lines = [self.lines[start_pos]]
+
+        i = start_pos + 1
+        while i < len(self.lines):
+            line = self.lines[i]
+            # stop if we encounter a block element
+            if re.match(r"^#{1,6}\s+.*$", line) or line.strip().startswith("```") or re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line) or line.strip().startswith(">"):
+                break
+            elif not line.strip():
+                # check if the next line is a block element
+                if i + 1 < len(self.lines) and (
+                    re.match(r"^#{1,6}\s+.*$", self.lines[i + 1])
+                    or self.lines[i + 1].strip().startswith("```")
+                    or re.match(r"^\s*[-*+]\s+.*$", self.lines[i + 1])
+                    or re.match(r"^\s*\d+\.\s+.*$", self.lines[i + 1])
+                    or self.lines[i + 1].strip().startswith(">")
+                ):
+                    break
+                else:
+                    content_lines.append(line)
+                    end_pos = i
+                    i += 1
+            else:
+                content_lines.append(line)
+                end_pos = i
+                i += 1
+
+        return {
+            "type": "text_block",
+            "content": "\n".join(content_lines),
+            "start_line": start_pos,
+            "end_line": end_pos,
+        }
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 8c0708173..a3291b1d6 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -30,7 +30,7 @@ from tika import parser
 
 from api.db import LLMType
 from api.db.services.llm_service import LLMBundle
-from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownParser, PdfParser, TxtParser
+from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
 from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_figure_data_wrapper
 from deepdoc.parser.pdf_parser import PlainParser, VisionParser
 from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table
@@ -350,17 +350,14 @@ class Markdown(MarkdownParser):
         else:
             with open(filename, "r") as f:
                 txt = f.read()
+
         remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
-        sections = []
+
+        extractor = MarkdownElementExtractor(txt)
+        element_sections = extractor.extract_elements()
+        sections = [(element, "") for element in element_sections]
+
         tbls = []
-        for sec in remainder.split("\n"):
-            if sec.strip().find("#") == 0:
-                sections.append((sec, ""))
-            elif sections and sections[-1][0].strip().find("#") == 0:
-                sec_, _ = sections.pop(-1)
-                sections.append((sec_ + "\n" + sec, ""))
-            else:
-                sections.append((sec, ""))
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
         return sections, tbls