From 121c51661d94843a4c33561976ce418b25fb8d84 Mon Sep 17 00:00:00 2001 From: Billy Bao Date: Wed, 5 Nov 2025 16:10:21 +0800 Subject: [PATCH] Fix: Markdown table extractor (#11018) ### What problem does this PR solve? Now the markdown table extractor supports HTML table tags that carry attributes (e.g. `<table ...>`). #10966 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) --- deepdoc/parser/markdown_parser.py | 11 +++++++++++ rag/app/naive.py | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py index 2f8d6ab97..cfcf0ae83 100644 --- a/deepdoc/parser/markdown_parser.py +++ b/deepdoc/parser/markdown_parser.py @@ -70,6 +70,17 @@ class RAGFlowMarkdownParser: ) working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables) + # Replace any TAGS e.g. <table ...> to <table>
+ TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"] + table_with_attributes_pattern = re.compile( + rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE + ) + def replace_tag(m): + tag_name = re.match(r"<(\w+)", m.group()).group(1) + return "<{}>".format(tag_name) + + working_text = re.sub(table_with_attributes_pattern, replace_tag, working_text) + + if "<table>" in working_text.lower(): # for optimize performance # HTML table extraction - handle possible html/body wrapper tags html_table_pattern = re.compile( diff --git a/rag/app/naive.py b/rag/app/naive.py index f2bfd565b..a96d947d8 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -509,11 +509,11 @@ class Markdown(MarkdownParser): txt = f.read() remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables) - + # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410. + # extractor = MarkdownElementExtractor(remainder) extractor = MarkdownElementExtractor(txt) element_sections = extractor.extract_elements(delimiter) sections = [(element, "") for element in element_sections] - tbls = [] for table in tables: tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))