diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index 2f8d6ab97..cfcf0ae83 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -70,6 +70,17 @@ class RAGFlowMarkdownParser:
)
working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
+ # Strip attributes from any opening tag listed in TAGS, e.g. <table border="1"> to <table>
+ TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
+ table_with_attributes_pattern = re.compile(
+ rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
+ )
+ def replace_tag(m):
+ tag_name = re.match(r"<(\w+)", m.group()).group(1)
+ return "<{}>".format(tag_name)
+
+ working_text = re.sub(table_with_attributes_pattern, replace_tag, working_text)
+
if "" in working_text.lower(): # for optimize performance
# HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile(
diff --git a/rag/app/naive.py b/rag/app/naive.py
index f2bfd565b..a96d947d8 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -509,11 +509,11 @@ class Markdown(MarkdownParser):
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
-
+ # To eliminate duplicate tables in the chunking result, use the commented-out extractor below (built from remainder) in place of the one built from txt, and set separate_tables to True in line 410.
+ # extractor = MarkdownElementExtractor(remainder)
extractor = MarkdownElementExtractor(txt)
element_sections = extractor.extract_elements(delimiter)
sections = [(element, "") for element in element_sections]
-
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))