Fix: context loss caused by separating Markdown tables from original text (#8844)

### What problem does this PR solve?

Fix context loss caused by separating Markdown tables from the original
text. See #6871, #8804.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2025-07-15 13:03:01 +08:00
committed by GitHub
parent c08ed28f09
commit 51a8604dcb
2 changed files with 73 additions and 44 deletions

View File

@ -17,13 +17,33 @@
import re import re
from markdown import markdown
class RAGFlowMarkdownParser: class RAGFlowMarkdownParser:
def __init__(self, chunk_token_num=128): def __init__(self, chunk_token_num=128):
self.chunk_token_num = int(chunk_token_num) self.chunk_token_num = int(chunk_token_num)
def extract_tables_and_remainder(self, markdown_text): def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
tables = [] tables = []
remainder = markdown_text working_text = markdown_text
def replace_tables_with_rendered_html(pattern, table_list, render=True):
new_text = ""
last_end = 0
for match in pattern.finditer(working_text):
raw_table = match.group()
table_list.append(raw_table)
if separate_tables:
# Skip this match (i.e., remove it)
new_text += working_text[last_end:match.start()] + "\n\n"
else:
# Replace with rendered HTML
html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
new_text += working_text[last_end:match.start()] + html_table + "\n\n"
last_end = match.end()
new_text += working_text[last_end:]
return new_text
if "|" in markdown_text: # for optimize performance if "|" in markdown_text: # for optimize performance
# Standard Markdown table # Standard Markdown table
border_table_pattern = re.compile( border_table_pattern = re.compile(
@ -33,9 +53,7 @@ class RAGFlowMarkdownParser:
(?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n) (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
(?:\|.*?\|.*?\|.*?\n)+ (?:\|.*?\|.*?\|.*?\n)+
''', re.VERBOSE) ''', re.VERBOSE)
border_tables = border_table_pattern.findall(markdown_text) working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
tables.extend(border_tables)
remainder = border_table_pattern.sub('', remainder)
# Borderless Markdown table # Borderless Markdown table
no_border_table_pattern = re.compile( no_border_table_pattern = re.compile(
@ -45,11 +63,9 @@ class RAGFlowMarkdownParser:
(?:(?:\s*[:-]+[-| :]*\s*).*?\n) (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
(?:\S.*?\|.*?\n)+ (?:\S.*?\|.*?\n)+
''', re.VERBOSE) ''', re.VERBOSE)
no_border_tables = no_border_table_pattern.findall(remainder) working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
tables.extend(no_border_tables)
remainder = no_border_table_pattern.sub('', remainder)
if "<table>" in remainder.lower(): # for optimize performance if "<table>" in working_text.lower(): # for optimize performance
#HTML table extraction - handle possible html/body wrapper tags #HTML table extraction - handle possible html/body wrapper tags
html_table_pattern = re.compile( html_table_pattern = re.compile(
r''' r'''
@ -70,8 +86,21 @@ class RAGFlowMarkdownParser:
''', ''',
re.VERBOSE | re.DOTALL | re.IGNORECASE re.VERBOSE | re.DOTALL | re.IGNORECASE
) )
html_tables = html_table_pattern.findall(remainder) def replace_html_tables():
tables.extend(html_tables) nonlocal working_text
remainder = html_table_pattern.sub('', remainder) new_text = ""
last_end = 0
for match in html_table_pattern.finditer(working_text):
raw_table = match.group()
tables.append(raw_table)
if separate_tables:
new_text += working_text[last_end:match.start()] + "\n\n"
else:
new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
last_end = match.end()
new_text += working_text[last_end:]
working_text = new_text
return remainder, tables replace_html_tables()
return working_text, tables

View File

@ -323,14 +323,14 @@ class Markdown(MarkdownParser):
return images if images else None return images if images else None
def __call__(self, filename, binary=None): def __call__(self, filename, binary=None, separate_tables=True):
if binary: if binary:
encoding = find_codec(binary) encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore") txt = binary.decode(encoding, errors="ignore")
else: else:
with open(filename, "r") as f: with open(filename, "r") as f:
txt = f.read() txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n') remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
sections = [] sections = []
tbls = [] tbls = []
for sec in remainder.split("\n"): for sec in remainder.split("\n"):
@ -465,7 +465,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary) sections, tables = markdown_parser(filename, binary, separate_tables=False)
# Process images for each section # Process images for each section
section_images = [] section_images = []