Fix: context loss caused by separating markdown tables from original text (#8844)

### What problem does this PR solve?

Fixes context loss caused by separating markdown tables from the original
text. Resolves #6871 and #8804.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2025-07-15 13:03:01 +08:00
committed by GitHub
parent c08ed28f09
commit 51a8604dcb
2 changed files with 73 additions and 44 deletions

View File

@ -17,39 +17,55 @@
import re import re
from markdown import markdown
class RAGFlowMarkdownParser:
    """Split markdown text into (remainder, tables).

    Tables (bordered/borderless markdown tables and raw HTML tables) are
    collected into a list. Depending on ``separate_tables`` they are either
    removed from the remainder text or kept inline (rendered to HTML for
    markdown tables) so the surrounding context is not lost.
    """

    def __init__(self, chunk_token_num=128):
        # Target token budget per chunk; coerced to int so string config works.
        self.chunk_token_num = int(chunk_token_num)

    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
        """Extract all tables from *markdown_text*.

        Args:
            markdown_text: Raw markdown source.
            separate_tables: When True (default), tables are stripped from the
                returned text; when False, each markdown table is replaced
                in place by its rendered HTML (HTML tables stay as-is), which
                preserves context around the tables.

        Returns:
            Tuple ``(remainder_text, tables)`` where ``tables`` is a list of
            the raw matched table strings, in document order.
        """
        tables = []
        working_text = markdown_text

        def replace_tables_with_rendered_html(pattern, table_list, render=True):
            # Rebuild working_text piecewise: copy the text between matches,
            # and either drop each table (separate_tables=True) or splice in
            # its HTML rendering so the table stays in context.
            new_text = ""
            last_end = 0
            for match in pattern.finditer(working_text):
                raw_table = match.group()
                table_list.append(raw_table)
                if separate_tables:
                    # Skip this match (i.e., remove the table from the text).
                    new_text += working_text[last_end:match.start()] + "\n\n"
                else:
                    # Replace with rendered HTML so downstream chunking keeps context.
                    html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
                    new_text += working_text[last_end:match.start()] + html_table + "\n\n"
                last_end = match.end()
            new_text += working_text[last_end:]
            return new_text

        if "|" in markdown_text:  # cheap pre-check: no pipe char means no markdown table
            # Standard (bordered) markdown table: header row, separator row, 1+ data rows.
            border_table_pattern = re.compile(
                r'''
                (?:\n|^)
                (?:\|.*?\|.*?\|.*?\n)
                (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                (?:\|.*?\|.*?\|.*?\n)+
                ''', re.VERBOSE)
            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)

            # Borderless markdown table (rows without leading/trailing pipes).
            no_border_table_pattern = re.compile(
                r'''
                (?:\n|^)
                (?:\S.*?\|.*?\n)
                (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                (?:\S.*?\|.*?\n)+
                ''', re.VERBOSE)
            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)

        if "<table>" in working_text.lower():  # cheap pre-check before running the HTML regex
            # HTML table extraction — handle possible <html>/<body> wrapper tags.
            # NOTE(review): the pattern interior is elided in the diff this was
            # reconstructed from; this version matches <table>...</table> with
            # optional html/body wrappers — confirm against the upstream file.
            html_table_pattern = re.compile(
                r'''
                (?:\n|^)
                \s*
                (?:
                    (?:<html[^>]*>\s*<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>\s*</html>)
                    |
                    (?:<body[^>]*>\s*<table[^>]*>.*?</table>\s*</body>)
                    |
                    (?:<table[^>]*>.*?</table>)
                )
                \s*
                (?=\n|$)
                ''',
                re.VERBOSE | re.DOTALL | re.IGNORECASE
            )

            def replace_html_tables():
                # Same piecewise rebuild as above, but HTML tables are never
                # re-rendered — they are kept verbatim when not separated.
                nonlocal working_text
                new_text = ""
                last_end = 0
                for match in html_table_pattern.finditer(working_text):
                    raw_table = match.group()
                    tables.append(raw_table)
                    if separate_tables:
                        new_text += working_text[last_end:match.start()] + "\n\n"
                    else:
                        new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
                    last_end = match.end()
                new_text += working_text[last_end:]
                working_text = new_text

            replace_html_tables()

        return working_text, tables

View File

@ -22,7 +22,7 @@ from timeit import default_timer as timer
from docx import Document from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from markdown import markdown from markdown import markdown
from PIL import Image from PIL import Image
from tika import parser from tika import parser
@ -76,15 +76,15 @@ class Docx(DocxParser):
"""Get the hierarchical title structure before the table""" """Get the hierarchical title structure before the table"""
import re import re
from docx.text.paragraph import Paragraph from docx.text.paragraph import Paragraph
titles = [] titles = []
blocks = [] blocks = []
# Get document name from filename parameter # Get document name from filename parameter
doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename) doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
if not doc_name: if not doc_name:
doc_name = "Untitled Document" doc_name = "Untitled Document"
# Collect all document blocks while maintaining document order # Collect all document blocks while maintaining document order
try: try:
# Iterate through all paragraphs and tables in document order # Iterate through all paragraphs and tables in document order
@ -97,7 +97,7 @@ class Docx(DocxParser):
except Exception as e: except Exception as e:
logging.error(f"Error collecting blocks: {e}") logging.error(f"Error collecting blocks: {e}")
return "" return ""
# Find the target table position # Find the target table position
target_table_pos = -1 target_table_pos = -1
table_count = 0 table_count = 0
@ -107,20 +107,20 @@ class Docx(DocxParser):
target_table_pos = pos target_table_pos = pos
break break
table_count += 1 table_count += 1
if target_table_pos == -1: if target_table_pos == -1:
return "" # Target table not found return "" # Target table not found
# Find the nearest heading paragraph in reverse order # Find the nearest heading paragraph in reverse order
nearest_title = None nearest_title = None
for i in range(len(blocks)-1, -1, -1): for i in range(len(blocks)-1, -1, -1):
block_type, pos, block = blocks[i] block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table if pos >= target_table_pos: # Skip blocks after the table
continue continue
if block_type != 'p': if block_type != 'p':
continue continue
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try: try:
level_match = re.search(r"(\d+)", block.style.name) level_match = re.search(r"(\d+)", block.style.name)
@ -133,12 +133,12 @@ class Docx(DocxParser):
break break
except Exception as e: except Exception as e:
logging.error(f"Error parsing heading level: {e}") logging.error(f"Error parsing heading level: {e}")
if nearest_title: if nearest_title:
# Add current title # Add current title
titles.append(nearest_title) titles.append(nearest_title)
current_level = nearest_title[0] current_level = nearest_title[0]
# Find all parent headings, allowing cross-level search # Find all parent headings, allowing cross-level search
while current_level > 1: while current_level > 1:
found = False found = False
@ -146,17 +146,17 @@ class Docx(DocxParser):
block_type, pos, block = blocks[i] block_type, pos, block = blocks[i]
if pos >= target_table_pos: # Skip blocks after the table if pos >= target_table_pos: # Skip blocks after the table
continue continue
if block_type != 'p': if block_type != 'p':
continue continue
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
try: try:
level_match = re.search(r"(\d+)", block.style.name) level_match = re.search(r"(\d+)", block.style.name)
if level_match: if level_match:
level = int(level_match.group(1)) level = int(level_match.group(1))
# Find any heading with a higher level # Find any heading with a higher level
if level < current_level: if level < current_level:
title_text = block.text.strip() title_text = block.text.strip()
if title_text: # Avoid empty titles if title_text: # Avoid empty titles
titles.append((level, title_text)) titles.append((level, title_text))
@ -165,16 +165,16 @@ class Docx(DocxParser):
break break
except Exception as e: except Exception as e:
logging.error(f"Error parsing parent heading: {e}") logging.error(f"Error parsing parent heading: {e}")
if not found: # Break if no parent heading is found if not found: # Break if no parent heading is found
break break
# Sort by level (ascending, from highest to lowest) # Sort by level (ascending, from highest to lowest)
titles.sort(key=lambda x: x[0]) titles.sort(key=lambda x: x[0])
# Organize titles (from highest to lowest) # Organize titles (from highest to lowest)
hierarchy = [doc_name] + [t[1] for t in titles] hierarchy = [doc_name] + [t[1] for t in titles]
return " > ".join(hierarchy) return " > ".join(hierarchy)
return "" return ""
def __call__(self, filename, binary=None, from_page=0, to_page=100000): def __call__(self, filename, binary=None, from_page=0, to_page=100000):
@ -298,13 +298,13 @@ class Markdown(MarkdownParser):
text = sections[0] text = sections[0]
else: else:
return [] return []
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
html_content = markdown(text) html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser') soup = BeautifulSoup(html_content, 'html.parser')
html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')] html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
return html_images return html_images
def get_pictures(self, text): def get_pictures(self, text):
"""Download and open all images from markdown text.""" """Download and open all images from markdown text."""
import requests import requests
@ -320,17 +320,17 @@ class Markdown(MarkdownParser):
except Exception as e: except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}") logging.error(f"Failed to download/open image from {url}: {e}")
continue continue
return images if images else None return images if images else None
def __call__(self, filename, binary=None): def __call__(self, filename, binary=None, separate_tables=True):
if binary: if binary:
encoding = find_codec(binary) encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore") txt = binary.decode(encoding, errors="ignore")
else: else:
with open(filename, "r") as f: with open(filename, "r") as f:
txt = f.read() txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n') remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
sections = [] sections = []
tbls = [] tbls = []
for sec in remainder.split("\n"): for sec in remainder.split("\n"):
@ -465,8 +465,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE): elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.") callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128))) markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
sections, tables = markdown_parser(filename, binary) sections, tables = markdown_parser(filename, binary, separate_tables=False)
# Process images for each section # Process images for each section
section_images = [] section_images = []
for section_text, _ in sections: for section_text, _ in sections:
@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
section_images.append(combined_image) section_images.append(combined_image)
else: else:
section_images.append(None) section_images.append(None)
res = tokenize_table(tables, doc, is_english) res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.") callback(0.8, "Finish parsing.")
@ -524,7 +524,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"delimiter", "\n!?。;!?")) "delimiter", "\n!?。;!?"))
if kwargs.get("section_only", False): if kwargs.get("section_only", False):
return chunks return chunks
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images)) res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
else: else:
chunks = naive_merge( chunks = naive_merge(
@ -535,7 +535,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return chunks return chunks
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser)) res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
logging.info("naive_merge({}): {}".format(filename, timer() - st)) logging.info("naive_merge({}): {}".format(filename, timer() - st))
return res return res