Mirror of https://github.com/infiniflow/ragflow.git
Fix: context loss caused by separating markdown tables from original text (#8844)
### What problem does this PR solve?

Fixes context loss caused by separating markdown tables from the original text. Fixes #6871 and #8804.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
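A minimal sketch of the behavioral difference (module path taken from this repo's layout; the sample document is made up):

```python
from deepdoc.parser.markdown_parser import RAGFlowMarkdownParser

md = "Quarterly results:\n\n| Q | Revenue |\n|---|---------|\n| 1 | 10 |\n\nRevenue doubled in Q2.\n"
parser = RAGFlowMarkdownParser()

# Default (previous) behavior: the table is cut out of the remainder, so the
# chunker later sees "Quarterly results:" and "Revenue doubled in Q2." with
# nothing in between -- the context loss this PR addresses.
remainder, tables = parser.extract_tables_and_remainder(md, separate_tables=True)

# New opt-in behavior: the table stays at its original position, rendered to
# HTML, so downstream chunking keeps it next to its surrounding prose.
inline_text, tables = parser.extract_tables_and_remainder(md, separate_tables=False)
```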
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -17,39 +17,55 @@
 import re
 
+from markdown import markdown
 
 class RAGFlowMarkdownParser:
     def __init__(self, chunk_token_num=128):
         self.chunk_token_num = int(chunk_token_num)
 
-    def extract_tables_and_remainder(self, markdown_text):
+    def extract_tables_and_remainder(self, markdown_text, separate_tables=True):
         tables = []
-        remainder = markdown_text
+        working_text = markdown_text
+
+        def replace_tables_with_rendered_html(pattern, table_list, render=True):
+            new_text = ""
+            last_end = 0
+            for match in pattern.finditer(working_text):
+                raw_table = match.group()
+                table_list.append(raw_table)
+                if separate_tables:
+                    # Skip this match (i.e., remove it)
+                    new_text += working_text[last_end:match.start()] + "\n\n"
+                else:
+                    # Replace with rendered HTML
+                    html_table = markdown(raw_table, extensions=['markdown.extensions.tables']) if render else raw_table
+                    new_text += working_text[last_end:match.start()] + html_table + "\n\n"
+                last_end = match.end()
+            new_text += working_text[last_end:]
+            return new_text
 
         if "|" in markdown_text: # for optimize performance
             # Standard Markdown table
             border_table_pattern = re.compile(
                 r'''
                 (?:\n|^)
                 (?:\|.*?\|.*?\|.*?\n)
                 (?:\|(?:\s*[:-]+[-| :]*\s*)\|.*?\n)
                 (?:\|.*?\|.*?\|.*?\n)+
                 ''', re.VERBOSE)
-            border_tables = border_table_pattern.findall(markdown_text)
-            tables.extend(border_tables)
-            remainder = border_table_pattern.sub('', remainder)
+            working_text = replace_tables_with_rendered_html(border_table_pattern, tables)
 
             # Borderless Markdown table
             no_border_table_pattern = re.compile(
                 r'''
                 (?:\n|^)
                 (?:\S.*?\|.*?\n)
                 (?:(?:\s*[:-]+[-| :]*\s*).*?\n)
                 (?:\S.*?\|.*?\n)+
                 ''', re.VERBOSE)
-            no_border_tables = no_border_table_pattern.findall(remainder)
-            tables.extend(no_border_tables)
-            remainder = no_border_table_pattern.sub('', remainder)
+            working_text = replace_tables_with_rendered_html(no_border_table_pattern, tables)
 
-        if "<table>" in remainder.lower(): # for optimize performance
+        if "<table>" in working_text.lower(): # for optimize performance
             #HTML table extraction - handle possible html/body wrapper tags
             html_table_pattern = re.compile(
                 r'''
@@ -70,8 +86,21 @@ class RAGFlowMarkdownParser:
                 ''',
                 re.VERBOSE | re.DOTALL | re.IGNORECASE
             )
-            html_tables = html_table_pattern.findall(remainder)
-            tables.extend(html_tables)
-            remainder = html_table_pattern.sub('', remainder)
+            def replace_html_tables():
+                nonlocal working_text
+                new_text = ""
+                last_end = 0
+                for match in html_table_pattern.finditer(working_text):
+                    raw_table = match.group()
+                    tables.append(raw_table)
+                    if separate_tables:
+                        new_text += working_text[last_end:match.start()] + "\n\n"
+                    else:
+                        new_text += working_text[last_end:match.start()] + raw_table + "\n\n"
+                    last_end = match.end()
+                new_text += working_text[last_end:]
+                working_text = new_text
+
+            replace_html_tables()
 
-        return remainder, tables
+        return working_text, tables
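The key change in this file: `findall()` plus `sub('')` discarded each table's position, while a single `finditer()` pass rebuilds the text around every match. A standalone sketch of that splicing pattern (function and names are illustrative, not part of the codebase):

```python
import re

def splice_matches(pattern, text, replace):
    """Rebuild text in one pass, substituting each match in place."""
    out, last_end = "", 0
    for m in pattern.finditer(text):
        out += text[last_end:m.start()] + replace(m.group())  # keep prose before the match
        last_end = m.end()
    return out + text[last_end:]  # keep the tail after the last match

table = re.compile(r"(?:\n|^)(?:\|.*\|\n)+")
doc = "Intro.\n| a | b |\n| 1 | 2 |\nOutro.\n"
print(splice_matches(table, doc, lambda t: "\n<table>...</table>\n"))
# -> "Intro.\n<table>...</table>\nOutro.\n": the table is replaced in place,
#    so the text before and after it stays exactly where it was.
```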
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -22,7 +22,7 @@ from timeit import default_timer as timer
 
 from docx import Document
 from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
 from markdown import markdown
 from PIL import Image
 from tika import parser
 
@@ -76,15 +76,15 @@ class Docx(DocxParser):
         """Get the hierarchical title structure before the table"""
         import re
         from docx.text.paragraph import Paragraph
 
         titles = []
         blocks = []
 
         # Get document name from filename parameter
         doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
         if not doc_name:
             doc_name = "Untitled Document"
 
         # Collect all document blocks while maintaining document order
         try:
             # Iterate through all paragraphs and tables in document order
@@ -97,7 +97,7 @@ class Docx(DocxParser):
         except Exception as e:
             logging.error(f"Error collecting blocks: {e}")
             return ""
 
         # Find the target table position
         target_table_pos = -1
         table_count = 0
@@ -107,20 +107,20 @@ class Docx(DocxParser):
                 target_table_pos = pos
                 break
             table_count += 1
 
         if target_table_pos == -1:
             return "" # Target table not found
 
         # Find the nearest heading paragraph in reverse order
         nearest_title = None
         for i in range(len(blocks)-1, -1, -1):
             block_type, pos, block = blocks[i]
             if pos >= target_table_pos: # Skip blocks after the table
                 continue
 
             if block_type != 'p':
                 continue
 
             if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                 try:
                     level_match = re.search(r"(\d+)", block.style.name)
@@ -133,12 +133,12 @@ class Docx(DocxParser):
                         break
                 except Exception as e:
                     logging.error(f"Error parsing heading level: {e}")
 
         if nearest_title:
             # Add current title
             titles.append(nearest_title)
             current_level = nearest_title[0]
 
             # Find all parent headings, allowing cross-level search
             while current_level > 1:
                 found = False
@@ -146,17 +146,17 @@ class Docx(DocxParser):
                     block_type, pos, block = blocks[i]
                     if pos >= target_table_pos: # Skip blocks after the table
                         continue
 
                     if block_type != 'p':
                         continue
 
                     if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                         try:
                             level_match = re.search(r"(\d+)", block.style.name)
                             if level_match:
                                 level = int(level_match.group(1))
                                 # Find any heading with a higher level
                                 if level < current_level:
                                     title_text = block.text.strip()
                                     if title_text: # Avoid empty titles
                                         titles.append((level, title_text))
@@ -165,16 +165,16 @@ class Docx(DocxParser):
                                         break
                         except Exception as e:
                             logging.error(f"Error parsing parent heading: {e}")
 
                 if not found: # Break if no parent heading is found
                     break
 
             # Sort by level (ascending, from highest to lowest)
             titles.sort(key=lambda x: x[0])
             # Organize titles (from highest to lowest)
             hierarchy = [doc_name] + [t[1] for t in titles]
             return " > ".join(hierarchy)
 
         return ""
 
     def __call__(self, filename, binary=None, from_page=0, to_page=100000):
@@ -298,13 +298,13 @@ class Markdown(MarkdownParser):
             text = sections[0]
         else:
             return []
 
         from bs4 import BeautifulSoup
         html_content = markdown(text)
         soup = BeautifulSoup(html_content, 'html.parser')
         html_images = [img.get('src') for img in soup.find_all('img') if img.get('src')]
         return html_images
 
     def get_pictures(self, text):
         """Download and open all images from markdown text."""
         import requests
@@ -320,17 +320,17 @@ class Markdown(MarkdownParser):
             except Exception as e:
                 logging.error(f"Failed to download/open image from {url}: {e}")
                 continue
 
         return images if images else None
 
-    def __call__(self, filename, binary=None):
+    def __call__(self, filename, binary=None, separate_tables=True):
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
         else:
             with open(filename, "r") as f:
                 txt = f.read()
-        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n')
+        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
         sections = []
         tbls = []
         for sec in remainder.split("\n"):
@@ -465,8 +465,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary)
+        sections, tables = markdown_parser(filename, binary, separate_tables=False)
 
         # Process images for each section
         section_images = []
         for section_text, _ in sections:
@@ -477,7 +477,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 section_images.append(combined_image)
             else:
                 section_images.append(None)
 
         res = tokenize_table(tables, doc, is_english)
         callback(0.8, "Finish parsing.")
 
@@ -524,7 +524,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "delimiter", "\n!?。;!?"))
         if kwargs.get("section_only", False):
             return chunks
 
         res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
     else:
         chunks = naive_merge(
@@ -535,7 +535,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             return chunks
 
         res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
 
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
     return res
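Downstream, the naive chunker now asks the markdown parser to keep tables inline. A usage sketch (import path assumed from this repo's layout; `report.md` is hypothetical):

```python
from rag.app.naive import Markdown  # the class shown in the diff above

# Parse the way chunk() now does for .md/.markdown files.
markdown_parser = Markdown(chunk_token_num=128)
sections, tables = markdown_parser("report.md", binary=None, separate_tables=False)

# `sections` keeps each table at its original position (rendered to HTML), so
# naive_merge() chunks tables together with their surrounding prose, while
# `tables` is still collected separately for tokenize_table().
```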