From 7c20c964b429a765e190472732c1f4dbf034e582 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Tue, 25 Nov 2025 19:54:06 +0800
Subject: [PATCH] Fix: incorrect image merging for naive markdown parser (#11520)

### What problem does this PR solve?

Fix incorrect image merging for the naive markdown parser. #9349

[ragflow_readme.webm](https://github.com/user-attachments/assets/ca3f1e18-72b6-4a4c-80db-d03da9adf8dc)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/markdown_parser.py |  57 ++++++--
 rag/app/naive.py                  | 236 ++++++++++++++++++++++--------
 rag/flow/parser/parser.py         |  14 +-
 3 files changed, 231 insertions(+), 76 deletions(-)

diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index cfcf0ae83..900ef525c 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:
         # Replace any TAGS e.g. <table ...> to <table>
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"] - table_with_attributes_pattern = re.compile( - rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE - ) + table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE) + def replace_tag(m): tag_name = re.match(r"<(\w+)", m.group()).group(1) return "<{}>".format(tag_name) @@ -128,23 +127,48 @@ class MarkdownElementExtractor: self.markdown_content = markdown_content self.lines = markdown_content.split("\n") - def get_delimiters(self,delimiters): + def get_delimiters(self, delimiters): toks = re.findall(r"`([^`]+)`", delimiters) toks = sorted(set(toks), key=lambda x: -len(x)) return "|".join(re.escape(t) for t in toks if t) - - def extract_elements(self,delimiter=None): + + def extract_elements(self, delimiter=None, include_meta=False): """Extract individual elements (headers, code blocks, lists, etc.)""" sections = [] i = 0 - dels="" + dels = "" if delimiter: dels = self.get_delimiters(delimiter) if len(dels) > 0: text = "\n".join(self.lines) - parts = re.split(dels, text) - sections = [p.strip() for p in parts if p and p.strip()] + if include_meta: + pattern = re.compile(dels) + last_end = 0 + for m in pattern.finditer(text): + part = text[last_end : m.start()] + if part and part.strip(): + sections.append( + { + "content": part.strip(), + "start_line": text.count("\n", 0, last_end), + "end_line": text.count("\n", 0, m.start()), + } + ) + last_end = m.end() + + part = text[last_end:] + if part and part.strip(): + sections.append( + { + "content": part.strip(), + "start_line": text.count("\n", 0, last_end), + "end_line": text.count("\n", 0, len(text)), + } + ) + else: + parts = re.split(dels, text) + sections = [p.strip() for p in parts if p and p.strip()] return sections while i < len(self.lines): line = self.lines[i] @@ -152,32 +176,35 @@ class MarkdownElementExtractor: if re.match(r"^#{1,6}\s+.*$", line): # header element = self._extract_header(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif line.strip().startswith("```"): # code block element = self._extract_code_block(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line): # list block element = self._extract_list_block(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif line.strip().startswith(">"): # blockquote element = self._extract_blockquote(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif line.strip(): # text block (paragraphs and inline elements until next block element) element = self._extract_text_block(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 else: i += 1 - sections = [section for section in sections if section.strip()] + if include_meta: + sections = [section for section in sections if section["content"].strip()] + else: + sections = [section for section in sections if section.strip()] return sections def _extract_header(self, start_pos): diff --git a/rag/app/naive.py b/rag/app/naive.py index 562336d7f..836b3fd9e 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -26,6 +26,7 @@ from docx.opc.pkgreader import 
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
+from common.token_utils import num_tokens_from_string
 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
@@ -464,51 +465,88 @@ class Markdown(MarkdownParser):
         html_content = markdown(text)
         soup = BeautifulSoup(html_content, 'html.parser')
         return soup
-
-    def get_picture_urls(self, soup):
-        if soup:
-            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
-        return []
 
     def get_hyperlink_urls(self, soup):
         if soup:
             return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
         return []
-
-    def get_pictures(self, text):
-        """Download and open all images from markdown text."""
+
+    def extract_image_urls_with_lines(self, text):
+        md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
+        html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)
+        urls = []
+        seen = set()
+        lines = text.splitlines()
+        for idx, line in enumerate(lines):
+            for url in md_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+            for url in html_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+
+        # cross-line
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(text, 'html.parser')
+            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
+            for img_tag in soup.find_all('img'):
+                src = img_tag.get('src')
+                if not src:
+                    continue
+
+                tag_str = str(img_tag)
+                pos = text.find(tag_str)
+                if pos == -1:
+                    # fallback
+                    pos = max(text.find(src), 0)
+                line_no = 0
+                for i, off in enumerate(newline_offsets):
+                    if pos <= off:
+                        line_no = i
+                        break
+                if (src, line_no) not in seen:
+                    urls.append({"url": src, "line": line_no})
+                    seen.add((src, line_no))
+        except Exception:
+            pass
+
+        return urls
+
+    def load_images_from_urls(self, urls, cache=None):
         import requests
-        soup = self.md_to_html(text)
-        image_urls = self.get_picture_urls(soup)
+        from pathlib import Path
+
+        cache = cache or {}
         images = []
-        # Find all image URLs in text
-        for url in image_urls:
-            if not url:
+        for url in urls:
+            if url in cache:
+                if cache[url]:
+                    images.append(cache[url])
                 continue
+            img_obj = None
             try:
-                # check if the url is a local file or a remote URL
                 if url.startswith(('http://', 'https://')):
-                    # For remote URLs, download the image
                     response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
-                        img = Image.open(BytesIO(response.content)).convert('RGB')
-                        images.append(img)
+                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
                 else:
-                    # For local file paths, open the image directly
-                    from pathlib import Path
                     local_path = Path(url)
-                    if not local_path.exists():
+                    if local_path.exists():
+                        img_obj = Image.open(url).convert('RGB')
+                    else:
                         logging.warning(f"Local image file not found: {url}")
-                        continue
-                    img = Image.open(url).convert('RGB')
-                    images.append(img)
             except Exception as e:
                 logging.error(f"Failed to download/open image from {url}: {e}")
-                continue
+            cache[url] = img_obj
+            if img_obj:
+                images.append(img_obj)
+        return images, cache
-        return images if images else None
-
-    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
@@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
         # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
         # extractor = MarkdownElementExtractor(remainder)
         extractor = MarkdownElementExtractor(txt)
-        element_sections = extractor.extract_elements(delimiter)
-        sections = [(element, "") for element in element_sections]
+        image_refs = self.extract_image_urls_with_lines(txt)
+        element_sections = extractor.extract_elements(delimiter, include_meta=True)
+
+        sections = []
+        section_images = []
+        image_cache = {}
+        for element in element_sections:
+            content = element["content"]
+            start_line = element["start_line"]
+            end_line = element["end_line"]
+            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
+            imgs = []
+            if urls_in_section:
+                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
+            combined_image = None
+            if imgs:
+                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
+            sections.append((content, ""))
+            section_images.append(combined_image)
+
         tbls = []
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+        if return_section_images:
+            return sections, tbls, section_images
         return sections, tbls
 
 def load_from_xml_v2(baseURI, rels_item_xml):
@@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    final_sections = False
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
+        sections, tables, section_images = markdown_parser(
+            filename,
+            binary,
+            separate_tables=False,
+            delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
+            return_section_images=True,
+        )
+
+        final_sections = True
 
         try:
             vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
         if vision_model:
             # Process images for each section
-            section_images = []
             for idx, (section_text, _) in enumerate(sections):
-                images = markdown_parser.get_pictures(section_text) if section_text else None
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
 
-                if images:
+                if images and len(images) > 0:
                     # If multiple images found, combine them using concat_img
                     combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
-                    section_images.append(combined_image)
+                    if section_images:
+                        section_images[idx] = combined_image
+                    else:
+                        section_images = [None] * len(sections)
+                        section_images[idx] = combined_image
                     markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                     boosted_figures = markdown_vision_parser(callback=callback)
                     sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
-                else:
-                    section_images.append(None)
         else:
             logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
     st = timer()
-    if section_images:
-        # if all images are None, set section_images to None
-        if all(image is None for image in section_images):
-            section_images = None
+    if final_sections:
+        merged_chunks = []
+        merged_images = []
+        chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
+        overlapped_percent = int(parser_config.get("overlapped_percent", 0))
+        overlapped_percent = max(0, min(overlapped_percent, 90))
 
-    if section_images:
-        chunks, images = naive_merge_with_images(sections, section_images,
-                                                 int(parser_config.get(
-                                                     "chunk_token_num", 128)), parser_config.get(
-                                                     "delimiter", "\n!?。;!?"))
+        current_text = ""
+        current_tokens = 0
+        current_image = None
+
+        for idx, sec in enumerate(sections):
+            text = sec[0] if isinstance(sec, tuple) else sec
+            sec_tokens = num_tokens_from_string(text)
+            sec_image = section_images[idx] if section_images and idx < len(section_images) else None
+
+            if current_text and current_tokens + sec_tokens > chunk_limit:
+                merged_chunks.append(current_text)
+                merged_images.append(current_image)
+                overlap_part = ""
+                if overlapped_percent > 0:
+                    overlap_len = int(len(current_text) * overlapped_percent / 100)
+                    if overlap_len > 0:
+                        overlap_part = current_text[-overlap_len:]
+                current_text = overlap_part
+                current_tokens = num_tokens_from_string(current_text)
+                current_image = current_image if overlap_part else None
+
+            if current_text:
+                current_text += "\n" + text
+            else:
+                current_text = text
+            current_tokens += sec_tokens
+
+            if sec_image:
+                current_image = concat_img(current_image, sec_image) if current_image else sec_image
+
+        if current_text:
+            merged_chunks.append(current_text)
+            merged_images.append(current_image)
+
+        chunks = merged_chunks
+        has_images = merged_images and any(img is not None for img in merged_images)
         if kwargs.get("section_only", False):
             chunks.extend(embed_res)
             return chunks
-
-        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        if has_images:
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+        else:
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     else:
-        chunks = naive_merge(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"))
-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            return chunks
+        if section_images:
+            if all(image is None for image in section_images):
+                section_images = None
-        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+        if section_images:
+            chunks, images = naive_merge_with_images(sections, section_images,
+                                                     int(parser_config.get(
+                                                         "chunk_token_num", 128)), parser_config.get(
+                                                         "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        else:
+            chunks = naive_merge(
+                sections, int(parser_config.get(
+                    "chunk_token_num", 128)), parser_config.get(
+                    "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
 
     if urls and parser_config.get("analyze_hyperlink", False) and is_root:
         for index, url in enumerate(urls):
@@ -820,9 +940,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 logging.info(f"Failed to chunk url in registered file type {url}: {e}")
             sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             url_res.extend(sub_url_res)
-    
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
-    
+
     if embed_res:
         res.extend(embed_res)
     if url_res:
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 2ba5cfa7b..1a111cc3a 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -482,17 +482,25 @@ class Parser(ProcessBase):
             self.set_output("output_format", conf["output_format"])
 
             markdown_parser = naive_markdown_parser()
-            sections, tables = markdown_parser(name, blob, separate_tables=False)
+            sections, tables, section_images = markdown_parser(
+                name,
+                blob,
+                separate_tables=False,
+                delimiter=conf.get("delimiter"),
+                return_section_images=True,
+            )
 
             if conf.get("output_format") == "json":
                 json_results = []
 
-                for section_text, _ in sections:
+                for idx, (section_text, _) in enumerate(sections):
                     json_result = {
                         "text": section_text,
                     }
-                    images = markdown_parser.get_pictures(section_text) if section_text else None
+                    images = []
+                    if section_images and len(section_images) > idx and section_images[idx] is not None:
+                        images.append(section_images[idx])
                     if images:
                         # If multiple images found, combine them using concat_img
                         combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
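The gist of the change can be illustrated with a minimal, self-contained sketch (not the RAGFlow code itself): image URLs are recorded together with the line they appear on, and each image is attached only to the section whose start/end line range contains it, instead of re-scanning the whole document for every section. The helper names below mirror the patch, but the section metadata in the example is hypothetical sample data.

```python
import re

# Patterns roughly matching the ones the patch adds to rag/app/naive.py.
MD_IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")                   # ![alt](url)
HTML_IMG_RE = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)   # <img src="url">


def extract_image_urls_with_lines(text):
    """Return [{'url': ..., 'line': ...}] for every image reference, line by line."""
    refs = []
    for idx, line in enumerate(text.splitlines()):
        for pattern in (MD_IMG_RE, HTML_IMG_RE):
            for url in pattern.findall(line):
                refs.append({"url": url, "line": idx})
    return refs


def attach_images_to_sections(sections, image_refs):
    """sections: [{'content', 'start_line', 'end_line'}], the shape extract_elements(include_meta=True) yields."""
    out = []
    for sec in sections:
        urls = [r["url"] for r in image_refs
                if sec["start_line"] <= r["line"] <= sec["end_line"]]
        out.append((sec["content"], urls))
    return out


if __name__ == "__main__":
    md = "# Title\n\n![logo](img/logo.png)\n\nSome text.\n\n## Next\n\nNo image here.\n"
    # Hypothetical section metadata of the kind the patched extractor produces.
    sections = [
        {"content": "# Title\n\n![logo](img/logo.png)\n\nSome text.", "start_line": 0, "end_line": 4},
        {"content": "## Next\n\nNo image here.", "start_line": 6, "end_line": 8},
    ]
    for content, urls in attach_images_to_sections(sections, extract_image_urls_with_lines(md)):
        print(urls, "->", content.splitlines()[0])
    # ['img/logo.png'] -> # Title
    # [] -> ## Next
```

Keeping a per-reference (url, line) record is what stops neighbouring sections from all receiving the same merged image, which is the incorrect merging this patch fixes; the url-to-image cache in the patch additionally avoids downloading the same image once per section.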