From 7c20c964b429a765e190472732c1f4dbf034e582 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Tue, 25 Nov 2025 19:54:06 +0800
Subject: [PATCH] Fix: incorrect image merging for naive markdown parser (#11520)

### What problem does this PR solve?

Fix incorrect image merging for the naive markdown parser. #9349

[ragflow_readme.webm](https://github.com/user-attachments/assets/ca3f1e18-72b6-4a4c-80db-d03da9adf8dc)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
---
 deepdoc/parser/markdown_parser.py |  57 ++++++--
 rag/app/naive.py                  | 236 ++++++++++++++++++++++--------
 rag/flow/parser/parser.py         |  14 +-
 3 files changed, 231 insertions(+), 76 deletions(-)

diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index cfcf0ae83..900ef525c 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:
         # Replace any TAGS e.g. <table ...> to <table>
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"] - table_with_attributes_pattern = re.compile( - rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE - ) + table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE) + def replace_tag(m): tag_name = re.match(r"<(\w+)", m.group()).group(1) return "<{}>".format(tag_name) @@ -128,23 +127,48 @@ class MarkdownElementExtractor: self.markdown_content = markdown_content self.lines = markdown_content.split("\n") - def get_delimiters(self,delimiters): + def get_delimiters(self, delimiters): toks = re.findall(r"`([^`]+)`", delimiters) toks = sorted(set(toks), key=lambda x: -len(x)) return "|".join(re.escape(t) for t in toks if t) - - def extract_elements(self,delimiter=None): + + def extract_elements(self, delimiter=None, include_meta=False): """Extract individual elements (headers, code blocks, lists, etc.)""" sections = [] i = 0 - dels="" + dels = "" if delimiter: dels = self.get_delimiters(delimiter) if len(dels) > 0: text = "\n".join(self.lines) - parts = re.split(dels, text) - sections = [p.strip() for p in parts if p and p.strip()] + if include_meta: + pattern = re.compile(dels) + last_end = 0 + for m in pattern.finditer(text): + part = text[last_end : m.start()] + if part and part.strip(): + sections.append( + { + "content": part.strip(), + "start_line": text.count("\n", 0, last_end), + "end_line": text.count("\n", 0, m.start()), + } + ) + last_end = m.end() + + part = text[last_end:] + if part and part.strip(): + sections.append( + { + "content": part.strip(), + "start_line": text.count("\n", 0, last_end), + "end_line": text.count("\n", 0, len(text)), + } + ) + else: + parts = re.split(dels, text) + sections = [p.strip() for p in parts if p and p.strip()] return sections while i < len(self.lines): line = self.lines[i] @@ -152,32 +176,35 @@ class MarkdownElementExtractor: if re.match(r"^#{1,6}\s+.*$", line): # header element = self._extract_header(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif line.strip().startswith("```"): # code block element = self._extract_code_block(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line): # list block element = self._extract_list_block(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif line.strip().startswith(">"): # blockquote element = self._extract_blockquote(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 elif line.strip(): # text block (paragraphs and inline elements until next block element) element = self._extract_text_block(i) - sections.append(element["content"]) + sections.append(element if include_meta else element["content"]) i = element["end_line"] + 1 else: i += 1 - sections = [section for section in sections if section.strip()] + if include_meta: + sections = [section for section in sections if section["content"].strip()] + else: + sections = [section for section in sections if section.strip()] return sections def _extract_header(self, start_pos): diff --git a/rag/app/naive.py b/rag/app/naive.py index 562336d7f..836b3fd9e 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -26,6 +26,7 @@ from docx.opc.pkgreader import 
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
+from common.token_utils import num_tokens_from_string
 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
@@ -464,51 +465,88 @@ class Markdown(MarkdownParser):
         html_content = markdown(text)
         soup = BeautifulSoup(html_content, 'html.parser')
         return soup
-
-    def get_picture_urls(self, soup):
-        if soup:
-            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
-        return []
 
     def get_hyperlink_urls(self, soup):
         if soup:
             return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
         return []
-
-    def get_pictures(self, text):
-        """Download and open all images from markdown text."""
+
+    def extract_image_urls_with_lines(self, text):
+        md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
+        html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)
+        urls = []
+        seen = set()
+        lines = text.splitlines()
+        for idx, line in enumerate(lines):
+            for url in md_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+            for url in html_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+
+        # cross-line
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(text, 'html.parser')
+            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
+            for img_tag in soup.find_all('img'):
+                src = img_tag.get('src')
+                if not src:
+                    continue
+
+                tag_str = str(img_tag)
+                pos = text.find(tag_str)
+                if pos == -1:
+                    # fallback
+                    pos = max(text.find(src), 0)
+                line_no = 0
+                for i, off in enumerate(newline_offsets):
+                    if pos <= off:
+                        line_no = i
+                        break
+                if (src, line_no) not in seen:
+                    urls.append({"url": src, "line": line_no})
+                    seen.add((src, line_no))
+        except Exception:
+            pass
+
+        return urls
+
+    def load_images_from_urls(self, urls, cache=None):
         import requests
-        soup = self.md_to_html(text)
-        image_urls = self.get_picture_urls(soup)
+        from pathlib import Path
+
+        cache = cache or {}
         images = []
-        # Find all image URLs in text
-        for url in image_urls:
-            if not url:
+        for url in urls:
+            if url in cache:
+                if cache[url]:
+                    images.append(cache[url])
                 continue
+            img_obj = None
             try:
-                # check if the url is a local file or a remote URL
                 if url.startswith(('http://', 'https://')):
-                    # For remote URLs, download the image
                     response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
-                        img = Image.open(BytesIO(response.content)).convert('RGB')
-                        images.append(img)
+                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
                 else:
-                    # For local file paths, open the image directly
-                    from pathlib import Path
                     local_path = Path(url)
-                    if not local_path.exists():
+                    if local_path.exists():
+                        img_obj = Image.open(url).convert('RGB')
+                    else:
                         logging.warning(f"Local image file not found: {url}")
-                        continue
-                    img = Image.open(url).convert('RGB')
-                    images.append(img)
             except Exception as e:
                 logging.error(f"Failed to download/open image from {url}: {e}")
-                continue
+            cache[url] = img_obj
+            if img_obj:
+                images.append(img_obj)
+        return images, cache
-        return images if images else None
-
-    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
@@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
         # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
         # extractor = MarkdownElementExtractor(remainder)
         extractor = MarkdownElementExtractor(txt)
-        element_sections = extractor.extract_elements(delimiter)
-        sections = [(element, "") for element in element_sections]
+        image_refs = self.extract_image_urls_with_lines(txt)
+        element_sections = extractor.extract_elements(delimiter, include_meta=True)
+
+        sections = []
+        section_images = []
+        image_cache = {}
+        for element in element_sections:
+            content = element["content"]
+            start_line = element["start_line"]
+            end_line = element["end_line"]
+            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
+            imgs = []
+            if urls_in_section:
+                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
+            combined_image = None
+            if imgs:
+                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
+            sections.append((content, ""))
+            section_images.append(combined_image)
+
         tbls = []
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+        if return_section_images:
+            return sections, tbls, section_images
         return sections, tbls
 
 def load_from_xml_v2(baseURI, rels_item_xml):
@@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    final_sections = False
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
+        sections, tables, section_images = markdown_parser(
+            filename,
+            binary,
+            separate_tables=False,
+            delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
+            return_section_images=True,
+        )
+
+        final_sections = True
 
         try:
             vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
 
         if vision_model:
             # Process images for each section
-            section_images = []
             for idx, (section_text, _) in enumerate(sections):
-                images = markdown_parser.get_pictures(section_text) if section_text else None
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
 
-                if images:
+                if images and len(images) > 0:
                     # If multiple images found, combine them using concat_img
                     combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
-                    section_images.append(combined_image)
+                    if section_images:
+                        section_images[idx] = combined_image
+                    else:
+                        section_images = [None] * len(sections)
+                        section_images[idx] = combined_image
                     markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                     boosted_figures = markdown_vision_parser(callback=callback)
                     sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
-                else:
-                    section_images.append(None)
         else:
             logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
 
     st = timer()
-    if section_images:
-        # if all images are None, set section_images to None
-        if all(image is None for image in section_images):
-            section_images = None
+    if final_sections:
+        merged_chunks = []
+        merged_images = []
+        chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
+        overlapped_percent = int(parser_config.get("overlapped_percent", 0))
+        overlapped_percent = max(0, min(overlapped_percent, 90))
 
-    if section_images:
-        chunks, images = naive_merge_with_images(sections, section_images,
-                                                 int(parser_config.get(
-                                                     "chunk_token_num", 128)), parser_config.get(
-                                                     "delimiter", "\n!?。;!?"))
+        current_text = ""
+        current_tokens = 0
+        current_image = None
+
+        for idx, sec in enumerate(sections):
+            text = sec[0] if isinstance(sec, tuple) else sec
+            sec_tokens = num_tokens_from_string(text)
+            sec_image = section_images[idx] if section_images and idx < len(section_images) else None
+
+            if current_text and current_tokens + sec_tokens > chunk_limit:
+                merged_chunks.append(current_text)
+                merged_images.append(current_image)
+                overlap_part = ""
+                if overlapped_percent > 0:
+                    overlap_len = int(len(current_text) * overlapped_percent / 100)
+                    if overlap_len > 0:
+                        overlap_part = current_text[-overlap_len:]
+                current_text = overlap_part
+                current_tokens = num_tokens_from_string(current_text)
+                current_image = current_image if overlap_part else None
+
+            if current_text:
+                current_text += "\n" + text
+            else:
+                current_text = text
+            current_tokens += sec_tokens
+
+            if sec_image:
+                current_image = concat_img(current_image, sec_image) if current_image else sec_image
+
+        if current_text:
+            merged_chunks.append(current_text)
+            merged_images.append(current_image)
+
+        chunks = merged_chunks
+        has_images = merged_images and any(img is not None for img in merged_images)
         if kwargs.get("section_only", False):
             chunks.extend(embed_res)
             return chunks
-
-        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        if has_images:
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+        else:
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     else:
-        chunks = naive_merge(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"))
-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            return chunks
+        if section_images:
+            if all(image is None for image in section_images):
+                section_images = None
-        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+        if section_images:
+            chunks, images = naive_merge_with_images(sections, section_images,
+                                                     int(parser_config.get(
+                                                         "chunk_token_num", 128)), parser_config.get(
+                                                         "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        else:
+            chunks = naive_merge(
+                sections, int(parser_config.get(
+                    "chunk_token_num", 128)), parser_config.get(
+                    "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
 
     if urls and parser_config.get("analyze_hyperlink", False) and is_root:
         for index, url in enumerate(urls):
@@ -820,9 +940,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
                 logging.info(f"Failed to chunk url in registered file type {url}: {e}")
             sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
             url_res.extend(sub_url_res)
-    
+
     logging.info("naive_merge({}): {}".format(filename, timer() - st))
-    
+
     if embed_res:
         res.extend(embed_res)
     if url_res:
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 2ba5cfa7b..1a111cc3a 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -482,17 +482,25 @@ class Parser(ProcessBase):
             self.set_output("output_format", conf["output_format"])
 
             markdown_parser = naive_markdown_parser()
-            sections, tables = markdown_parser(name, blob, separate_tables=False)
+            sections, tables, section_images = markdown_parser(
+                name,
+                blob,
+                separate_tables=False,
+                delimiter=conf.get("delimiter"),
+                return_section_images=True,
+            )
 
             if conf.get("output_format") == "json":
                 json_results = []
 
-                for section_text, _ in sections:
+                for idx, (section_text, _) in enumerate(sections):
                     json_result = {
                         "text": section_text,
                     }
-                    images = markdown_parser.get_pictures(section_text) if section_text else None
+                    images = []
+                    if section_images and len(section_images) > idx and section_images[idx] is not None:
+                        images.append(section_images[idx])
                     if images:
                         # If multiple images found, combine them using concat_img
                         combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
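The gist of the change can be illustrated with a minimal, self-contained sketch (not the RAGFlow code itself): image URLs are recorded together with the line they appear on, and each image is attached only to the section whose start/end line range contains it, instead of re-scanning the whole document for every section. The helper names below mirror the patch, but the section metadata in the example is hypothetical sample data.

```python
import re

# Patterns roughly matching the ones the patch adds to rag/app/naive.py.
MD_IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")                   # ![alt](url)
HTML_IMG_RE = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)   # <img src="url">


def extract_image_urls_with_lines(text):
    """Return [{'url': ..., 'line': ...}] for every image reference, line by line."""
    refs = []
    for idx, line in enumerate(text.splitlines()):
        for pattern in (MD_IMG_RE, HTML_IMG_RE):
            for url in pattern.findall(line):
                refs.append({"url": url, "line": idx})
    return refs


def attach_images_to_sections(sections, image_refs):
    """sections: [{'content', 'start_line', 'end_line'}], the shape extract_elements(include_meta=True) yields."""
    out = []
    for sec in sections:
        urls = [r["url"] for r in image_refs
                if sec["start_line"] <= r["line"] <= sec["end_line"]]
        out.append((sec["content"], urls))
    return out


if __name__ == "__main__":
    md = "# Title\n\n![logo](img/logo.png)\n\nSome text.\n\n## Next\n\nNo image here.\n"
    # Hypothetical section metadata of the kind the patched extractor produces.
    sections = [
        {"content": "# Title\n\n![logo](img/logo.png)\n\nSome text.", "start_line": 0, "end_line": 4},
        {"content": "## Next\n\nNo image here.", "start_line": 6, "end_line": 8},
    ]
    for content, urls in attach_images_to_sections(sections, extract_image_urls_with_lines(md)):
        print(urls, "->", content.splitlines()[0])
    # ['img/logo.png'] -> # Title
    # [] -> ## Next
```

Keeping a per-reference (url, line) record is what stops neighbouring sections from all receiving the same merged image, which is the incorrect merging this patch fixes; the url-to-image cache in the patch additionally avoids downloading the same image once per section.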