diff --git a/deepdoc/parser/markdown_parser.py b/deepdoc/parser/markdown_parser.py
index cfcf0ae83..900ef525c 100644
--- a/deepdoc/parser/markdown_parser.py
+++ b/deepdoc/parser/markdown_parser.py
@@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:
        # Replace any TAGS e.g. <table ...> to <table>
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
- table_with_attributes_pattern = re.compile(
- rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
- )
+ table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE)
+
def replace_tag(m):
tag_name = re.match(r"<(\w+)", m.group()).group(1)
return "<{}>".format(tag_name)
@@ -128,23 +127,48 @@ class MarkdownElementExtractor:
self.markdown_content = markdown_content
self.lines = markdown_content.split("\n")
- def get_delimiters(self,delimiters):
+ def get_delimiters(self, delimiters):
toks = re.findall(r"`([^`]+)`", delimiters)
toks = sorted(set(toks), key=lambda x: -len(x))
return "|".join(re.escape(t) for t in toks if t)
-
- def extract_elements(self,delimiter=None):
+
+ def extract_elements(self, delimiter=None, include_meta=False):
"""Extract individual elements (headers, code blocks, lists, etc.)"""
sections = []
i = 0
- dels=""
+ dels = ""
if delimiter:
dels = self.get_delimiters(delimiter)
if len(dels) > 0:
text = "\n".join(self.lines)
- parts = re.split(dels, text)
- sections = [p.strip() for p in parts if p and p.strip()]
+ if include_meta:
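+                    # Walk the delimiter matches manually so each section's
+                    # line span can be recorded alongside its text.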
+ pattern = re.compile(dels)
+ last_end = 0
+ for m in pattern.finditer(text):
+ part = text[last_end : m.start()]
+ if part and part.strip():
+ sections.append(
+ {
+ "content": part.strip(),
+ "start_line": text.count("\n", 0, last_end),
+ "end_line": text.count("\n", 0, m.start()),
+ }
+ )
+ last_end = m.end()
+
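+                    # Emit any trailing text after the final delimiter match.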
+ part = text[last_end:]
+ if part and part.strip():
+ sections.append(
+ {
+ "content": part.strip(),
+ "start_line": text.count("\n", 0, last_end),
+ "end_line": text.count("\n", 0, len(text)),
+ }
+ )
+ else:
+ parts = re.split(dels, text)
+ sections = [p.strip() for p in parts if p and p.strip()]
return sections
while i < len(self.lines):
line = self.lines[i]
@@ -152,32 +176,35 @@ class MarkdownElementExtractor:
if re.match(r"^#{1,6}\s+.*$", line):
# header
element = self._extract_header(i)
- sections.append(element["content"])
+ sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif line.strip().startswith("```"):
# code block
element = self._extract_code_block(i)
- sections.append(element["content"])
+ sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
# list block
element = self._extract_list_block(i)
- sections.append(element["content"])
+ sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif line.strip().startswith(">"):
# blockquote
element = self._extract_blockquote(i)
- sections.append(element["content"])
+ sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
elif line.strip():
# text block (paragraphs and inline elements until next block element)
element = self._extract_text_block(i)
- sections.append(element["content"])
+ sections.append(element if include_meta else element["content"])
i = element["end_line"] + 1
else:
i += 1
- sections = [section for section in sections if section.strip()]
+ if include_meta:
+ sections = [section for section in sections if section["content"].strip()]
+ else:
+ sections = [section for section in sections if section.strip()]
return sections
def _extract_header(self, start_pos):
diff --git a/rag/app/naive.py b/rag/app/naive.py
index 562336d7f..836b3fd9e 100644
--- a/rag/app/naive.py
+++ b/rag/app/naive.py
@@ -26,6 +26,7 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
+from common.token_utils import num_tokens_from_string
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
@@ -464,51 +465,88 @@ class Markdown(MarkdownParser):
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
return soup
-
- def get_picture_urls(self, soup):
- if soup:
- return [img.get('src') for img in soup.find_all('img') if img.get('src')]
- return []
def get_hyperlink_urls(self, soup):
if soup:
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
return []
-
- def get_pictures(self, text):
- """Download and open all images from markdown text."""
+
+ def extract_image_urls_with_lines(self, text):
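+        """Collect image URLs from markdown ![alt](url) and HTML src=... syntax,
+        each tagged with the 0-based line number where the reference appears."""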
+ md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
+        html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)
+ urls = []
+ seen = set()
+ lines = text.splitlines()
+ for idx, line in enumerate(lines):
+ for url in md_img_re.findall(line):
+ if (url, idx) not in seen:
+ urls.append({"url": url, "line": idx})
+ seen.add((url, idx))
+ for url in html_img_re.findall(line):
+ if (url, idx) not in seen:
+ urls.append({"url": url, "line": idx})
+ seen.add((url, idx))
+
+        # Second pass: catch <img> tags whose markup spans multiple lines,
+        # which the per-line regexes above cannot match.
+ try:
+ from bs4 import BeautifulSoup
+
+ soup = BeautifulSoup(text, 'html.parser')
+ newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
+ for img_tag in soup.find_all('img'):
+ src = img_tag.get('src')
+ if not src:
+ continue
+
+ tag_str = str(img_tag)
+ pos = text.find(tag_str)
+ if pos == -1:
+                    # Fallback: locate the bare src value when the re-serialized
+                    # tag string is not found verbatim in the source text.
+ pos = max(text.find(src), 0)
+ line_no = 0
+ for i, off in enumerate(newline_offsets):
+ if pos <= off:
+ line_no = i
+ break
+ if (src, line_no) not in seen:
+ urls.append({"url": src, "line": line_no})
+ seen.add((src, line_no))
+ except Exception:
+ pass
+
+ return urls
+
+ def load_images_from_urls(self, urls, cache=None):
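+        """Open each URL as a PIL image, memoising results (failures are
+        stored as None) in ``cache`` so repeated references are fetched once."""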
import requests
- soup = self.md_to_html(text)
- image_urls = self.get_picture_urls(soup)
+ from pathlib import Path
+
+ cache = cache or {}
images = []
- # Find all image URLs in text
- for url in image_urls:
- if not url:
+ for url in urls:
+ if url in cache:
+ if cache[url]:
+ images.append(cache[url])
continue
+ img_obj = None
try:
- # check if the url is a local file or a remote URL
if url.startswith(('http://', 'https://')):
- # For remote URLs, download the image
response = requests.get(url, stream=True, timeout=30)
- if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
- img = Image.open(BytesIO(response.content)).convert('RGB')
- images.append(img)
+ if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+ img_obj = Image.open(BytesIO(response.content)).convert('RGB')
else:
- # For local file paths, open the image directly
- from pathlib import Path
local_path = Path(url)
- if not local_path.exists():
+ if local_path.exists():
+ img_obj = Image.open(url).convert('RGB')
+ else:
logging.warning(f"Local image file not found: {url}")
- continue
- img = Image.open(url).convert('RGB')
- images.append(img)
except Exception as e:
logging.error(f"Failed to download/open image from {url}: {e}")
- continue
+ cache[url] = img_obj
+ if img_obj:
+ images.append(img_obj)
+ return images, cache
- return images if images else None
-
- def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+ def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
if binary:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
@@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
# extractor = MarkdownElementExtractor(remainder)
extractor = MarkdownElementExtractor(txt)
- element_sections = extractor.extract_elements(delimiter)
- sections = [(element, "") for element in element_sections]
+ image_refs = self.extract_image_urls_with_lines(txt)
+ element_sections = extractor.extract_elements(delimiter, include_meta=True)
+
+ sections = []
+ section_images = []
+ image_cache = {}
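+        # Pair each section with the images whose source lines fall inside the
+        # section's start/end line span; image_cache de-duplicates downloads.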
+ for element in element_sections:
+ content = element["content"]
+ start_line = element["start_line"]
+ end_line = element["end_line"]
+ urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
+ imgs = []
+ if urls_in_section:
+ imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
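+            # Stack multiple images from one section into a single composite.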
+ combined_image = None
+ if imgs:
+ combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
+ sections.append((content, ""))
+ section_images.append(combined_image)
+
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+ if return_section_images:
+ return sections, tbls, section_images
return sections, tbls
def load_from_xml_v2(baseURI, rels_item_xml):
@@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    final_sections = False  # set True when the markdown path pre-merges sections itself
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
@@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
- sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
+ sections, tables, section_images = markdown_parser(
+ filename,
+ binary,
+ separate_tables=False,
+ delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
+ return_section_images=True,
+ )
+
+ final_sections = True
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
@@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if vision_model:
# Process images for each section
- section_images = []
for idx, (section_text, _) in enumerate(sections):
- images = markdown_parser.get_pictures(section_text) if section_text else None
+ images = []
+ if section_images and len(section_images) > idx and section_images[idx] is not None:
+ images.append(section_images[idx])
            if images:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
- section_images.append(combined_image)
+ if section_images:
+ section_images[idx] = combined_image
+ else:
+ section_images = [None] * len(sections)
+ section_images[idx] = combined_image
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
- else:
- section_images.append(None)
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
- if section_images:
- # if all images are None, set section_images to None
- if all(image is None for image in section_images):
- section_images = None
+ if final_sections:
+ merged_chunks = []
+ merged_images = []
+ chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
+ overlapped_percent = int(parser_config.get("overlapped_percent", 0))
+ overlapped_percent = max(0, min(overlapped_percent, 90))
- if section_images:
- chunks, images = naive_merge_with_images(sections, section_images,
- int(parser_config.get(
- "chunk_token_num", 128)), parser_config.get(
- "delimiter", "\n!?。;!?"))
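+        # Greedily pack consecutive sections into chunks of up to
+        # chunk_token_num tokens (a single oversized section may still exceed
+        # it), carrying a character-level overlap between successive chunks.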
+ current_text = ""
+ current_tokens = 0
+ current_image = None
+
+ for idx, sec in enumerate(sections):
+ text = sec[0] if isinstance(sec, tuple) else sec
+ sec_tokens = num_tokens_from_string(text)
+ sec_image = section_images[idx] if section_images and idx < len(section_images) else None
+
+ if current_text and current_tokens + sec_tokens > chunk_limit:
+ merged_chunks.append(current_text)
+ merged_images.append(current_image)
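+                # Seed the next chunk with the tail of the flushed one; the
+                # running image is carried over only when an overlap exists.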
+ overlap_part = ""
+ if overlapped_percent > 0:
+ overlap_len = int(len(current_text) * overlapped_percent / 100)
+ if overlap_len > 0:
+ overlap_part = current_text[-overlap_len:]
+ current_text = overlap_part
+ current_tokens = num_tokens_from_string(current_text)
+ current_image = current_image if overlap_part else None
+
+ if current_text:
+ current_text += "\n" + text
+ else:
+ current_text = text
+ current_tokens += sec_tokens
+
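+            # Merge this section's image into the chunk's running composite.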
+ if sec_image:
+ current_image = concat_img(current_image, sec_image) if current_image else sec_image
+
+ if current_text:
+ merged_chunks.append(current_text)
+ merged_images.append(current_image)
+
+ chunks = merged_chunks
+ has_images = merged_images and any(img is not None for img in merged_images)
if kwargs.get("section_only", False):
chunks.extend(embed_res)
return chunks
-
- res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+ if has_images:
+ res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+ else:
+ res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
else:
- chunks = naive_merge(
- sections, int(parser_config.get(
- "chunk_token_num", 128)), parser_config.get(
- "delimiter", "\n!?。;!?"))
- if kwargs.get("section_only", False):
- chunks.extend(embed_res)
- return chunks
+ if section_images:
+ if all(image is None for image in section_images):
+ section_images = None
- res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+ if section_images:
+ chunks, images = naive_merge_with_images(sections, section_images,
+ int(parser_config.get(
+ "chunk_token_num", 128)), parser_config.get(
+ "delimiter", "\n!?。;!?"))
+ if kwargs.get("section_only", False):
+ chunks.extend(embed_res)
+ return chunks
+
+ res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+ else:
+ chunks = naive_merge(
+ sections, int(parser_config.get(
+ "chunk_token_num", 128)), parser_config.get(
+ "delimiter", "\n!?。;!?"))
+ if kwargs.get("section_only", False):
+ chunks.extend(embed_res)
+ return chunks
+
+ res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
if urls and parser_config.get("analyze_hyperlink", False) and is_root:
for index, url in enumerate(urls):
@@ -820,9 +940,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
-
+
logging.info("naive_merge({}): {}".format(filename, timer() - st))
-
+
if embed_res:
res.extend(embed_res)
if url_res:
diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py
index 2ba5cfa7b..1a111cc3a 100644
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@@ -482,17 +482,25 @@ class Parser(ProcessBase):
self.set_output("output_format", conf["output_format"])
markdown_parser = naive_markdown_parser()
- sections, tables = markdown_parser(name, blob, separate_tables=False)
+ sections, tables, section_images = markdown_parser(
+ name,
+ blob,
+ separate_tables=False,
+ delimiter=conf.get("delimiter"),
+ return_section_images=True,
+ )
if conf.get("output_format") == "json":
json_results = []
- for section_text, _ in sections:
+ for idx, (section_text, _) in enumerate(sections):
json_result = {
"text": section_text,
}
- images = markdown_parser.get_pictures(section_text) if section_text else None
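+                # Use the image pre-extracted for this section rather than
+                # re-parsing the section text for pictures.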
+ images = []
+ if section_images and len(section_images) > idx and section_images[idx] is not None:
+ images.append(section_images[idx])
if images:
# If multiple images found, combine them using concat_img
combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]