mirror of https://github.com/infiniflow/ragflow.git, synced 2025-12-08 20:42:30 +08:00
Fix: incorrect image merging for naive markdown parser (#11520)
### What problem does this PR solve?

Fix incorrect image merging for naive markdown parser. #9349

[ragflow_readme.webm](https://github.com/user-attachments/assets/ca3f1e18-72b6-4a4c-80db-d03da9adf8dc)

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
```diff
@@ -72,9 +72,8 @@ class RAGFlowMarkdownParser:
         # Replace any TAGS e.g. <table ...> to <table>
         TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
-        table_with_attributes_pattern = re.compile(
-            rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE
-        )
+        table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE)

         def replace_tag(m):
             tag_name = re.match(r"<(\w+)", m.group()).group(1)
             return "<{}>".format(tag_name)
```
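For reference, a minimal standalone sketch of what the collapsed pattern does; the `TAGS` list and `replace_tag` helper are copied from the hunk above, while the sample HTML string is invented for illustration:

```python
import re

# Tags whose attributes should be stripped (same list as in the hunk above).
TAGS = ["table", "td", "tr", "th", "tbody", "thead", "div"]
table_with_attributes_pattern = re.compile(rf"<(?:{'|'.join(TAGS)})[^>]*>", re.IGNORECASE)

def replace_tag(m):
    # Keep only the tag name and drop every attribute.
    tag_name = re.match(r"<(\w+)", m.group()).group(1)
    return "<{}>".format(tag_name)

# Invented sample input, just to show the substitution.
html = '<table border="1" class="x"><tr><td style="color:red">cell</td></tr></table>'
print(table_with_attributes_pattern.sub(replace_tag, html))
# <table><tr><td>cell</td></tr></table>
```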
```diff
@@ -128,23 +127,48 @@ class MarkdownElementExtractor:
         self.markdown_content = markdown_content
         self.lines = markdown_content.split("\n")

-    def get_delimiters(self,delimiters):
+    def get_delimiters(self, delimiters):
         toks = re.findall(r"`([^`]+)`", delimiters)
         toks = sorted(set(toks), key=lambda x: -len(x))
         return "|".join(re.escape(t) for t in toks if t)

-    def extract_elements(self,delimiter=None):
+    def extract_elements(self, delimiter=None, include_meta=False):
         """Extract individual elements (headers, code blocks, lists, etc.)"""
         sections = []

         i = 0
-        dels=""
+        dels = ""
         if delimiter:
             dels = self.get_delimiters(delimiter)
             if len(dels) > 0:
                 text = "\n".join(self.lines)
-                parts = re.split(dels, text)
-                sections = [p.strip() for p in parts if p and p.strip()]
+                if include_meta:
+                    pattern = re.compile(dels)
+                    last_end = 0
+                    for m in pattern.finditer(text):
+                        part = text[last_end : m.start()]
+                        if part and part.strip():
+                            sections.append(
+                                {
+                                    "content": part.strip(),
+                                    "start_line": text.count("\n", 0, last_end),
+                                    "end_line": text.count("\n", 0, m.start()),
+                                }
+                            )
+                        last_end = m.end()
+
+                    part = text[last_end:]
+                    if part and part.strip():
+                        sections.append(
+                            {
+                                "content": part.strip(),
+                                "start_line": text.count("\n", 0, last_end),
+                                "end_line": text.count("\n", 0, len(text)),
+                            }
+                        )
+                else:
+                    parts = re.split(dels, text)
+                    sections = [p.strip() for p in parts if p and p.strip()]
                 return sections
         while i < len(self.lines):
             line = self.lines[i]
@@ -152,32 +176,35 @@ class MarkdownElementExtractor:
             if re.match(r"^#{1,6}\s+.*$", line):
                 # header
                 element = self._extract_header(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif line.strip().startswith("```"):
                 # code block
                 element = self._extract_code_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif re.match(r"^\s*[-*+]\s+.*$", line) or re.match(r"^\s*\d+\.\s+.*$", line):
                 # list block
                 element = self._extract_list_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif line.strip().startswith(">"):
                 # blockquote
                 element = self._extract_blockquote(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             elif line.strip():
                 # text block (paragraphs and inline elements until next block element)
                 element = self._extract_text_block(i)
-                sections.append(element["content"])
+                sections.append(element if include_meta else element["content"])
                 i = element["end_line"] + 1
             else:
                 i += 1

-        sections = [section for section in sections if section.strip()]
+        if include_meta:
+            sections = [section for section in sections if section["content"].strip()]
+        else:
+            sections = [section for section in sections if section.strip()]
         return sections

     def _extract_header(self, start_pos):
```
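A rough standalone sketch of the new `include_meta` splitting path: the text is still split on backtick-quoted delimiters, but each part now carries the line numbers it spans so images can later be matched to sections. The delimiter spec and sample text below are invented for illustration:

```python
import re

def split_with_line_meta(text, delimiter_spec):
    # Delimiters are written as backtick-quoted tokens, e.g. "`---`",
    # following the same convention as get_delimiters in the hunk above.
    toks = re.findall(r"`([^`]+)`", delimiter_spec)
    toks = sorted(set(toks), key=lambda x: -len(x))
    dels = "|".join(re.escape(t) for t in toks if t)

    sections = []
    last_end = 0
    for m in re.finditer(dels, text):
        part = text[last_end:m.start()]
        if part.strip():
            sections.append({
                "content": part.strip(),
                "start_line": text.count("\n", 0, last_end),
                "end_line": text.count("\n", 0, m.start()),
            })
        last_end = m.end()
    # Trailing part after the last delimiter.
    part = text[last_end:]
    if part.strip():
        sections.append({
            "content": part.strip(),
            "start_line": text.count("\n", 0, last_end),
            "end_line": text.count("\n", 0, len(text)),
        })
    return sections

sample = "intro\n---\nbody with ![img](a.png)\n---\nfooter"
print(split_with_line_meta(sample, "`---`"))
# [{'content': 'intro', 'start_line': 0, 'end_line': 1},
#  {'content': 'body with ![img](a.png)', 'start_line': 1, 'end_line': 3},
#  {'content': 'footer', 'start_line': 3, 'end_line': 4}]
```

When `include_meta` is false, the method keeps the old behaviour and returns plain strings.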
**rag/app/naive.py** (230 changed lines)
```diff
@@ -26,6 +26,7 @@ from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
 from docx.opc.oxml import parse_xml
 from markdown import markdown
 from PIL import Image
+from common.token_utils import num_tokens_from_string

 from common.constants import LLMType
 from api.db.services.llm_service import LLMBundle
```
```diff
@@ -465,50 +466,87 @@ class Markdown(MarkdownParser):
         soup = BeautifulSoup(html_content, 'html.parser')
         return soup

-    def get_picture_urls(self, soup):
-        if soup:
-            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
-        return []
-
     def get_hyperlink_urls(self, soup):
         if soup:
             return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
         return []

-    def get_pictures(self, text):
-        """Download and open all images from markdown text."""
+    def extract_image_urls_with_lines(self, text):
+        md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
+        html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)
+        urls = []
+        seen = set()
+        lines = text.splitlines()
+        for idx, line in enumerate(lines):
+            for url in md_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+            for url in html_img_re.findall(line):
+                if (url, idx) not in seen:
+                    urls.append({"url": url, "line": idx})
+                    seen.add((url, idx))
+
+        # cross-line
+        try:
+            from bs4 import BeautifulSoup
+
+            soup = BeautifulSoup(text, 'html.parser')
+            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
+            for img_tag in soup.find_all('img'):
+                src = img_tag.get('src')
+                if not src:
+                    continue
+
+                tag_str = str(img_tag)
+                pos = text.find(tag_str)
+                if pos == -1:
+                    # fallback
+                    pos = max(text.find(src), 0)
+                line_no = 0
+                for i, off in enumerate(newline_offsets):
+                    if pos <= off:
+                        line_no = i
+                        break
+                if (src, line_no) not in seen:
+                    urls.append({"url": src, "line": line_no})
+                    seen.add((src, line_no))
+        except Exception:
+            pass
+
+        return urls
+
+    def load_images_from_urls(self, urls, cache=None):
         import requests
-        soup = self.md_to_html(text)
-        image_urls = self.get_picture_urls(soup)
+        from pathlib import Path
+
+        cache = cache or {}
         images = []
-        # Find all image URLs in text
-        for url in image_urls:
-            if not url:
+        for url in urls:
+            if url in cache:
+                if cache[url]:
+                    images.append(cache[url])
                 continue
+            img_obj = None
             try:
-                # check if the url is a local file or a remote URL
                 if url.startswith(('http://', 'https://')):
-                    # For remote URLs, download the image
                     response = requests.get(url, stream=True, timeout=30)
-                    if response.status_code == 200 and response.headers['Content-Type'] and response.headers['Content-Type'].startswith('image/'):
-                        img = Image.open(BytesIO(response.content)).convert('RGB')
-                        images.append(img)
+                    if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
+                        img_obj = Image.open(BytesIO(response.content)).convert('RGB')
                 else:
-                    # For local file paths, open the image directly
-                    from pathlib import Path
                     local_path = Path(url)
-                    if not local_path.exists():
+                    if local_path.exists():
+                        img_obj = Image.open(url).convert('RGB')
+                    else:
                         logging.warning(f"Local image file not found: {url}")
-                        continue
-                    img = Image.open(url).convert('RGB')
-                    images.append(img)
             except Exception as e:
                 logging.error(f"Failed to download/open image from {url}: {e}")
-                continue
+            cache[url] = img_obj
+            if img_obj:
+                images.append(img_obj)
+        return images, cache

-        return images if images else None
-
-    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
+    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
         if binary:
             encoding = find_codec(binary)
             txt = binary.decode(encoding, errors="ignore")
```
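A rough sketch of what the per-line URL extraction yields; the two regexes mirror the ones in `extract_image_urls_with_lines` above, and the sample markdown (including the example.com URL) is invented:

```python
import re

# Same per-line patterns as in the new extract_image_urls_with_lines above.
md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)

sample = (
    "# Title\n"
    "![logo](images/logo.png)\n"
    "Some text.\n"
    '<img src="https://example.com/pic.jpg" alt="pic">\n'
)

refs = []
seen = set()
for idx, line in enumerate(sample.splitlines()):
    for url in md_img_re.findall(line) + html_img_re.findall(line):
        if (url, idx) not in seen:
            refs.append({"url": url, "line": idx})
            seen.add((url, idx))

print(refs)
# [{'url': 'images/logo.png', 'line': 1},
#  {'url': 'https://example.com/pic.jpg', 'line': 3}]
```

The diff then resolves each URL at most once through `load_images_from_urls`, caching failures as `None` so a broken link is not retried for every section.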
```diff
@@ -520,11 +558,31 @@ class Markdown(MarkdownParser):
         # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
         # extractor = MarkdownElementExtractor(remainder)
         extractor = MarkdownElementExtractor(txt)
-        element_sections = extractor.extract_elements(delimiter)
-        sections = [(element, "") for element in element_sections]
+        image_refs = self.extract_image_urls_with_lines(txt)
+        element_sections = extractor.extract_elements(delimiter, include_meta=True)
+
+        sections = []
+        section_images = []
+        image_cache = {}
+        for element in element_sections:
+            content = element["content"]
+            start_line = element["start_line"]
+            end_line = element["end_line"]
+            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
+            imgs = []
+            if urls_in_section:
+                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
+            combined_image = None
+            if imgs:
+                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
+            sections.append((content, ""))
+            section_images.append(combined_image)

         tbls = []
         for table in tables:
             tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
+        if return_section_images:
+            return sections, tbls, section_images
         return sections, tbls


 def load_from_xml_v2(baseURI, rels_item_xml):
```
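A small sketch of the pairing logic added above: each element section records the line range it spans, and an image reference belongs to a section when its line falls inside that range. The data below is invented for illustration:

```python
# Stand-ins for what extract_elements(include_meta=True) and
# extract_image_urls_with_lines would return for a tiny document.
element_sections = [
    {"content": "# Intro", "start_line": 0, "end_line": 1},
    {"content": "![logo](logo.png)\nSome text.", "start_line": 1, "end_line": 4},
]
image_refs = [{"url": "logo.png", "line": 2}]

for element in element_sections:
    urls_in_section = [
        ref["url"]
        for ref in image_refs
        if element["start_line"] <= ref["line"] <= element["end_line"]
    ]
    print(element["content"].splitlines()[0], "->", urls_in_section)
# # Intro -> []
# ![logo](logo.png) -> ['logo.png']
```

This is what keeps an image attached to the section it actually appears in, instead of being re-discovered (and possibly misattributed) per section text later on.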
```diff
@@ -558,6 +616,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     parser_config = kwargs.get(
         "parser_config", {
             "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
+    final_sections = False
     doc = {
         "docnm_kwd": filename,
         "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
```
```diff
@@ -709,7 +768,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
     elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
         callback(0.1, "Start to parse.")
         markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
-        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))
+        sections, tables, section_images = markdown_parser(
+            filename,
+            binary,
+            separate_tables=False,
+            delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
+            return_section_images=True,
+        )
+
+        final_sections = True

         try:
             vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
```
```diff
@@ -719,19 +786,22 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

         if vision_model:
             # Process images for each section
-            section_images = []
             for idx, (section_text, _) in enumerate(sections):
-                images = markdown_parser.get_pictures(section_text) if section_text else None
-                if images:
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
+
+                if images and len(images) > 0:
                     # If multiple images found, combine them using concat_img
                     combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
-                    section_images.append(combined_image)
+                    if section_images:
+                        section_images[idx] = combined_image
+                    else:
+                        section_images = [None] * len(sections)
+                        section_images[idx] = combined_image
                     markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data= [((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                     boosted_figures = markdown_vision_parser(callback=callback)
                     sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
-                else:
-                    section_images.append(None)

         else:
             logging.warning("No visual model detected. Skipping figure parsing enhancement.")
```
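The indexed update above replaces the old append-per-section pattern, which is what previously let images drift onto the wrong sections. A toy sketch of the guard, with plain strings standing in for `PIL.Image` objects:

```python
sections = [("text a", ""), ("text b", "")]
section_images = []                 # may be empty if the parser returned nothing usable
idx, combined_image = 1, "IMG_B"    # stand-in for a PIL.Image

if section_images:
    # Normal case: overwrite the slot that belongs to this section.
    section_images[idx] = combined_image
else:
    # Lazily size the list so the index assignment cannot raise IndexError.
    section_images = [None] * len(sections)
    section_images[idx] = combined_image

print(section_images)   # [None, 'IMG_B']
```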
```diff
@@ -783,31 +853,81 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
             "file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

     st = timer()
-    if section_images:
-        # if all images are None, set section_images to None
-        if all(image is None for image in section_images):
-            section_images = None
+    if final_sections:
+        merged_chunks = []
+        merged_images = []
+        chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))
+        overlapped_percent = int(parser_config.get("overlapped_percent", 0))
+        overlapped_percent = max(0, min(overlapped_percent, 90))

-    if section_images:
-        chunks, images = naive_merge_with_images(sections, section_images,
-                                                 int(parser_config.get(
-                                                     "chunk_token_num", 128)), parser_config.get(
-                                                     "delimiter", "\n!?。;!?"))
+        current_text = ""
+        current_tokens = 0
+        current_image = None
+
+        for idx, sec in enumerate(sections):
+            text = sec[0] if isinstance(sec, tuple) else sec
+            sec_tokens = num_tokens_from_string(text)
+            sec_image = section_images[idx] if section_images and idx < len(section_images) else None
+
+            if current_text and current_tokens + sec_tokens > chunk_limit:
+                merged_chunks.append(current_text)
+                merged_images.append(current_image)
+                overlap_part = ""
+                if overlapped_percent > 0:
+                    overlap_len = int(len(current_text) * overlapped_percent / 100)
+                    if overlap_len > 0:
+                        overlap_part = current_text[-overlap_len:]
+                current_text = overlap_part
+                current_tokens = num_tokens_from_string(current_text)
+                current_image = current_image if overlap_part else None
+
+            if current_text:
+                current_text += "\n" + text
+            else:
+                current_text = text
+            current_tokens += sec_tokens
+
+            if sec_image:
+                current_image = concat_img(current_image, sec_image) if current_image else sec_image
+
+        if current_text:
+            merged_chunks.append(current_text)
+            merged_images.append(current_image)
+
+        chunks = merged_chunks
+        has_images = merged_images and any(img is not None for img in merged_images)
         if kwargs.get("section_only", False):
             chunks.extend(embed_res)
             return chunks

-        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        if has_images:
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images))
+        else:
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
     else:
-        chunks = naive_merge(
-            sections, int(parser_config.get(
-                "chunk_token_num", 128)), parser_config.get(
-                "delimiter", "\n!?。;!?"))
-        if kwargs.get("section_only", False):
-            chunks.extend(embed_res)
-            return chunks
-
-        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))
+        if section_images:
+            if all(image is None for image in section_images):
+                section_images = None
+
+        if section_images:
+            chunks, images = naive_merge_with_images(sections, section_images,
+                                                     int(parser_config.get(
+                                                         "chunk_token_num", 128)), parser_config.get(
+                                                         "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
+        else:
+            chunks = naive_merge(
+                sections, int(parser_config.get(
+                    "chunk_token_num", 128)), parser_config.get(
+                    "delimiter", "\n!?。;!?"))
+            if kwargs.get("section_only", False):
+                chunks.extend(embed_res)
+                return chunks
+
+            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

     if urls and parser_config.get("analyze_hyperlink", False) and is_root:
         for index, url in enumerate(urls):
```
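The `final_sections` branch above merges markdown sections inline instead of calling `naive_merge_with_images`. A condensed, standalone sketch of that token-budget-with-overlap loop; the token counter is a crude stand-in for `num_tokens_from_string`, image handling is omitted, and the inputs are invented:

```python
def rough_token_count(text):
    # Crude stand-in for common.token_utils.num_tokens_from_string.
    return len(text.split())

def merge_sections(sections, chunk_limit=8, overlapped_percent=25):
    overlapped_percent = max(0, min(overlapped_percent, 90))
    merged, current_text, current_tokens = [], "", 0
    for text in sections:
        sec_tokens = rough_token_count(text)
        # Flush the running chunk when the next section would exceed the budget.
        if current_text and current_tokens + sec_tokens > chunk_limit:
            merged.append(current_text)
            overlap_len = int(len(current_text) * overlapped_percent / 100)
            # Carry a character-based tail into the next chunk as overlap.
            current_text = current_text[-overlap_len:] if overlap_len > 0 else ""
            current_tokens = rough_token_count(current_text)
        current_text = current_text + "\n" + text if current_text else text
        current_tokens += sec_tokens
    if current_text:
        merged.append(current_text)
    return merged

chunks = merge_sections(["one two three", "four five six", "seven eight nine ten"])
for c in chunks:
    print(repr(c))
```

In the actual diff the same loop also carries a `current_image`, combined with `concat_img` and flushed alongside each text chunk, so the per-chunk image list stays aligned with the merged text.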
```diff
@@ -482,17 +482,25 @@ class Parser(ProcessBase):
         self.set_output("output_format", conf["output_format"])

         markdown_parser = naive_markdown_parser()
-        sections, tables = markdown_parser(name, blob, separate_tables=False)
+        sections, tables, section_images = markdown_parser(
+            name,
+            blob,
+            separate_tables=False,
+            delimiter=conf.get("delimiter"),
+            return_section_images=True,
+        )

         if conf.get("output_format") == "json":
             json_results = []

-            for section_text, _ in sections:
+            for idx, (section_text, _) in enumerate(sections):
                 json_result = {
                     "text": section_text,
                 }

-                images = markdown_parser.get_pictures(section_text) if section_text else None
+                images = []
+                if section_images and len(section_images) > idx and section_images[idx] is not None:
+                    images.append(section_images[idx])
                 if images:
                     # If multiple images found, combine them using concat_img
                     combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
```