Mirror of https://github.com/infiniflow/ragflow.git
### What problem does this PR solve?

Add support for the MinerU http-client/server method.

To use MinerU with a vLLM server:

1. Set up a vLLM server running MinerU:

   ```bash
   mineru-vllm-server --port 30000
   ```

2. Configure the following environment variables (these are read by the PDF-parsing code in this file; see the sketch below):

   - `MINERU_EXECUTABLE=/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable)
   - `MINERU_BACKEND="vlm-http-client"`
   - `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"`

3. Follow the standard MinerU setup steps as described above.

With this configuration, RAGFlow connects to your vLLM server to perform document parsing, which can significantly improve parsing performance for complex documents while reducing the resource requirements on your RAGFlow server.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update

---------

Co-authored-by: writinwaters <cai.keith@gmail.com>
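For reference, the MinerU branch of `chunk()` further down in this file picks these settings up from the environment at parse time. Below is a minimal sketch of that lookup (simplified from the code in this file; the defaults shown are the ones it uses):

```python
import os

# Simplified sketch of how the MinerU-related environment variables are read
# when parser_config["layout_recognize"] == "MinerU" (see chunk() below).
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")       # e.g. "vlm-http-client"
mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")         # e.g. "http://your-vllm-server-ip:30000"
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
```

Leaving `MINERU_BACKEND` unset keeps the default local `pipeline` backend; setting it to `vlm-http-client` together with `MINERU_SERVER_URL` switches parsing to the remote vLLM server.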
755 lines
31 KiB
Python
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re
import os
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer

from docx import Document
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from tika import parser

from api.db import LLMType
from api.db.services.llm_service import LLMBundle
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table


class Docx(DocxParser):
    def __init__(self):
        pass

    def get_picture(self, document, paragraph):
        imgs = paragraph._element.xpath('.//pic:pic')
        if not imgs:
            return None
        res_img = None
        for img in imgs:
            embed = img.xpath('.//a:blip/@r:embed')
            if not embed:
                continue
            embed = embed[0]
            try:
                related_part = document.part.related_parts[embed]
                image_blob = related_part.image.blob
            except UnrecognizedImageError:
                logging.info("Unrecognized image format. Skipping image.")
                continue
            except UnexpectedEndOfFileError:
                logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
                continue
            except InvalidImageStreamError:
                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
                continue
            except UnicodeDecodeError:
                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
                continue
            except Exception:
                logging.info("The recognized image stream appears to be corrupted. Skipping image.")
                continue
            try:
                image = Image.open(BytesIO(image_blob)).convert('RGB')
                if res_img is None:
                    res_img = image
                else:
                    res_img = concat_img(res_img, image)
            except Exception:
                continue

        return res_img

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __get_nearest_title(self, table_index, filename):
        """Get the hierarchical title structure before the table"""
        import re
        from docx.text.paragraph import Paragraph

        titles = []
        blocks = []

        # Get document name from filename parameter
        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
        if not doc_name:
            doc_name = "Untitled Document"

        # Collect all document blocks while maintaining document order
        try:
            # Iterate through all paragraphs and tables in document order
            for i, block in enumerate(self.doc._element.body):
                if block.tag.endswith('p'):  # Paragraph
                    p = Paragraph(block, self.doc)
                    blocks.append(('p', i, p))
                elif block.tag.endswith('tbl'):  # Table
                    blocks.append(('t', i, None))  # Table object will be retrieved later
        except Exception as e:
            logging.error(f"Error collecting blocks: {e}")
            return ""

        # Find the target table position
        target_table_pos = -1
        table_count = 0
        for i, (block_type, pos, _) in enumerate(blocks):
            if block_type == 't':
                if table_count == table_index:
                    target_table_pos = pos
                    break
                table_count += 1

        if target_table_pos == -1:
            return ""  # Target table not found

        # Find the nearest heading paragraph in reverse order
        nearest_title = None
        for i in range(len(blocks)-1, -1, -1):
            block_type, pos, block = blocks[i]
            if pos >= target_table_pos:  # Skip blocks after the table
                continue

            if block_type != 'p':
                continue

            if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                try:
                    level_match = re.search(r"(\d+)", block.style.name)
                    if level_match:
                        level = int(level_match.group(1))
                        if level <= 7:  # Support up to 7 heading levels
                            title_text = block.text.strip()
                            if title_text:  # Avoid empty titles
                                nearest_title = (level, title_text)
                                break
                except Exception as e:
                    logging.error(f"Error parsing heading level: {e}")

        if nearest_title:
            # Add current title
            titles.append(nearest_title)
            current_level = nearest_title[0]

            # Find all parent headings, allowing cross-level search
            while current_level > 1:
                found = False
                for i in range(len(blocks)-1, -1, -1):
                    block_type, pos, block = blocks[i]
                    if pos >= target_table_pos:  # Skip blocks after the table
                        continue

                    if block_type != 'p':
                        continue

                    if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                        try:
                            level_match = re.search(r"(\d+)", block.style.name)
                            if level_match:
                                level = int(level_match.group(1))
                                # Find any heading with a higher level
                                if level < current_level:
                                    title_text = block.text.strip()
                                    if title_text:  # Avoid empty titles
                                        titles.append((level, title_text))
                                        current_level = level
                                        found = True
                                        break
                        except Exception as e:
                            logging.error(f"Error parsing parent heading: {e}")

                if not found:  # Break if no parent heading is found
                    break

            # Sort by level (ascending, from highest to lowest)
            titles.sort(key=lambda x: x[0])
            # Organize titles (from highest to lowest)
            hierarchy = [doc_name] + [t[1] for t in titles]
            return " > ".join(hierarchy)

        return ""

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(
            filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        last_image = None
        for p in self.doc.paragraphs:
            if pn > to_page:
                break
            if from_page <= pn < to_page:
                if p.text.strip():
                    if p.style and p.style.name == 'Caption':
                        former_image = None
                        if lines and lines[-1][1] and lines[-1][2] != 'Caption':
                            former_image = lines[-1][1].pop()
                        elif last_image:
                            former_image = last_image
                            last_image = None
                        lines.append((self.__clean(p.text), [former_image], p.style.name))
                    else:
                        current_image = self.get_picture(self.doc, p)
                        image_list = [current_image]
                        if last_image:
                            image_list.insert(0, last_image)
                            last_image = None
                        lines.append((self.__clean(p.text), image_list, p.style.name if p.style else ""))
                else:
                    if current_image := self.get_picture(self.doc, p):
                        if lines:
                            lines[-1][1].append(current_image)
                        else:
                            last_image = current_image
            for run in p.runs:
                if 'lastRenderedPageBreak' in run._element.xml:
                    pn += 1
                    continue
                if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
                    pn += 1
        new_line = [(line[0], reduce(concat_img, line[1]) if line[1] else None) for line in lines]

        tbls = []
        for i, tb in enumerate(self.doc.tables):
            title = self.__get_nearest_title(i, filename)
            html = "<table>"
            if title:
                html += f"<caption>Table Location: {title}</caption>"
            for r in tb.rows:
                html += "<tr>"
                i = 0
                try:
                    while i < len(r.cells):
                        span = 1
                        c = r.cells[i]
                        for j in range(i + 1, len(r.cells)):
                            if c.text == r.cells[j].text:
                                span += 1
                                i = j
                            else:
                                break
                        i += 1
                        html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                except Exception as e:
                    logging.warning(f"Error parsing table, ignore: {e}")
                html += "</tr>"
            html += "</table>"
            tbls.append(((None, html), ""))
        return new_line, tbls

    def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
        """
        This function uses mammoth, licensed under the BSD 2-Clause License.
        """

        import base64
        import uuid

        import mammoth
        from markdownify import markdownify

        docx_file = BytesIO(binary) if binary else open(filename, "rb")

        def _convert_image_to_base64(image):
            try:
                with image.open() as image_file:
                    image_bytes = image_file.read()
                    encoded = base64.b64encode(image_bytes).decode("utf-8")
                    base64_url = f"data:{image.content_type};base64,{encoded}"

                    alt_name = "image"
                    alt_name = f"img_{uuid.uuid4().hex[:8]}"

                    return {"src": base64_url, "alt": alt_name}
            except Exception as e:
                logging.warning(f"Failed to convert image to base64: {e}")
                return {"src": "", "alt": "image"}

        try:
            if inline_images:
                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
            else:
                result = mammoth.convert_to_html(docx_file)

            html = result.value

            markdown_text = markdownify(html)
            return markdown_text

        finally:
            if not binary:
                docx_file.close()


class Pdf(PdfParser):
    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
        start = timer()
        first_start = start
        callback(msg="OCR started")
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback
        )
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge(zoomin=zoomin)
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))

        if separate_tables_figures:
            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
            self._concat_downward()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
        else:
            tbls = self._extract_table_figure(True, zoomin, True, True)
            self._naive_vertical_merge()
            self._concat_downward()
            self._final_reading_order_merge()
            # self._filter_forpages()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls


class Markdown(MarkdownParser):
    def md_to_html(self, sections):
        if not sections:
            return []
        if isinstance(sections, type("")):
            text = sections
        elif isinstance(sections[0], type("")):
            text = sections[0]
        else:
            return []

        from bs4 import BeautifulSoup
        html_content = markdown(text)
        soup = BeautifulSoup(html_content, 'html.parser')
        return soup

    def get_picture_urls(self, soup):
        if soup:
            return [img.get('src') for img in soup.find_all('img') if img.get('src')]
        return []

    def get_hyperlink_urls(self, soup):
        if soup:
            return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
        return []

    def get_pictures(self, text):
        """Download and open all images from markdown text."""
        import requests
        soup = self.md_to_html(text)
        image_urls = self.get_picture_urls(soup)
        images = []
        # Find all image URLs in text
        for url in image_urls:
            try:
                # check if the url is a local file or a remote URL
                if url.startswith(('http://', 'https://')):
                    # For remote URLs, download the image
                    response = requests.get(url, stream=True, timeout=30)
                    if response.status_code == 200 and response.headers['Content-Type'].startswith('image/'):
                        img = Image.open(BytesIO(response.content)).convert('RGB')
                        images.append(img)
                else:
                    # For local file paths, open the image directly
                    from pathlib import Path
                    local_path = Path(url)
                    if not local_path.exists():
                        logging.warning(f"Local image file not found: {url}")
                        continue
                    img = Image.open(url).convert('RGB')
                    images.append(img)
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
                continue

        return images if images else None

    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()

        remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)

        extractor = MarkdownElementExtractor(txt)
        element_sections = extractor.extract_elements(delimiter)
        sections = [(element, "") for element in element_sections]

        tbls = []
        for table in tables:
            tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
        return sections, tbls

def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            if rel_elm.target_ref in ('../NULL', 'NULL'):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels

def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, excel, txt.
    This method applies the naive way to chunk files.
    Successive text will be sliced into pieces using 'delimiter'.
    Next, these successive pieces are merged into chunks whose token number is no more than 'Max token number'.
    """
    urls = set()
    url_res = []

    is_english = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config", {
            "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    pdf_parser = None
    section_images = None

    is_root = kwargs.get("is_root", True)
    embed_res = []
    if is_root:
        # Only extract embedded files at the root call
        embeds = []
        if binary is not None:
            embeds = extract_embed_file(binary)
        else:
            raise Exception("Embedding extraction from file path is not supported.")

        # Recursively chunk each embedded file and collect results
        for embed_filename, embed_bytes in embeds:
            try:
                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
                embed_res.extend(sub_res)
            except Exception as e:
                if callback:
                    callback(0.05, f"Failed to chunk embed {embed_filename}: {e}")
                continue

    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if parser_config.get("analyze_hyperlink", False) and is_root:
            urls = extract_links_from_docx(binary)
            for index, url in enumerate(urls):
                html_bytes, metadata = extract_html(url)
                if not html_bytes:
                    continue
                try:
                    sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
                except Exception as e:
                    logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                    sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
                url_res.extend(sub_url_res)

        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
        _SerializedRelationships.load_from_xml = load_from_xml_v2
        sections, tables = Docx()(filename, binary)

        tables = vision_figure_parser_docx_wrapper(sections=sections, tbls=tables, callback=callback, **kwargs)

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

        st = timer()

        chunks, images = naive_merge_docx(
            sections, int(parser_config.get(
                "chunk_token_num", 128)), parser_config.get(
                "delimiter", "\n!?。;!?"))

        if kwargs.get("section_only", False):
            chunks.extend(embed_res)
            chunks.extend(url_res)
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        res.extend(embed_res)
        res.extend(url_res)
        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
        if parser_config.get("analyze_hyperlink", False) and is_root:
            urls = extract_links_from_pdf(binary)

        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
        callback(0.1, "Start to parse.")

        if layout_recognizer == "DeepDOC":
            pdf_parser = Pdf()
            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
            tables = vision_figure_parser_pdf_wrapper(tbls=tables, callback=callback, **kwargs)

            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")

        elif layout_recognizer == "MinerU":
            mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
            mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
            mineru_server_url = os.environ.get("MINERU_SERVER_URL", "")
            mineru_backend = os.environ.get("MINERU_BACKEND", "pipeline")
            pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api, mineru_server_url=mineru_server_url)
            ok, reason = pdf_parser.check_installation(backend=mineru_backend)
            if not ok:
                callback(-1, f"MinerU not found or server not accessible: {reason}")
                return res

            sections, tables = pdf_parser.parse_pdf(
                filepath=filename,
                binary=binary,
                callback=callback,
                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
                backend=mineru_backend,
                server_url=mineru_server_url,
                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
            )
            parser_config["chunk_token_num"] = 0
            callback(0.8, "Finish parsing.")

        elif layout_recognizer == "Docling":
            pdf_parser = DoclingParser()
            if not pdf_parser.check_installation():
                callback(-1, "Docling not found.")
                return res

            sections, tables = pdf_parser.parse_pdf(
                filepath=filename,
                binary=binary,
                callback=callback,
                output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
                delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
            )
            parser_config["chunk_token_num"] = 0
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")

        elif layout_recognizer == "TCADP Parser":
            tcadp_parser = TCADPParser()
            if not tcadp_parser.check_installation():
                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
                return res

            sections, tables = tcadp_parser.parse_pdf(
                filepath=filename,
                binary=binary,
                callback=callback,
                output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
                file_type="PDF"
            )
            parser_config["chunk_token_num"] = 0
            callback(0.8, "Finish parsing.")
        else:
            if layout_recognizer == "Plain Text":
                pdf_parser = PlainParser()
            else:
                vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT, llm_name=layout_recognizer, lang=lang)
                pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

            sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page,
                                          callback=callback)
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")

    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        excel_parser = ExcelParser()
        if parser_config.get("html4excel"):
            sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
        else:
            sections = [(_, "") for _ in excel_parser(binary) if _]
            parser_config["chunk_token_num"] = 12800

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = TxtParser()(filename, binary,
                               parser_config.get("chunk_token_num", 128),
                               parser_config.get("delimiter", "\n!?;。;!?"))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(md|markdown)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
        sections, tables = markdown_parser(filename, binary, separate_tables=False, delimiter=parser_config.get("delimiter", "\n!?;。;!?"))

        try:
            vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
            callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
        except Exception:
            vision_model = None

        if vision_model:
            # Process images for each section
            section_images = []
            for idx, (section_text, _) in enumerate(sections):
                images = markdown_parser.get_pictures(section_text) if section_text else None

                if images:
                    # If multiple images found, combine them using concat_img
                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
                    section_images.append(combined_image)
                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                    boosted_figures = markdown_vision_parser(callback=callback)
                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
                else:
                    section_images.append(None)

        else:
            logging.warning("No visual model detected. Skipping figure parsing enhancement.")

        if parser_config.get("hyperlink_urls", False) and is_root:
            for idx, (section_text, _) in enumerate(sections):
                soup = markdown_parser.md_to_html(section_text)
                hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
                urls.update(hyperlink_urls)
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = HtmlParser()(filename, binary, chunk_token_num)
        sections = [(_, "") for _ in sections if _]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = JsonParser(chunk_token_num)(binary)
        sections = [(_, "") for _ in sections if _]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")

        binary = BytesIO(binary)
        doc_parsed = parser.from_buffer(binary)
        if doc_parsed.get('content', None) is not None:
            sections = doc_parsed['content'].split('\n')
            sections = [(_, "") for _ in sections if _]
            callback(0.8, "Finish parsing.")
        else:
            callback(0.8, f"tika.parser got empty content from {filename}.")
            logging.warning(f"tika.parser got empty content from {filename}.")
            return []
    else:
        raise NotImplementedError(
            "file type not supported yet (pdf, xlsx, doc, docx, txt supported)")

    st = timer()
    if section_images:
        # if all images are None, set section_images to None
        if all(image is None for image in section_images):
            section_images = None

    if section_images:
        chunks, images = naive_merge_with_images(sections, section_images,
                                                 int(parser_config.get(
                                                     "chunk_token_num", 128)), parser_config.get(
                                                     "delimiter", "\n!?。;!?"))
        if kwargs.get("section_only", False):
            chunks.extend(embed_res)
            return chunks

        res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images))
    else:
        chunks = naive_merge(
            sections, int(parser_config.get(
                "chunk_token_num", 128)), parser_config.get(
                "delimiter", "\n!?。;!?"))
        if kwargs.get("section_only", False):
            chunks.extend(embed_res)
            return chunks

        res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser))

    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
        for index, url in enumerate(urls):
            html_bytes, metadata = extract_html(url)
            if not html_bytes:
                continue
            try:
                sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            except Exception as e:
                logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            url_res.extend(sub_url_res)

    logging.info("naive_merge({}): {}".format(filename, timer() - st))

    if embed_res:
        res.extend(embed_res)
    if url_res:
        res.extend(url_res)
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)