Files
ragflow/deepdoc/parser/mineru_parser.py
Billy Bao d3d2ccc76c Feat: add more chunking method (#11413)
### What problem does this PR solve?

Feat: add more chunking method #11311

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
2025-11-20 19:07:17 +08:00

599 lines
24 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import json
import logging
import os
import platform
import re
import subprocess
import sys
import tempfile
import threading
import time
import zipfile
from io import BytesIO
from os import PathLike
from pathlib import Path
from queue import Empty, Queue
from typing import Any, Callable, Optional
import numpy as np
import pdfplumber
import requests
from PIL import Image
from strenum import StrEnum
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
class MinerUContentType(StrEnum):
IMAGE = "image"
TABLE = "table"
TEXT = "text"
EQUATION = "equation"
CODE = "code"
LIST = "list"
DISCARDED = "discarded"
class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path)
self.mineru_api = mineru_api.rstrip("/")
self.mineru_server_url = mineru_server_url.rstrip("/")
self.using_api = False
self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__)
def _extract_zip_no_root(self, zip_path, extract_to, root_dir):
with zipfile.ZipFile(zip_path, "r") as zip_ref:
if not root_dir:
files = zip_ref.namelist()
if files and files[0].endswith("/"):
root_dir = files[0]
else:
root_dir = None
if not root_dir or not root_dir.endswith("/"):
self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}")
zip_ref.extractall(extract_to)
return
root_len = len(root_dir)
for member in zip_ref.infolist():
filename = member.filename
if filename == root_dir:
self.logger.info("[MinerU] Ignore root folder...")
continue
path = filename
if path.startswith(root_dir):
path = path[root_len:]
full_path = os.path.join(extract_to, path)
if member.is_dir():
os.makedirs(full_path, exist_ok=True)
else:
os.makedirs(os.path.dirname(full_path), exist_ok=True)
with open(full_path, "wb") as f:
f.write(zip_ref.read(filename))
def _is_http_endpoint_valid(self, url, timeout=5):
try:
response = requests.head(url, timeout=timeout, allow_redirects=True)
return response.status_code in [200, 301, 302, 307, 308]
except Exception:
return False
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
reason = ""
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine"]
if backend not in valid_backends:
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
logging.warning(reason)
return False, reason
subprocess_kwargs = {
"capture_output": True,
"text": True,
"check": True,
"encoding": "utf-8",
"errors": "ignore",
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
if server_url is None:
server_url = self.mineru_server_url
if backend == "vlm-http-client" and server_url:
try:
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
logging.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
if server_accessible:
self.using_api = False # We are using http client, not API
return True, reason
else:
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
logging.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
return False, reason
except Exception as e:
logging.warning(f"[MinerU] vlm-http-client server check failed: {e}")
try:
response = requests.get(server_url, timeout=5)
logging.info(f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
self.using_api = False
return True, reason
except Exception as e:
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
logging.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
return False, reason
try:
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
version_info = result.stdout.strip()
if version_info:
logging.info(f"[MinerU] Detected version: {version_info}")
else:
logging.info("[MinerU] Detected MinerU, but version info is empty.")
return True, reason
except subprocess.CalledProcessError as e:
logging.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
except FileNotFoundError:
logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
except Exception as e:
logging.error(f"[MinerU] Unexpected error during installation check: {e}")
# If executable check fails, try API check
try:
if self.mineru_api:
# check openapi.json
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
if not openapi_exists:
reason = "[MinerU] Failed to detect vaild MinerU API server"
return openapi_exists, reason
logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
self.using_api = openapi_exists
return openapi_exists, reason
else:
logging.info("[MinerU] api not exists.")
except Exception as e:
reason = f"[MinerU] Unexpected error during api check: {e}"
logging.error(f"[MinerU] Unexpected error during api check: {e}")
return False, reason
def _run_mineru(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
if self.using_api:
self._run_mineru_api(input_path, output_dir, method, backend, lang, callback)
else:
self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
pdf_file_path = str(input_path)
if not os.path.exists(pdf_file_path):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, method)
os.makedirs(output_path, exist_ok=True)
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
data = {
"output_dir": "./output",
"lang_list": lang,
"backend": backend,
"parse_method": method,
"formula_enable": True,
"table_enable": True,
"server_url": None,
"return_md": True,
"return_middle_json": True,
"return_model_output": True,
"return_content_list": True,
"return_images": True,
"response_format_zip": True,
"start_page_id": 0,
"end_page_id": 99999,
}
headers = {"Accept": "application/json"}
try:
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, timeout=1800)
response.raise_for_status()
if response.headers.get("Content-Type") == "application/zip":
self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
if callback:
callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
with open(OUTPUT_ZIP_PATH, "wb") as f:
f.write(response.content)
self.logger.info(f"[MinerU] Unzip to {output_path}...")
self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
if callback:
callback(0.40, f"[MinerU] Unzip to {output_path}...")
else:
self.logger.warning("[MinerU] not zip returned from api%s " % response.headers.get("Content-Type"))
except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.")
def _run_mineru_executable(
self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, server_url: Optional[str] = None, callback: Optional[Callable] = None
):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method]
if backend:
cmd.extend(["-b", backend])
if lang:
cmd.extend(["-l", lang])
if server_url and backend == "vlm-http-client":
cmd.extend(["-u", server_url])
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
subprocess_kwargs = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"text": True,
"encoding": "utf-8",
"errors": "ignore",
"bufsize": 1,
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
process = subprocess.Popen(cmd, **subprocess_kwargs)
stdout_queue, stderr_queue = Queue(), Queue()
def enqueue_output(pipe, queue, prefix):
for line in iter(pipe.readline, ""):
if line.strip():
queue.put((prefix, line.strip()))
pipe.close()
threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True).start()
threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True).start()
while process.poll() is None:
for q in (stdout_queue, stderr_queue):
try:
while True:
prefix, line = q.get_nowait()
if prefix == "STDOUT":
self.logger.info(f"[MinerU] {line}")
else:
self.logger.warning(f"[MinerU] {line}")
except Empty:
pass
time.sleep(0.1)
return_code = process.wait()
if return_code != 0:
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
self.logger.info("[MinerU] Command completed successfully.")
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from
self.page_to = page_to
try:
with pdfplumber.open(fnm) if isinstance(fnm, (str, PathLike)) else pdfplumber.open(BytesIO(fnm)) as pdf:
self.pdf = pdf
self.page_images = [p.to_image(resolution=72 * zoomin, antialias=True).original for _, p in enumerate(self.pdf.pages[page_from:page_to])]
except Exception as e:
self.page_images = None
self.total_page = 0
logging.exception(e)
def _line_tag(self, bx):
pn = [bx["page_idx"] + 1]
positions = bx.get("bbox", (0, 0, 0, 0))
x0, top, x1, bott = positions
if hasattr(self, "page_images") and self.page_images and len(self.page_images) > bx["page_idx"]:
page_width, page_height = self.page_images[bx["page_idx"]].size
x0 = (x0 / 1000.0) * page_width
x1 = (x1 / 1000.0) * page_width
top = (top / 1000.0) * page_height
bott = (bott / 1000.0) * page_height
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format("-".join([str(p) for p in pn]), x0, x1, top, bott)
def crop(self, text, ZM=1, need_position=False):
imgs = []
poss = self.extract_positions(text)
if not poss:
if need_position:
return None, None
return
if not getattr(self, "page_images", None):
self.logger.warning("[MinerU] crop called without page images; skipping image generation.")
if need_position:
return None, None
return
page_count = len(self.page_images)
filtered_poss = []
for pns, left, right, top, bottom in poss:
if not pns:
self.logger.warning("[MinerU] Empty page index list in crop; skipping this position.")
continue
valid_pns = [p for p in pns if 0 <= p < page_count]
if not valid_pns:
self.logger.warning(f"[MinerU] All page indices {pns} out of range for {page_count} pages; skipping.")
continue
filtered_poss.append((valid_pns, left, right, top, bottom))
poss = filtered_poss
if not poss:
self.logger.warning("[MinerU] No valid positions after filtering; skip cropping.")
if need_position:
return None, None
return
max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
GAP = 6
pos = poss[0]
first_page_idx = pos[0][0]
poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
pos = poss[-1]
last_page_idx = pos[0][-1]
if not (0 <= last_page_idx < page_count):
self.logger.warning(f"[MinerU] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
if need_position:
return None, None
return
last_page_height = self.page_images[last_page_idx].size[1]
poss.append(
(
[last_page_idx],
pos[1],
pos[2],
min(last_page_height, pos[4] + GAP),
min(last_page_height, pos[4] + 120),
)
)
positions = []
for ii, (pns, left, right, top, bottom) in enumerate(poss):
right = left + max_width
if bottom <= top:
bottom = top + 2
for pn in pns[1:]:
if 0 <= pn - 1 < page_count:
bottom += self.page_images[pn - 1].size[1]
else:
self.logger.warning(f"[MinerU] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
if not (0 <= pns[0] < page_count):
self.logger.warning(f"[MinerU] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
continue
img0 = self.page_images[pns[0]]
x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
crop0 = img0.crop((x0, y0, x1, y1))
imgs.append(crop0)
if 0 < ii < len(poss) - 1:
positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
bottom -= img0.size[1]
for pn in pns[1:]:
if not (0 <= pn < page_count):
self.logger.warning(f"[MinerU] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
continue
page = self.page_images[pn]
x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
cimgp = page.crop((x0, y0, x1, y1))
imgs.append(cimgp)
if 0 < ii < len(poss) - 1:
positions.append((pn + self.page_from, x0, x1, y0, y1))
bottom -= page.size[1]
if not imgs:
if need_position:
return None, None
return
height = 0
for img in imgs:
height += img.size[1] + GAP
height = int(height)
width = int(np.max([i.size[0] for i in imgs]))
pic = Image.new("RGB", (width, height), (245, 245, 245))
height = 0
for ii, img in enumerate(imgs):
if ii == 0 or ii + 1 == len(imgs):
img = img.convert("RGBA")
overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
overlay.putalpha(128)
img = Image.alpha_composite(img, overlay).convert("RGB")
pic.paste(img, (0, int(height)))
height += img.size[1] + GAP
if need_position:
return pic, positions
return pic
@staticmethod
def extract_positions(txt: str):
poss = []
for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", txt):
pn, left, right, top, bottom = tag.strip("#").strip("@").split("\t")
left, right, top, bottom = float(left), float(right), float(top), float(bottom)
poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
return poss
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
subdir = output_dir / file_stem / method
if backend.startswith("vlm-"):
subdir = output_dir / file_stem / "vlm"
json_file = subdir / f"{file_stem}_content_list.json"
if not json_file.exists():
raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
with open(json_file, "r", encoding="utf-8") as f:
data = json.load(f)
for item in data:
for key in ("img_path", "table_img_path", "equation_img_path"):
if key in item and item[key]:
item[key] = str((subdir / item[key]).resolve())
return data
def _transfer_to_sections(self, outputs: list[dict[str, Any]], parse_method: str = None):
sections = []
for output in outputs:
match output["type"]:
case MinerUContentType.TEXT:
section = output["text"]
case MinerUContentType.TABLE:
section = output.get("table_body", "") + "\n".join(output.get("table_caption", [])) + "\n".join(output.get("table_footnote", []))
if not section.strip():
section = "FAILED TO PARSE TABLE"
case MinerUContentType.IMAGE:
section = "".join(output.get("image_caption", [])) + "\n" + "".join(output.get("image_footnote", []))
case MinerUContentType.EQUATION:
section = output["text"]
case MinerUContentType.CODE:
section = output["code_body"] + "\n".join(output.get("code_caption", []))
case MinerUContentType.LIST:
section = "\n".join(output.get("list_items", []))
case MinerUContentType.DISCARDED:
pass
if section and parse_method == "manual":
sections.append((section, output["type"], self._line_tag(output)))
elif section and parse_method == "paper":
sections.append((section + self._line_tag(output), output["type"]))
else:
sections.append((section, self._line_tag(output)))
return sections
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
return []
def parse_pdf(
self,
filepath: str | PathLike[str],
binary: BytesIO | bytes,
callback: Optional[Callable] = None,
*,
output_dir: Optional[str] = None,
backend: str = "pipeline",
lang: Optional[str] = None,
method: str = "auto",
server_url: Optional[str] = None,
delete_output: bool = True,
parse_method: str = "raw"
) -> tuple:
import shutil
temp_pdf = None
created_tmp_dir = False
# remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath)
pdf_file_name = file_path.stem.replace(" ", "") + ".pdf"
pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name)
if binary:
temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_"))
temp_pdf = temp_dir / pdf_file_name
with open(temp_pdf, "wb") as f:
f.write(binary)
pdf = temp_pdf
self.logger.info(f"[MinerU] Received binary PDF -> {temp_pdf}")
if callback:
callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}")
else:
if pdf_file_path_valid != filepath:
self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}")
shutil.move(filepath, pdf_file_path_valid)
pdf = Path(pdf_file_path_valid)
if not pdf.exists():
if callback:
callback(-1, f"[MinerU] PDF not found: {pdf}")
raise FileNotFoundError(f"[MinerU] PDF not found: {pdf}")
if output_dir:
out_dir = Path(output_dir)
out_dir.mkdir(parents=True, exist_ok=True)
else:
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
created_tmp_dir = True
self.logger.info(f"[MinerU] Output directory: {out_dir}")
if callback:
callback(0.15, f"[MinerU] Output directory: {out_dir}")
self.__images__(pdf, zoomin=1)
try:
self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, server_url=server_url, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
finally:
if temp_pdf and temp_pdf.exists():
try:
temp_pdf.unlink()
temp_pdf.parent.rmdir()
except Exception:
pass
if delete_output and created_tmp_dir and out_dir.exists():
try:
shutil.rmtree(out_dir)
except Exception:
pass
if __name__ == "__main__":
parser = MinerUParser("mineru")
ok, reason = parser.check_installation()
print("MinerU available:", ok)
filepath = ""
with open(filepath, "rb") as file:
outputs = parser.parse_pdf(filepath=filepath, binary=file.read())
for output in outputs:
print(output)