From b52f09adfe3561fcee4fbed67e13f751a91970c0 Mon Sep 17 00:00:00 2001 From: Edward Chen Date: Thu, 30 Oct 2025 17:31:46 +0800 Subject: [PATCH] Mineru api support (#10874) ### What problem does this PR solve? Support a local MinerU API server from a Docker instance, e.g. when WSL on Windows has no GPU but a MinerU API server with GPU support is available. ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) --- deepdoc/parser/mineru_parser.py | 153 ++++++++++++++++++++++++++++++-- rag/app/naive.py | 3 +- rag/flow/parser/parser.py | 3 +- 3 files changed, 151 insertions(+), 8 deletions(-) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index 69a565c54..1c757ca5b 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -27,6 +27,9 @@ from os import PathLike from pathlib import Path from queue import Empty, Queue from typing import Any, Callable, Optional +import requests +import os +import zipfile import numpy as np import pdfplumber @@ -51,10 +54,52 @@ class MinerUContentType(StrEnum): class MinerUParser(RAGFlowPdfParser): - def __init__(self, mineru_path: str = "mineru"): + def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987"): self.mineru_path = Path(mineru_path) + self.mineru_api = mineru_api.rstrip('/') + self.using_api = False self.logger = logging.getLogger(self.__class__.__name__) + def _extract_zip_no_root(self, zip_path, extract_to, root_dir): + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + if not root_dir: + files = zip_ref.namelist() + if files and files[0].endswith('/'): + root_dir = files[0] + else: + root_dir = None + + if not root_dir or not root_dir.endswith('/'): + self.logger.info(f"[MinerU] No root directory found, extracting all...fff{root_dir}") + zip_ref.extractall(extract_to) + return + + root_len = len(root_dir) + for member in zip_ref.infolist(): + filename = member.filename + if 
filename == root_dir: + self.logger.info("[MinerU] Ignore root folder...") + continue + + path = filename + if path.startswith(root_dir): + path = path[root_len:] + + full_path = os.path.join(extract_to, path) + if member.is_dir(): + os.makedirs(full_path, exist_ok=True) + else: + os.makedirs(os.path.dirname(full_path), exist_ok=True) + with open(full_path, 'wb') as f: + f.write(zip_ref.read(filename)) + + def _is_http_endpoint_valid(self, url, timeout=5): + try: + response = requests.head(url, timeout=timeout, allow_redirects=True) + return response.status_code in [200, 301, 302, 307, 308] + except Exception: + return False + def check_installation(self) -> bool: subprocess_kwargs = { "capture_output": True, @@ -81,9 +126,97 @@ class MinerUParser(RAGFlowPdfParser): logging.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'") except Exception as e: logging.error(f"[MinerU] Unexpected error during installation check: {e}") + + try: + if self.mineru_api: + # check openapi.json + openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json") + logging.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}") + self.using_api = openapi_exists + return openapi_exists + else: + logging.info("[MinerU] api not exists.") + except Exception as e: + logging.error(f"[MinerU] Unexpected error during api check: {e}") return False - def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None): + def _run_mineru(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): + if self.using_api: + self._run_mineru_api(input_path, output_dir, method, backend, lang, callback) + else: + self._run_mineru_executable(input_path, output_dir, method, backend, lang, callback) + + def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str 
= "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): + OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip") + + pdf_file_path = str(input_path) + + if not os.path.exists(pdf_file_path): + raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") + + pdf_file_name = Path(pdf_file_path).stem.strip() + output_path = os.path.join(str(output_dir), pdf_file_name, method) + os.makedirs(output_path, exist_ok=True) + + files = { + "files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf") + } + + data = { + "output_dir": "./output", + "lang_list": lang, + "backend": backend, + "parse_method": method, + "formula_enable": True, + "table_enable": True, + "server_url": None, + "return_md": True, + "return_middle_json": True, + "return_model_output": True, + "return_content_list": True, + "return_images": True, + "response_format_zip": True, + "start_page_id": 0, + "end_page_id": 99999 + } + + headers = { + "Accept": "application/json" + } + try: + self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse") + if callback: + callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") + response = requests.post( + url=f"{self.mineru_api}/file_parse", + files=files, + data=data, + headers=headers, + timeout=1800 + ) + + response.raise_for_status() + if response.headers.get("Content-Type") == "application/zip": + self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...") + + if callback: + callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...") + + with open(OUTPUT_ZIP_PATH, "wb") as f: + f.write(response.content) + + self.logger.info(f"[MinerU] Unzip to {output_path}...") + self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/") + + if callback: + callback(0.40, f"[MinerU] Unzip to {output_path}...") + else: + self.logger.warning("[MinerU] not zip returned from api:%s " % response.headers.get("Content-Type")) 
+ except Exception as e: + raise RuntimeError(f"[MinerU] api failed with exception {e}") + self.logger.info("[MinerU] Api completed successfully.") + + def _run_mineru_executable(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None): cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", method] if backend: cmd.extend(["-b", backend]) @@ -261,7 +394,7 @@ class MinerUParser(RAGFlowPdfParser): case MinerUContentType.TEXT: section = output["text"] case MinerUContentType.TABLE: - section = output["table_body"] + "\n".join(output["table_caption"]) + "\n".join(output["table_footnote"]) + section = output["table_body"] if "table_body" in output else "" + "\n".join(output["table_caption"]) + "\n".join(output["table_footnote"]) case MinerUContentType.IMAGE: section = "".join(output["image_caption"]) + "\n" + "".join(output["image_footnote"]) case MinerUContentType.EQUATION: @@ -297,9 +430,14 @@ class MinerUParser(RAGFlowPdfParser): temp_pdf = None created_tmp_dir = False + # remove spaces, or mineru crash, and _read_output fail too + file_path = Path(filepath) + pdf_file_name = file_path.stem.replace(" ", "") + ".pdf" + pdf_file_path_valid = os.path.join(file_path.parent, pdf_file_name) + if binary: temp_dir = Path(tempfile.mkdtemp(prefix="mineru_bin_pdf_")) - temp_pdf = temp_dir / Path(filepath).name + temp_pdf = temp_dir / pdf_file_name with open(temp_pdf, "wb") as f: f.write(binary) pdf = temp_pdf @@ -307,7 +445,10 @@ class MinerUParser(RAGFlowPdfParser): if callback: callback(0.15, f"[MinerU] Received binary PDF -> {temp_pdf}") else: - pdf = Path(filepath) + if pdf_file_path_valid != filepath: + self.logger.info(f"[MinerU] Remove all space in file name: {pdf_file_path_valid}") + shutil.move(filepath, pdf_file_path_valid) + pdf = Path(pdf_file_path_valid) if not pdf.exists(): if callback: callback(-1, f"[MinerU] PDF not found: {pdf}") @@ -327,7 
+468,7 @@ class MinerUParser(RAGFlowPdfParser): self.__images__(pdf, zoomin=1) try: - self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang) + self._run_mineru(pdf, out_dir, method=method, backend=backend, lang=lang, callback=callback) outputs = self._read_output(out_dir, pdf.stem, method=method, backend=backend) self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: diff --git a/rag/app/naive.py b/rag/app/naive.py index 6c06e3b51..f4c523c99 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -520,7 +520,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif layout_recognizer == "MinerU": mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") - pdf_parser = MinerUParser(mineru_path=mineru_executable) + mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") + pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) if not pdf_parser.check_installation(): callback(-1, "MinerU not found.") return res diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 1212bd38f..b0b5da2cc 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -222,7 +222,8 @@ class Parser(ProcessBase): bboxes = [{"text": t} for t, _ in lines] elif conf.get("parse_method").lower() == "mineru": mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") - pdf_parser = MinerUParser(mineru_path=mineru_executable) + mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987") + pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api) if not pdf_parser.check_installation(): raise RuntimeError("MinerU not found. Please install it via: pip install -U 'mineru[core]'.")