Refa: only support MinerU-API now (#11977)

### What problem does this PR solve?

RAGFlow now supports MinerU only through the MinerU API; local installation
and execution of MinerU are removed. The pipeline frontend still needs to be
completed to allow the configuration of MinerU options.

### Type of change

- [x] Refactoring
This commit is contained in:
Yongteng Lei
2025-12-17 12:58:48 +08:00
committed by GitHub
parent 5e05f43c3d
commit 03f9be7cbb
19 changed files with 273 additions and 624 deletions

View File

@ -23,8 +23,6 @@ import subprocess
import sys import sys
import os import os
import logging import logging
from pathlib import Path
from typing import Dict
def get_uuid(): def get_uuid():
return uuid.uuid1().hex return uuid.uuid1().hex
@ -108,152 +106,3 @@ def pip_install_torch():
logging.info("Installing pytorch") logging.info("Installing pytorch")
pkg_names = ["torch>=2.5.0,<3.0.0"] pkg_names = ["torch>=2.5.0,<3.0.0"]
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names]) subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names])
def parse_mineru_paths() -> Dict[str, Path]:
    """
    Resolve MinerU-related paths from the MINERU_EXECUTABLE environment variable.

    Expected layout (default convention):
        MINERU_EXECUTABLE = /home/user/uv_tools/.venv/bin/mineru

    From the executable path the function derives:
        - mineru_exec : full path to the mineru executable
        - venv_dir    : two levels above the executable (the .venv directory)
        - tools_dir   : the parent of venv_dir (e.g. uv_tools)

    If MINERU_EXECUTABLE is unset, empty, or whitespace-only, the default
    layout $HOME/uv_tools/.venv/bin/mineru is used instead.

    Returns:
        A dict with keys "mineru_exec", "venv_dir" and "tools_dir", each a Path.
    """
    # Strip the value so that a blank setting (e.g. `MINERU_EXECUTABLE= ` in an
    # .env file) counts as "not set" instead of being resolved against the CWD.
    mineru_exec_env = (os.getenv("MINERU_EXECUTABLE") or "").strip()

    if mineru_exec_env:
        # Use the path from the environment variable.
        mineru_exec = Path(mineru_exec_env).expanduser().resolve()
        venv_dir = mineru_exec.parent.parent
        tools_dir = venv_dir.parent
    else:
        # Fall back to the default convention: $HOME/uv_tools/.venv/bin/mineru
        home = Path(os.path.expanduser("~"))
        tools_dir = home / "uv_tools"
        venv_dir = tools_dir / ".venv"
        mineru_exec = venv_dir / "bin" / "mineru"

    return {
        "mineru_exec": mineru_exec,
        "venv_dir": venv_dir,
        "tools_dir": tools_dir,
    }
@once
def check_and_install_mineru() -> None:
    """
    Ensure MinerU is installed when it has been enabled via USE_MINERU.

    Behavior:
        1. MinerU is enabled only when USE_MINERU is true/yes/1/y
           (case-insensitive, surrounding whitespace ignored).
        2. Resolve mineru_exec / venv_dir / tools_dir via parse_mineru_paths().
        3. If the executable exists, is executable and `--help` succeeds,
           log success and return.
        4. Otherwise:
            - Create tools_dir
            - Create the venv with `uv venv` if it is missing
            - Install mineru[core], falling back to mineru[all]
            - Validate with `--help`
        5. Export MINERU_EXECUTABLE into os.environ and log success.

    Raises:
        RuntimeError: if MinerU still does not run after installation.
        subprocess.CalledProcessError: if `uv venv` or the fallback
            `uv pip install mineru[all]` exits with a non-zero status.

    NOTE:
        This function intentionally does NOT return the path.
        Logging is used to indicate status.
    """
    # Check if MinerU is enabled; accept every spelling the docstring promises.
    use_mineru = os.getenv("USE_MINERU", "false").strip().lower()
    if use_mineru not in {"true", "yes", "1", "y"}:
        logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru)
        return

    # Resolve expected paths
    paths = parse_mineru_paths()
    mineru_exec: Path = paths["mineru_exec"]
    venv_dir: Path = paths["venv_dir"]
    tools_dir: Path = paths["tools_dir"]

    # Construct environment variables for installation/execution so that `uv`
    # and `mineru` operate on the target virtualenv.
    env = os.environ.copy()
    env["VIRTUAL_ENV"] = str(venv_dir)
    env["PATH"] = str(venv_dir / "bin") + os.pathsep + env.get("PATH", "")

    # Configure HuggingFace endpoint (only if the caller has not set one).
    env.setdefault("HUGGINGFACE_HUB_ENDPOINT", os.getenv("HF_ENDPOINT") or "https://hf-mirror.com")

    def mineru_works() -> bool:
        """Return True when `mineru --help` runs and exits with status 0."""
        try:
            # Both streams go to DEVNULL: check_call never reads a PIPE, so a
            # chatty child could fill the pipe buffer and block forever.
            subprocess.check_call(
                [str(mineru_exec), "--help"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                env=env,
            )
        except (OSError, ValueError, subprocess.SubprocessError):
            return False
        return True

    # If MinerU is already installed and functional
    if mineru_exec.is_file() and os.access(mineru_exec, os.X_OK) and mineru_works():
        logging.info("MinerU already installed.")
        os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
        return

    logging.info("MinerU not found. Installing into virtualenv: %s", venv_dir)

    # Ensure parent directory exists
    tools_dir.mkdir(parents=True, exist_ok=True)

    # Create venv if missing
    if not venv_dir.exists():
        subprocess.check_call(
            ["uv", "venv", str(venv_dir)],
            cwd=str(tools_dir),
            env=env,
        )
    else:
        logging.info("Virtual environment exists at %s. Reusing it.", venv_dir)

    def pip_install(pkg: str) -> None:
        """Install or upgrade *pkg* into the target venv with `uv pip`."""
        subprocess.check_call(
            [
                "uv", "pip", "install", "-U", pkg,
                "-i", "https://mirrors.aliyun.com/pypi/simple",
                "--extra-index-url", "https://pypi.org/simple",
            ],
            cwd=str(tools_dir),
            env=env,
        )

    # Install core version first; fallback to all
    try:
        logging.info("Installing mineru[core] ...")
        pip_install("mineru[core]")
    except subprocess.CalledProcessError:
        logging.warning("mineru[core] installation failed. Installing mineru[all] ...")
        pip_install("mineru[all]")

    # Validate installation
    if not mineru_works():
        logging.error("MinerU installation failed: %s does not work.", mineru_exec)
        raise RuntimeError(f"MinerU installation failed: {mineru_exec} is not functional")

    os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
    logging.info("MinerU installation completed successfully. Executable: %s", mineru_exec)

View File

@ -16,19 +16,15 @@
import json import json
import logging import logging
import os import os
import platform
import re import re
import subprocess
import sys import sys
import tempfile import tempfile
import threading import threading
import time
import zipfile import zipfile
from dataclasses import dataclass from dataclasses import dataclass
from io import BytesIO from io import BytesIO
from os import PathLike from os import PathLike
from pathlib import Path from pathlib import Path
from queue import Empty, Queue
from typing import Any, Callable, Optional from typing import Any, Callable, Optional
import numpy as np import numpy as np
@ -137,10 +133,8 @@ class MinerUParseOptions:
class MinerUParser(RAGFlowPdfParser): class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""): def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path)
self.mineru_api = mineru_api.rstrip("/") self.mineru_api = mineru_api.rstrip("/")
self.mineru_server_url = mineru_server_url.rstrip("/") self.mineru_server_url = mineru_server_url.rstrip("/")
self.using_api = False
self.outlines = [] self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__) self.logger = logging.getLogger(self.__class__.__name__)
@ -189,105 +183,59 @@ class MinerUParser(RAGFlowPdfParser):
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]: def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
reason = "" reason = ""
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine"] valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine"]
if backend not in valid_backends: if backend not in valid_backends:
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}" reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
self.logger.warning(reason) self.logger.warning(reason)
return False, reason return False, reason
subprocess_kwargs = { if not self.mineru_api:
"capture_output": True, reason = "[MinerU] MINERU_APISERVER not configured."
"text": True, self.logger.warning(reason)
"check": True, return False, reason
"encoding": "utf-8",
"errors": "ignore",
}
if platform.system() == "Windows": api_openapi = f"{self.mineru_api}/openapi.json"
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
if server_url is None:
server_url = self.mineru_server_url
if backend == "vlm-http-client" and server_url:
try: try:
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json") api_ok = self._is_http_endpoint_valid(api_openapi)
self.logger.info(f"[MinerU] vlm-http-client server check: {server_accessible}") self.logger.info(f"[MinerU] API openapi.json reachable={api_ok} url={api_openapi}")
if server_accessible: if not api_ok:
self.using_api = False # We are using http client, not API reason = f"[MinerU] MinerU API not accessible: {api_openapi}"
return False, reason
except Exception as exc:
reason = f"[MinerU] MinerU API check failed: {exc}"
self.logger.warning(reason)
return False, reason
if backend == "vlm-http-client":
resolved_server = server_url or self.mineru_server_url
if not resolved_server:
reason = "[MinerU] MINERU_SERVER_URL required for vlm-http-client backend."
self.logger.warning(reason)
return False, reason
try:
server_ok = self._is_http_endpoint_valid(resolved_server)
self.logger.info(f"[MinerU] vlm-http-client server check reachable={server_ok} url={resolved_server}")
except Exception as exc:
self.logger.warning(f"[MinerU] vlm-http-client server probe failed: {resolved_server}: {exc}")
return True, reason return True, reason
else:
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
self.logger.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
return False, reason
except Exception as e:
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
try:
response = requests.get(server_url, timeout=5)
self.logger.info(
f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
self.using_api = False
return True, reason
except Exception as e:
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
self.logger.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
return False, reason
try:
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
version_info = result.stdout.strip()
if version_info:
self.logger.info(f"[MinerU] Detected version: {version_info}")
else:
self.logger.info("[MinerU] Detected MinerU, but version info is empty.")
return True, reason
except subprocess.CalledProcessError as e:
self.logger.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
except FileNotFoundError:
self.logger.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
except Exception as e:
self.logger.error(f"[MinerU] Unexpected error during installation check: {e}")
# If executable check fails, try API check
try:
if self.mineru_api:
# check openapi.json
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
if not openapi_exists:
reason = "[MinerU] Failed to detect vaild MinerU API server"
return openapi_exists, reason
self.logger.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
self.using_api = openapi_exists
return openapi_exists, reason
else:
reason = "[MinerU] api not exists. Setting MINERU_SERVER_URL if your backend is vlm-http-client."
self.logger.info(reason)
return False, reason
except Exception as e:
reason = f"[MinerU] Unexpected error during api check: {e}"
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
return False, reason
def _run_mineru( def _run_mineru(
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
): ) -> Path:
if self.using_api: return self._run_mineru_api(input_path, output_dir, options, callback)
self._run_mineru_api(input_path, output_dir, options, callback)
else:
self._run_mineru_executable(input_path, output_dir, options, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, options: MinerUParseOptions,
callback: Optional[Callable] = None):
output_zip_path = os.path.join(str(output_dir), "output.zip")
def _run_mineru_api(
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
pdf_file_path = str(input_path) pdf_file_path = str(input_path)
if not os.path.exists(pdf_file_path): if not os.path.exists(pdf_file_path):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}") raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip() pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, options.method) output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
os.makedirs(output_path, exist_ok=True) output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")} files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
@ -309,9 +257,19 @@ class MinerUParser(RAGFlowPdfParser):
"end_page_id": 99999, "end_page_id": 99999,
} }
if options.server_url:
data["server_url"] = options.server_url
elif self.mineru_server_url:
data["server_url"] = self.mineru_server_url
print("--------------------------------", flush=True)
print(f"{data=}", flush=True)
print(f"{options=}", flush=True)
print("--------------------------------", flush=True)
headers = {"Accept": "application/json"} headers = {"Accept": "application/json"}
try: try:
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse") self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
if callback: if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse") callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers, response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
@ -333,65 +291,11 @@ class MinerUParser(RAGFlowPdfParser):
if callback: if callback:
callback(0.40, f"[MinerU] Unzip to {output_path}...") callback(0.40, f"[MinerU] Unzip to {output_path}...")
else: else:
self.logger.warning("[MinerU] not zip returned from api%s " % response.headers.get("Content-Type")) self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
except Exception as e: except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}") raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.") self.logger.info("[MinerU] Api completed successfully.")
return Path(output_path)
def _run_mineru_executable(
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", options.method]
if options.backend:
cmd.extend(["-b", options.backend])
if options.lang:
cmd.extend(["-l", options.lang])
if options.server_url and options.backend == "vlm-http-client":
cmd.extend(["-u", options.server_url])
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
subprocess_kwargs = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"text": True,
"encoding": "utf-8",
"errors": "ignore",
"bufsize": 1,
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
process = subprocess.Popen(cmd, **subprocess_kwargs)
stdout_queue, stderr_queue = Queue(), Queue()
def enqueue_output(pipe, queue, prefix):
for line in iter(pipe.readline, ""):
if line.strip():
queue.put((prefix, line.strip()))
pipe.close()
threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True).start()
threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True).start()
while process.poll() is None:
for q in (stdout_queue, stderr_queue):
try:
while True:
prefix, line = q.get_nowait()
if prefix == "STDOUT":
self.logger.info(f"[MinerU] {line}")
else:
self.logger.warning(f"[MinerU] {line}")
except Empty:
pass
time.sleep(0.1)
return_code = process.wait()
if return_code != 0:
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
self.logger.info("[MinerU] Command completed successfully.")
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None): def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from self.page_from = page_from
@ -554,25 +458,6 @@ class MinerUParser(RAGFlowPdfParser):
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[ def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
dict[str, Any]]: dict[str, Any]]:
candidates = []
seen = set()
def add_candidate_path(p: Path):
if p not in seen:
seen.add(p)
candidates.append(p)
if backend.startswith("vlm-"):
add_candidate_path(output_dir / file_stem / "vlm")
if method:
add_candidate_path(output_dir / file_stem / method)
add_candidate_path(output_dir / file_stem / "auto")
else:
if method:
add_candidate_path(output_dir / file_stem / method)
add_candidate_path(output_dir / file_stem / "vlm")
add_candidate_path(output_dir / file_stem / "auto")
json_file = None json_file = None
subdir = None subdir = None
attempted = [] attempted = []
@ -588,33 +473,28 @@ class MinerUParser(RAGFlowPdfParser):
safe_stem = _sanitize_filename(file_stem) safe_stem = _sanitize_filename(file_stem)
allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"} allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}") self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
self.logger.info(f"[MinerU] Searching output candidates: {', '.join(str(c) for c in candidates)}") self.logger.info(f"[MinerU] Searching output in: {output_dir}")
for sub in candidates: jf = output_dir / f"{file_stem}_content_list.json"
jf = sub / f"{file_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying original path: {jf}") self.logger.info(f"[MinerU] Trying original path: {jf}")
attempted.append(jf) attempted.append(jf)
if jf.exists(): if jf.exists():
subdir = sub subdir = output_dir
json_file = jf json_file = jf
break else:
alt = output_dir / f"{safe_stem}_content_list.json"
# MinerU API sanitizes non-ASCII filenames inside the ZIP root and file names.
alt = sub / f"{safe_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying sanitized filename: {alt}") self.logger.info(f"[MinerU] Trying sanitized filename: {alt}")
attempted.append(alt) attempted.append(alt)
if alt.exists(): if alt.exists():
subdir = sub subdir = output_dir
json_file = alt json_file = alt
break else:
nested_alt = output_dir / safe_stem / f"{safe_stem}_content_list.json"
nested_alt = sub / safe_stem / f"{safe_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}") self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
attempted.append(nested_alt) attempted.append(nested_alt)
if nested_alt.exists(): if nested_alt.exists():
subdir = nested_alt.parent subdir = nested_alt.parent
json_file = nested_alt json_file = nested_alt
break
if not json_file: if not json_file:
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}") raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
@ -680,12 +560,12 @@ class MinerUParser(RAGFlowPdfParser):
temp_pdf = None temp_pdf = None
created_tmp_dir = False created_tmp_dir = False
# Assuming the dict is defined as shown parser_cfg = kwargs.get('parser_config', {})
lang = kwargs.get('lang', 'English') lang = parser_cfg.get('mineru_lang') or kwargs.get('lang', 'English')
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Defaults to Chinese if not matched
mineru_method_raw_str = kwargs.get('parser_config', {}).get('mineru_parse_method', 'auto') mineru_method_raw_str = parser_cfg.get('mineru_parse_method', 'auto')
enable_formula = kwargs.get('parser_config', {}).get('mineru_formula_enable', True) enable_formula = parser_cfg.get('mineru_formula_enable', True)
enable_table = kwargs.get('parser_config', {}).get('mineru_enable', True) enable_table = parser_cfg.get('mineru_table_enable', True)
# remove spaces, or mineru crash, and _read_output fail too # remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath) file_path = Path(filepath)
@ -718,7 +598,7 @@ class MinerUParser(RAGFlowPdfParser):
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_")) out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
created_tmp_dir = True created_tmp_dir = True
self.logger.info(f"[MinerU] Output directory: {out_dir}") self.logger.info(f"[MinerU] Output directory: {out_dir} backend={backend} api={self.mineru_api} server_url={server_url or self.mineru_server_url}")
if callback: if callback:
callback(0.15, f"[MinerU] Output directory: {out_dir}") callback(0.15, f"[MinerU] Output directory: {out_dir}")
@ -735,8 +615,8 @@ class MinerUParser(RAGFlowPdfParser):
formula_enable=enable_formula, formula_enable=enable_formula,
table_enable=enable_table, table_enable=enable_table,
) )
self._run_mineru(pdf, out_dir, options, callback=callback) final_out_dir = self._run_mineru(pdf, out_dir, options, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend) outputs = self._read_output(final_out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback: if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

View File

@ -201,64 +201,10 @@ function ensure_docling() {
|| python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --extra-index-url https://pypi.org/simple --no-cache-dir "docling${DOCLING_PIN}" || python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --extra-index-url https://pypi.org/simple --no-cache-dir "docling${DOCLING_PIN}"
} }
# Ensure the MinerU CLI is available when USE_MINERU=true.
#
# Reads:    USE_MINERU, HF_ENDPOINT, MINERU_EXECUTABLE, MINERU_BACKEND
# Exports:  HUGGINGFACE_HUB_ENDPOINT, MINERU_EXECUTABLE
# Returns:  0 when MinerU is disabled or usable, 1 when installation fails.
#
# For vlm-* backends the "mineru[core,vlm]" extra is installed (and vllm is
# added to an existing install when it is missing); otherwise "mineru[core]".
function ensure_mineru() {
    [[ "${USE_MINERU}" == "true" ]] || { echo "[mineru] disabled by USE_MINERU"; return 0; }

    export HUGGINGFACE_HUB_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"

    local default_prefix="/ragflow/uv_tools"
    local venv_dir="${default_prefix}/.venv"
    local exe="${MINERU_EXECUTABLE:-${venv_dir}/bin/mineru}"
    local mineru_backend="${MINERU_BACKEND:-pipeline}"
    local mineru_pkg="mineru[core]"

    if [[ "${mineru_backend}" == vlm-* ]]; then
        mineru_pkg="mineru[core,vlm]"
    fi

    if [[ -x "${exe}" ]]; then
        echo "[mineru] found: ${exe} (MINERU_BACKEND=${mineru_backend})"
        export MINERU_EXECUTABLE="${exe}"

        if [[ "${mineru_backend}" == vlm-* ]]; then
            if ! "${venv_dir}/bin/python3" -c "import importlib.util, sys; sys.exit(0 if importlib.util.find_spec('vllm') else 1)" >/dev/null 2>&1; then
                echo "[mineru] vllm not found for MINERU_BACKEND=${mineru_backend}, installing ${mineru_pkg} ..."
                # Chain with && instead of `set -e`: errexit is ignored inside a
                # subshell that is the left-hand side of `||`, so a failed
                # install would otherwise be masked by a later successful
                # command. No `deactivate` is needed; the subshell's
                # environment is discarded on exit.
                (
                    source "${venv_dir}/bin/activate" \
                        && uv pip install -U "${mineru_pkg}" -i https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.org/simple
                ) || return 1
            fi
        fi
        return 0
    fi

    echo "[mineru] not found, bootstrapping with uv ... (MINERU_BACKEND=${mineru_backend}, pkg=${mineru_pkg})"

    # Same explicit && chaining as above so that any failing step aborts the
    # bootstrap and is reported, independent of the caller's errexit setting.
    (
        mkdir -p "${default_prefix}" \
            && cd "${default_prefix}" \
            && { [[ -d "${venv_dir}" ]] || { echo "[mineru] creating venv at ${venv_dir} ..."; uv venv "${venv_dir}"; }; } \
            && echo "[mineru] installing ${mineru_pkg} into ${venv_dir} ..." \
            && source "${venv_dir}/bin/activate" \
            && uv pip install -U "${mineru_pkg}" -i https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.org/simple
    ) || { echo "[mineru] installation failed: could not install ${mineru_pkg} into ${venv_dir}" >&2; return 1; }

    export MINERU_EXECUTABLE="${exe}"
    if ! "${MINERU_EXECUTABLE}" --help >/dev/null 2>&1; then
        echo "[mineru] installation failed: ${MINERU_EXECUTABLE} not working" >&2
        return 1
    fi

    echo "[mineru] installed: ${MINERU_EXECUTABLE}"
}
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Start components based on flags # Start components based on flags
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
ensure_docling ensure_docling
ensure_mineru
if [[ "${ENABLE_WEBSERVER}" -eq 1 ]]; then if [[ "${ENABLE_WEBSERVER}" -eq 1 ]]; then
echo "Starting nginx..." echo "Starting nginx..."

View File

@ -493,66 +493,37 @@ See [here](./guides/agent/best_practices/accelerate_agent_question_answering.md)
### How to use MinerU to parse PDF documents? ### How to use MinerU to parse PDF documents?
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps: MinerU PDF document parsing is available starting from v0.22.0. RAGFlow works only as a remote client to MinerU (>= 2.6.3) and does not install or execute MinerU locally. To use this feature:
1. Prepare MinerU
```bash
# docker/.env
...
USE_MINERU=true
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables).
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, continue to start the backend service:
```bash
...
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh
```
- **Docker deployment** after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
```bash
# in RAGFlow repo
docker compose -f docker/docker-compose.yml restart
```
1. Prepare a reachable MinerU API service (for example, the FastAPI server provided by MinerU).
2. Configure RAGFlow with remote MinerU settings (environment variables or UI model provider):
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown (which supports PDF parsing), and select **MinerU** in **PDF parser**. 3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown (which supports PDF parsing), and select **MinerU** in **PDF parser**.
4. If you use a custom ingestion pipeline instead, you must also complete the first two steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component. 4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
--- ---
### How to configure MinerU-specific settings? ### How to configure MinerU-specific settings?
The table below summarizes the most frequently used MinerU environment variables: The table below summarizes the most frequently used MinerU environment variables for remote MinerU:
| Environment variable | Description | Default | Example | | Environment variable | Description | Default | Example |
| ---------------------- | ---------------------------------- | ----------------------------------- | ----------------------------------------------------------------------------------------------- | | ---------------------- | ---------------------------------- | ----------------------------------- | ----------------------------------------------------------------------------------------------- |
| `MINERU_EXECUTABLE` | Path to the local MinerU executable | `mineru` | `MINERU_EXECUTABLE=/home/ragflow/uv_tools/.venv/bin/mineru` | | `MINERU_APISERVER` | URL of the MinerU API service | _unset_ | `MINERU_APISERVER=http://your-mineru-server:8886` |
| `MINERU_DELETE_OUTPUT` | Whether to delete MinerU output directory | `1` (do **not** keep the output directory) | `MINERU_DELETE_OUTPUT=0` | | `MINERU_BACKEND` | MinerU parsing backend | `pipeline` | `MINERU_BACKEND=pipeline\|vlm-transformers\|vlm-vllm-engine\|vlm-mlx-engine\|vlm-vllm-async-engine\|vlm-http-client` |
| `MINERU_SERVER_URL` | URL of remote vLLM server (for `vlm-http-client`) | _unset_ | `MINERU_SERVER_URL=http://your-vllm-server-ip:30000` |
| `MINERU_OUTPUT_DIR` | Directory for MinerU output files | System-defined temporary directory | `MINERU_OUTPUT_DIR=/home/ragflow/mineru/output` | | `MINERU_OUTPUT_DIR` | Directory for MinerU output files | System-defined temporary directory | `MINERU_OUTPUT_DIR=/home/ragflow/mineru/output` |
| `MINERU_BACKEND` | MinerU parsing backend | `pipeline` | `MINERU_BACKEND=pipeline\|vlm-transformers\|vlm-vllm-engine\|vlm-http-client` | | `MINERU_DELETE_OUTPUT` | Whether to delete MinerU output directory when a temp dir is used | `1` (delete temp output) | `MINERU_DELETE_OUTPUT=0` |
| `MINERU_SERVER_URL` | URL of remote vLLM server (only for `vlm-http-client` backend) | _unset_ | `MINERU_SERVER_URL=http://your-vllm-server-ip:30000` |
| `MINERU_APISERVER` | URL of remote MinerU service used as the parser (instead of local MinerU) | _unset_ | `MINERU_APISERVER=http://your-mineru-server:port` |
1. Set `MINERU_EXECUTABLE` to the path to the MinerU executable if the default `mineru` is not on `PATH`. 1. Set `MINERU_APISERVER` to point RAGFlow to your MinerU API server.
2. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's output. (Default: `1`, which deletes temporary output.) 2. Set `MINERU_BACKEND` to specify a parsing backend.
3. Set `MINERU_OUTPUT_DIR` to specify the output directory for MinerU; otherwise, a system temp directory is used. 3. If using the `"vlm-http-client"` backend, set `MINERU_SERVER_URL` to your vLLM server's URL. MinerU API expects `backend=vlm-http-client` and `server_url=http://<server>:30000` in the request body.
4. Set `MINERU_BACKEND` to specify a parsing backend: 4. Set `MINERU_OUTPUT_DIR` to specify where RAGFlow stores MinerU API output; otherwise, a system temp directory is used.
- `"pipeline"` (default): The traditional multimodel pipeline. 5. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's temp output (useful for debugging).
- `"vlm-transformers"`: A vision-language model using HuggingFace Transformers.
- `"vlm-vllm-engine"`: A vision-language model using a local vLLM engine (requires a local GPU).
- `"vlm-http-client"`: A vision-language model via HTTP client to a remote vLLM server (RAGFlow only requires CPU).
5. If using the `"vlm-http-client"` backend, you must also set `MINERU_SERVER_URL` to your vLLM server's URL.
6. If configuring RAGFlow to call a *remote* MinerU service, set `MINERU_APISERVER` to the MinerU server's URL.
:::tip NOTE :::tip NOTE
For information about other environment variables natively supported by MinerU, see [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description). For information about other environment variables natively supported by MinerU, see [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description).
@ -562,21 +533,16 @@ For information about other environment variables natively supported by MinerU,
### How to use MinerU with a vLLM server for document parsing? ### How to use MinerU with a vLLM server for document parsing?
RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server. With this configuration, RAGFlow will connect to your remote vLLM server as a client and use its powerful GPU resources for document parsing. This significantly improves performance for parsing complex documents while reducing the resources required on your RAGFlow server. To configure MinerU with a vLLM server: RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server while calling MinerU via HTTP. To configure:
1. Set up a vLLM server running MinerU: 1. Ensure a MinerU API service is reachable (for example `http://mineru-host:8886`).
2. Set up or point to a vLLM HTTP server (for example `http://vllm-host:30000`).
```bash 3. Configure the following in your **docker/.env** file (or your shell if running from source):
mineru-vllm-server --port 30000 - `MINERU_APISERVER=http://mineru-host:8886`
```
2. Configure the following environment variables in your **docker/.env** file (or your shell if running from source):
- `MINERU_EXECUTABLE=/home/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable)
- `MINERU_BACKEND="vlm-http-client"` - `MINERU_BACKEND="vlm-http-client"`
- `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"` - `MINERU_SERVER_URL="http://vllm-host:30000"`
MinerU API calls expect `backend=vlm-http-client` and `server_url=http://<server>:30000` in the request body.
3. Complete the rest of the standard MinerU setup steps as described [here](#how-to-configure-mineru-specific-settings). 4. Configure `MINERU_OUTPUT_DIR` / `MINERU_DELETE_OUTPUT` as desired to manage the returned zip/JSON before ingestion.
:::tip NOTE :::tip NOTE
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server. When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.

View File

@ -40,56 +40,21 @@ The output of a PDF parser is `json`. In the PDF parser, you select the parsing
- A third-party visual model from a specific model provider. - A third-party visual model from a specific model provider.
:::danger IMPORTANT :::danger IMPORTANT
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps: MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a **remote client** for MinerU, calling the MinerU API to parse documents, reading the returned output files, and ingesting the parsed content. To use this feature:
:::
1. Prepare MinerU: 1. Prepare a reachable MinerU API service (FastAPI server).
2. Configure RAGFlow with the remote MinerU settings (env or UI model provider):
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method that supports PDF parsing from the **Built-in** dropdown, and then select **MinerU** in **PDF parser**.
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`): :::note
All MinerU environment variables are optional. If set, RAGFlow will auto-provision a MinerU OCR model for the tenant on first use with these values. To avoid auto-provisioning, configure MinerU solely through the UI and leave the env vars unset.
```bash
mkdir -p "$HOME/uv_tools"
cd "$HOME/uv_tools"
uv venv .venv
source .venv/bin/activate
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
# or
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
```
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
```bash
# docker/.env
...
USE_MINERU=true
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
```bash
# in RAGFlow repo
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
export MINERU_DELETE_OUTPUT=0 # keep output directory
export MINERU_BACKEND=pipeline # or another backend you prefer
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh
```
- **Docker deployment** after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
```bash
# in RAGFlow repo
docker compose -f docker/docker-compose.yml restart
```
3. Restart the ragflow-server.
::: :::
:::caution WARNING :::caution WARNING

View File

@ -33,65 +33,28 @@ RAGFlow isn't one-size-fits-all. It is built for flexibility and supports deeper
2. Select the option that works best with your scenario: 2. Select the option that works best with your scenario:
- DeepDoc: (Default) The default visual model performing OCR, TSR, and DLR tasks on PDFs, but can be time-consuming. - DeepDoc: (Default) The default visual model performing OCR, TSR, and DLR tasks on PDFs, but can be time-consuming.
- Naive: Skip OCR, TSR, and DLR tasks if *all* your PDFs are plain text. - Naive: Skip OCR, TSR, and DLR tasks if _all_ your PDFs are plain text.
- [MinerU](https://github.com/opendatalab/MinerU): (Experimental) An open-source tool that converts PDF into machine-readable formats. - [MinerU](https://github.com/opendatalab/MinerU): (Experimental) An open-source tool that converts PDF into machine-readable formats.
- [Docling](https://github.com/docling-project/docling): (Experimental) An open-source document processing tool for gen AI. - [Docling](https://github.com/docling-project/docling): (Experimental) An open-source document processing tool for gen AI.
- A third-party visual model from a specific model provider. - A third-party visual model from a specific model provider.
:::danger IMPORTANT :::danger IMPORTANT
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps: MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a **remote client** for MinerU, calling the MinerU API to parse documents, reading the returned output files, and ingesting the parsed content. To use this feature:
1. Prepare MinerU: 1. Prepare a reachable MinerU API service (FastAPI server).
2. Configure RAGFlow with the remote MinerU settings (env or UI model provider):
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method that supports PDF parsing from the **Built-in** dropdown, and then select **MinerU** in **PDF parser**.
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
:::
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`): :::note
All MinerU environment variables are optional. When they are set, RAGFlow will auto-create a MinerU OCR model for a tenant on first use using these values. If you do not want this auto-provisioning, configure MinerU only through the UI and leave the env vars unset.
```bash
mkdir -p "$HOME/uv_tools"
cd "$HOME/uv_tools"
uv venv .venv
source .venv/bin/activate
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
# or
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
```
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
```bash
# docker/.env
...
USE_MINERU=true
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
```bash
# in RAGFlow repo
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
export MINERU_DELETE_OUTPUT=0 # keep output directory
export MINERU_BACKEND=pipeline # or another backend you prefer
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh
```
- **Docker deployment** after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
```bash
# in RAGFlow repo
docker compose -f docker/docker-compose.yml restart
```
3. Restart the ragflow-server.
4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**.
5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
::: :::
:::caution WARNING :::caution WARNING
@ -107,4 +70,3 @@ Use a visual model to extract data if your PDFs contain formatted or image-based
### Can I select a visual model to parse my DOCX files? ### Can I select a visual model to parse my DOCX files?
No, you cannot. This dropdown menu is for PDFs only. To use this feature, convert your DOCX files to PDF first. No, you cannot. This dropdown menu is for PDFs only. To use this feature, convert your DOCX files to PDF first.

View File

@ -280,6 +280,7 @@ class Parser(ProcessBase):
binary=blob, binary=blob,
callback=self.callback, callback=self.callback,
parse_method=conf.get("mineru_parse_method", "raw"), parse_method=conf.get("mineru_parse_method", "raw"),
lang=conf.get("lang", "Chinese"),
) )
bboxes = [] bboxes = []
for t, poss in lines: for t, poss in lines:

View File

@ -398,7 +398,7 @@ class JinaMultiVecEmbed(Base):
ress.append(chunk_emb) ress.append(chunk_emb)
token_count +=total_token_count_from_response(res) token_count += total_token_count_from_response(res)
except Exception as _e: except Exception as _e:
log_exception(_e, response) log_exception(_e, response)
raise Exception(f"Error: {response}") raise Exception(f"Error: {response}")

View File

@ -16,7 +16,7 @@
import json import json
import logging import logging
import os import os
from typing import Any, Optional, Tuple from typing import Any, Optional
from deepdoc.parser.mineru_parser import MinerUParser from deepdoc.parser.mineru_parser import MinerUParser
@ -25,7 +25,7 @@ class Base:
def __init__(self, key: str | dict, model_name: str, **kwargs): def __init__(self, key: str | dict, model_name: str, **kwargs):
self.model_name = model_name self.model_name = model_name
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]: def parse_pdf(self, filepath: str, binary=None, **kwargs) -> tuple[Any, Any]:
raise NotImplementedError("Please implement parse_pdf!") raise NotImplementedError("Please implement parse_pdf!")
@ -56,21 +56,22 @@ class MinerUOcrModel(Base, MinerUParser):
self.mineru_backend = _resolve_config("mineru_backend", "MINERU_BACKEND", "pipeline") self.mineru_backend = _resolve_config("mineru_backend", "MINERU_BACKEND", "pipeline")
self.mineru_server_url = _resolve_config("mineru_server_url", "MINERU_SERVER_URL", "") self.mineru_server_url = _resolve_config("mineru_server_url", "MINERU_SERVER_URL", "")
self.mineru_delete_output = bool(int(_resolve_config("mineru_delete_output", "MINERU_DELETE_OUTPUT", 1))) self.mineru_delete_output = bool(int(_resolve_config("mineru_delete_output", "MINERU_DELETE_OUTPUT", 1)))
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
logging.info(f"Parsed MinerU config: {config}") logging.info(
f"Parsed MinerU config: backend={self.mineru_backend} api={self.mineru_api} server_url={self.mineru_server_url} output_dir={self.mineru_output_dir} delete_output={self.mineru_delete_output}"
)
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url) MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]: def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
backend = backend or self.mineru_backend backend = backend or self.mineru_backend
server_url = server_url or self.mineru_server_url server_url = server_url or self.mineru_server_url
return self.check_installation(backend=backend, server_url=server_url) return self.check_installation(backend=backend, server_url=server_url)
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs): def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
ok, reason = self.check_available() ok, reason = self.check_available()
if not ok: if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.") raise RuntimeError(f"MinerU server not accessible: {reason}")
sections, tables = MinerUParser.parse_pdf( sections, tables = MinerUParser.parse_pdf(
self, self,

View File

@ -69,7 +69,6 @@ from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
from common.exceptions import TaskCanceledException from common.exceptions import TaskCanceledException
from common import settings from common import settings
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
from common.misc_utils import check_and_install_mineru
BATCH_SIZE = 64 BATCH_SIZE = 64
@ -1169,7 +1168,6 @@ async def main():
show_configs() show_configs()
settings.init_settings() settings.init_settings()
settings.check_and_install_torch() settings.check_and_install_torch()
check_and_install_mineru()
logging.info(f'default embedding config: {settings.EMBEDDING_CFG}') logging.info(f'default embedding config: {settings.EMBEDDING_CFG}')
settings.print_rag_settings() settings.print_rag_settings()
if sys.platform != "win32": if sys.platform != "win32":

View File

@ -118,6 +118,10 @@ export function ChunkMethodDialog({
auto_questions: z.coerce.number().optional(), auto_questions: z.coerce.number().optional(),
html4excel: z.boolean().optional(), html4excel: z.boolean().optional(),
toc_extraction: z.boolean().optional(), toc_extraction: z.boolean().optional(),
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
mineru_lang: z.string().optional(),
// raptor: z // raptor: z
// .object({ // .object({
// use_raptor: z.boolean().optional(), // use_raptor: z.boolean().optional(),
@ -166,6 +170,9 @@ export function ChunkMethodDialog({
name: 'parser_id', name: 'parser_id',
control: form.control, control: form.control,
}); });
const isMineruSelected =
selectedTag?.toLowerCase().includes('mineru') ||
layoutRecognize?.toLowerCase?.()?.includes('mineru');
const isPdf = documentExtension === 'pdf'; const isPdf = documentExtension === 'pdf';
@ -328,7 +335,7 @@ export function ChunkMethodDialog({
className="space-y-3" className="space-y-3"
> >
{showOne && ( {showOne && (
<LayoutRecognizeFormField></LayoutRecognizeFormField> <LayoutRecognizeFormField showMineruOptions={false} />
)} )}
{showMaxTokenNumber && ( {showMaxTokenNumber && (
<> <>
@ -345,9 +352,16 @@ export function ChunkMethodDialog({
)} )}
</FormContainer> </FormContainer>
<FormContainer <FormContainer
show={showAutoKeywords(selectedTag) || showExcelToHtml} show={
isMineruSelected ||
showAutoKeywords(selectedTag) ||
showExcelToHtml
}
className="space-y-3" className="space-y-3"
> >
{isMineruSelected && (
<LayoutRecognizeFormField showMineruOptions />
)}
{selectedTag === DocumentParserType.Naive && ( {selectedTag === DocumentParserType.Naive && (
<EnableTocToggle /> <EnableTocToggle />
)} )}

View File

@ -18,6 +18,10 @@ export function useDefaultParserValues() {
auto_questions: 0, auto_questions: 0,
html4excel: false, html4excel: false,
toc_extraction: false, toc_extraction: false,
mineru_parse_method: 'auto',
mineru_formula_enable: true,
mineru_table_enable: true,
mineru_lang: 'English',
// raptor: { // raptor: {
// use_raptor: false, // use_raptor: false,
// prompt: t('knowledgeConfiguration.promptText'), // prompt: t('knowledgeConfiguration.promptText'),

View File

@ -5,6 +5,7 @@ import { cn } from '@/lib/utils';
import { camelCase } from 'lodash'; import { camelCase } from 'lodash';
import { ReactNode, useMemo } from 'react'; import { ReactNode, useMemo } from 'react';
import { useFormContext } from 'react-hook-form'; import { useFormContext } from 'react-hook-form';
import { MinerUOptionsFormField } from './mineru-options-form-field';
import { SelectWithSearch } from './originui/select-with-search'; import { SelectWithSearch } from './originui/select-with-search';
import { import {
FormControl, FormControl,
@ -26,11 +27,13 @@ export function LayoutRecognizeFormField({
horizontal = true, horizontal = true,
optionsWithoutLLM, optionsWithoutLLM,
label, label,
showMineruOptions = true,
}: { }: {
name?: string; name?: string;
horizontal?: boolean; horizontal?: boolean;
optionsWithoutLLM?: { value: string; label: string }[]; optionsWithoutLLM?: { value: string; label: string }[];
label?: ReactNode; label?: ReactNode;
showMineruOptions?: boolean;
}) { }) {
const form = useFormContext(); const form = useFormContext();
@ -79,6 +82,7 @@ export function LayoutRecognizeFormField({
name={name} name={name}
render={({ field }) => { render={({ field }) => {
return ( return (
<>
<FormItem className={'items-center space-y-0 '}> <FormItem className={'items-center space-y-0 '}>
<div <div
className={cn('flex', { className={cn('flex', {
@ -108,6 +112,8 @@ export function LayoutRecognizeFormField({
<FormMessage /> <FormMessage />
</div> </div>
</FormItem> </FormItem>
{showMineruOptions && <MinerUOptionsFormField />}
</>
); );
}} }}
/> />

View File

@ -7,10 +7,38 @@ import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']); const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']);
const languageOptions = buildOptions([
'English',
'Chinese',
'Traditional Chinese',
'Russian',
'Ukrainian',
'Indonesian',
'Spanish',
'Vietnamese',
'Japanese',
'Korean',
'Portuguese BR',
'German',
'French',
'Italian',
'Tamil',
'Telugu',
'Kannada',
'Thai',
'Greek',
'Hindi',
]);
export function MinerUOptionsFormField() { export function MinerUOptionsFormField({
namePrefix = 'parser_config',
}: {
namePrefix?: string;
}) {
const form = useFormContext(); const form = useFormContext();
const { t } = useTranslation(); const { t } = useTranslation();
const buildName = (field: string) =>
namePrefix ? `${namePrefix}.${field}` : field;
const layoutRecognize = useWatch({ const layoutRecognize = useWatch({
control: form.control, control: form.control,
@ -33,7 +61,7 @@ export function MinerUOptionsFormField() {
</div> </div>
<RAGFlowFormItem <RAGFlowFormItem
name="parser_config.mineru_parse_method" name={buildName('mineru_parse_method')}
label={t('knowledgeConfiguration.mineruParseMethod', 'Parse Method')} label={t('knowledgeConfiguration.mineruParseMethod', 'Parse Method')}
tooltip={t( tooltip={t(
'knowledgeConfiguration.mineruParseMethodTip', 'knowledgeConfiguration.mineruParseMethodTip',
@ -52,7 +80,26 @@ export function MinerUOptionsFormField() {
</RAGFlowFormItem> </RAGFlowFormItem>
<RAGFlowFormItem <RAGFlowFormItem
name="parser_config.mineru_formula_enable" name={buildName('mineru_lang')}
label={t('knowledgeConfiguration.mineruLanguage', 'Language')}
tooltip={t(
'knowledgeConfiguration.mineruLanguageTip',
'Preferred OCR language for MinerU.',
)}
horizontal={true}
>
{(field) => (
<RAGFlowSelect
value={field.value || 'English'}
onChange={field.onChange}
options={languageOptions}
placeholder={t('common.selectPlaceholder', 'Select value')}
/>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildName('mineru_formula_enable')}
label={t( label={t(
'knowledgeConfiguration.mineruFormulaEnable', 'knowledgeConfiguration.mineruFormulaEnable',
'Formula Recognition', 'Formula Recognition',
@ -73,7 +120,7 @@ export function MinerUOptionsFormField() {
</RAGFlowFormItem> </RAGFlowFormItem>
<RAGFlowFormItem <RAGFlowFormItem
name="parser_config.mineru_table_enable" name={buildName('mineru_table_enable')}
label={t( label={t(
'knowledgeConfiguration.mineruTableEnable', 'knowledgeConfiguration.mineruTableEnable',
'Table Recognition', 'Table Recognition',

View File

@ -34,8 +34,13 @@ export interface IDocumentInfo {
export interface IParserConfig { export interface IParserConfig {
delimiter?: string; delimiter?: string;
html4excel?: boolean; html4excel?: boolean;
layout_recognize?: boolean; layout_recognize?: string;
pages: any[]; pages?: any[];
chunk_token_num?: number;
auto_keywords?: number;
auto_questions?: number;
toc_extraction?: boolean;
task_page_size?: number;
raptor?: Raptor; raptor?: Raptor;
graphrag?: GraphRag; graphrag?: GraphRag;
} }

View File

@ -1,8 +1,13 @@
export interface IChangeParserConfigRequestBody { export interface IChangeParserConfigRequestBody {
pages: number[][]; pages?: number[][];
chunk_token_num: number; chunk_token_num?: number;
layout_recognize: boolean; layout_recognize?: string;
task_page_size: number; task_page_size?: number;
delimiter?: string;
auto_keywords?: number;
auto_questions?: number;
html4excel?: boolean;
toc_extraction?: boolean;
} }
export interface IChangeParserRequestBody { export interface IChangeParserRequestBody {

View File

@ -7,7 +7,6 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field'; import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field'; import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
import { MinerUOptionsFormField } from '@/components/mineru-options-form-field';
import { import {
ConfigurationFormContainer, ConfigurationFormContainer,
MainContainer, MainContainer,
@ -19,7 +18,6 @@ export function NaiveConfiguration() {
<MainContainer> <MainContainer>
<ConfigurationFormContainer> <ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField> <LayoutRecognizeFormField></LayoutRecognizeFormField>
<MinerUOptionsFormField></MinerUOptionsFormField>
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField> <MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField> <DelimiterFormField></DelimiterFormField>
<ChildrenDelimiterForm /> <ChildrenDelimiterForm />

View File

@ -37,6 +37,7 @@ export const formSchema = z
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(), mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
mineru_formula_enable: z.boolean().optional(), mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(), mineru_table_enable: z.boolean().optional(),
mineru_lang: z.string().optional(),
raptor: z raptor: z
.object({ .object({
use_raptor: z.boolean().optional(), use_raptor: z.boolean().optional(),

View File

@ -75,6 +75,7 @@ export default function DatasetSettings() {
mineru_parse_method: 'auto', mineru_parse_method: 'auto',
mineru_formula_enable: true, mineru_formula_enable: true,
mineru_table_enable: true, mineru_table_enable: true,
mineru_lang: 'English',
raptor: { raptor: {
use_raptor: true, use_raptor: true,
max_token: 256, max_token: 256,