mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-19 12:06:42 +08:00
Refa: only support MinerU-API now (#11977)
### What problem does this PR solve? Only support MinerU-API now, still need to complete frontend for pipeline to allow the configuration of MinerU options. ### Type of change - [x] Refactoring
This commit is contained in:
@ -23,8 +23,6 @@ import subprocess
|
||||
import sys
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Dict
|
||||
|
||||
def get_uuid():
    """Return a newly generated version-1 UUID as a 32-character hex string."""
    fresh = uuid.uuid1()
    return fresh.hex
|
||||
@ -108,152 +106,3 @@ def pip_install_torch():
|
||||
logging.info("Installing pytorch")
|
||||
pkg_names = ["torch>=2.5.0,<3.0.0"]
|
||||
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names])
|
||||
|
||||
|
||||
def parse_mineru_paths() -> Dict[str, Path]:
    """
    Derive MinerU-related paths from the MINERU_EXECUTABLE environment variable.

    Expected layout (default convention):
        MINERU_EXECUTABLE = /home/user/uv_tools/.venv/bin/mineru

    From the executable path we derive:
        - mineru_exec : full path to the mineru executable
        - venv_dir    : two levels above the executable (the ".venv" directory)
        - tools_dir   : the parent of venv_dir (e.g. "uv_tools")

    Resolution rules:
        - Unset, empty or whitespace-only MINERU_EXECUTABLE falls back to
          $HOME/uv_tools/.venv/bin/mineru.
        - A bare command name (no directory component, e.g. "mineru") is
          looked up on PATH first, because that is how a bare name is located
          when it is executed; resolving it against the current working
          directory would yield unrelated venv/tools directories. If it is
          not on PATH, it is resolved relative to the working directory as
          before.
        - Anything else is treated as a path: "~" is expanded and the result
          is made absolute with symlinks resolved.

    Returns:
        A dict with keys "mineru_exec", "venv_dir" and "tools_dir", each a Path.
    """
    import shutil  # local import keeps this block self-contained

    configured = (os.getenv("MINERU_EXECUTABLE") or "").strip()

    if not configured:
        # Default convention: $HOME/uv_tools/.venv/bin/mineru
        tools_dir = Path(os.path.expanduser("~")) / "uv_tools"
        venv_dir = tools_dir / ".venv"
        mineru_exec = venv_dir / "bin" / "mineru"
    else:
        candidate = Path(configured).expanduser()
        if len(candidate.parts) == 1:
            # Bare command name: prefer the location the OS would actually run.
            located = shutil.which(configured)
            if located:
                candidate = Path(located)
        mineru_exec = candidate.resolve()
        venv_dir = mineru_exec.parent.parent
        tools_dir = venv_dir.parent

    return {
        "mineru_exec": mineru_exec,
        "venv_dir": venv_dir,
        "tools_dir": tools_dir,
    }
|
||||
|
||||
|
||||
@once
def check_and_install_mineru() -> None:
    """
    Ensure a working MinerU executable is available, installing it if needed.

    Behavior:
        1. Do nothing unless USE_MINERU is one of true/yes/1/y
           (case-insensitive, surrounding whitespace ignored).
        2. Resolve mineru_exec / venv_dir / tools_dir via parse_mineru_paths().
        3. If the executable exists, is executable and `--help` succeeds,
           export MINERU_EXECUTABLE for this process and return.
        4. Otherwise:
            - create tools_dir,
            - create the virtualenv with `uv venv` if it is missing,
            - install mineru[core] with `uv pip install`, falling back to
              mineru[all] if that fails,
            - validate again with `--help`.
        5. Export MINERU_EXECUTABLE and log success.

    NOTE:
        This function intentionally does NOT return the path; status is
        reported through logging and the MINERU_EXECUTABLE variable.
        NOTE(review): `once` is defined elsewhere in this file; presumably it
        limits this function to a single execution - confirm.

    Raises:
        RuntimeError: the executable still does not work after installation.
        subprocess.CalledProcessError: `uv venv` or the mineru[all] fallback
            install exited with a non-zero status.
        OSError: `uv` itself could not be started.
    """
    # Honor every spelling the contract above promises, not just "true".
    use_mineru = os.getenv("USE_MINERU", "false").strip().lower()
    if use_mineru not in ("true", "yes", "1", "y"):
        logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru)
        return

    # Resolve expected paths
    paths = parse_mineru_paths()
    mineru_exec: Path = paths["mineru_exec"]
    venv_dir: Path = paths["venv_dir"]
    tools_dir: Path = paths["tools_dir"]

    # Environment for installation/execution: behave as if the venv were activated.
    env = os.environ.copy()
    env["VIRTUAL_ENV"] = str(venv_dir)
    env["PATH"] = str(venv_dir / "bin") + os.pathsep + env.get("PATH", "")

    # Configure HuggingFace endpoint without overriding an explicit setting.
    env.setdefault("HUGGINGFACE_HUB_ENDPOINT", os.getenv("HF_ENDPOINT") or "https://hf-mirror.com")

    def mineru_works() -> bool:
        """Return True when `<mineru_exec> --help` exits with status 0."""
        try:
            subprocess.check_call(
                [str(mineru_exec), "--help"],
                stdout=subprocess.DEVNULL,
                # check_call never reads the child's pipes; a PIPE here could
                # fill up and block the child, so discard stderr instead.
                stderr=subprocess.DEVNULL,
                env=env,
            )
        except (subprocess.SubprocessError, OSError, ValueError):
            # Non-zero exit, missing/non-runnable file or invalid arguments.
            return False
        return True

    def pip_install(pkg: str) -> None:
        """Install or upgrade *pkg* into the virtualenv using uv."""
        subprocess.check_call(
            [
                "uv", "pip", "install", "-U", pkg,
                "-i", "https://mirrors.aliyun.com/pypi/simple",
                "--extra-index-url", "https://pypi.org/simple",
            ],
            cwd=str(tools_dir),
            env=env,
        )

    # If MinerU is already installed and functional
    if mineru_exec.is_file() and os.access(mineru_exec, os.X_OK) and mineru_works():
        logging.info("MinerU already installed.")
        os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
        return

    logging.info("MinerU not found. Installing into virtualenv: %s", venv_dir)

    # Ensure parent directory exists
    tools_dir.mkdir(parents=True, exist_ok=True)

    # Create venv if missing
    if not venv_dir.exists():
        subprocess.check_call(
            ["uv", "venv", str(venv_dir)],
            cwd=str(tools_dir),
            env=env,
        )
    else:
        logging.info("Virtual environment exists at %s. Reusing it.", venv_dir)

    # Install core version first; fallback to all
    try:
        logging.info("Installing mineru[core] ...")
        pip_install("mineru[core]")
    except subprocess.CalledProcessError:
        logging.warning("mineru[core] installation failed. Installing mineru[all] ...")
        pip_install("mineru[all]")

    # Validate installation
    if not mineru_works():
        logging.error("MinerU installation failed: %s does not work.", mineru_exec)
        raise RuntimeError(f"MinerU installation failed: {mineru_exec} is not functional")

    os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
    logging.info("MinerU installation completed successfully. Executable: %s", mineru_exec)
|
||||
|
||||
@ -16,19 +16,15 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import zipfile
|
||||
from dataclasses import dataclass
|
||||
from io import BytesIO
|
||||
from os import PathLike
|
||||
from pathlib import Path
|
||||
from queue import Empty, Queue
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import numpy as np
|
||||
@ -137,10 +133,8 @@ class MinerUParseOptions:
|
||||
|
||||
class MinerUParser(RAGFlowPdfParser):
|
||||
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
|
||||
self.mineru_path = Path(mineru_path)
|
||||
self.mineru_api = mineru_api.rstrip("/")
|
||||
self.mineru_server_url = mineru_server_url.rstrip("/")
|
||||
self.using_api = False
|
||||
self.outlines = []
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
|
||||
@ -189,105 +183,59 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||
reason = ""
|
||||
|
||||
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine"]
|
||||
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine"]
|
||||
if backend not in valid_backends:
|
||||
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
||||
reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
|
||||
self.logger.warning(reason)
|
||||
return False, reason
|
||||
|
||||
subprocess_kwargs = {
|
||||
"capture_output": True,
|
||||
"text": True,
|
||||
"check": True,
|
||||
"encoding": "utf-8",
|
||||
"errors": "ignore",
|
||||
}
|
||||
if not self.mineru_api:
|
||||
reason = "[MinerU] MINERU_APISERVER not configured."
|
||||
self.logger.warning(reason)
|
||||
return False, reason
|
||||
|
||||
if platform.system() == "Windows":
|
||||
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
|
||||
|
||||
if server_url is None:
|
||||
server_url = self.mineru_server_url
|
||||
|
||||
if backend == "vlm-http-client" and server_url:
|
||||
api_openapi = f"{self.mineru_api}/openapi.json"
|
||||
try:
|
||||
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
|
||||
self.logger.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
|
||||
if server_accessible:
|
||||
self.using_api = False # We are using http client, not API
|
||||
api_ok = self._is_http_endpoint_valid(api_openapi)
|
||||
self.logger.info(f"[MinerU] API openapi.json reachable={api_ok} url={api_openapi}")
|
||||
if not api_ok:
|
||||
reason = f"[MinerU] MinerU API not accessible: {api_openapi}"
|
||||
return False, reason
|
||||
except Exception as exc:
|
||||
reason = f"[MinerU] MinerU API check failed: {exc}"
|
||||
self.logger.warning(reason)
|
||||
return False, reason
|
||||
|
||||
if backend == "vlm-http-client":
|
||||
resolved_server = server_url or self.mineru_server_url
|
||||
if not resolved_server:
|
||||
reason = "[MinerU] MINERU_SERVER_URL required for vlm-http-client backend."
|
||||
self.logger.warning(reason)
|
||||
return False, reason
|
||||
try:
|
||||
server_ok = self._is_http_endpoint_valid(resolved_server)
|
||||
self.logger.info(f"[MinerU] vlm-http-client server check reachable={server_ok} url={resolved_server}")
|
||||
except Exception as exc:
|
||||
self.logger.warning(f"[MinerU] vlm-http-client server probe failed: {resolved_server}: {exc}")
|
||||
|
||||
return True, reason
|
||||
else:
|
||||
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
|
||||
self.logger.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
|
||||
return False, reason
|
||||
except Exception as e:
|
||||
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
|
||||
try:
|
||||
response = requests.get(server_url, timeout=5)
|
||||
self.logger.info(
|
||||
f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
|
||||
self.using_api = False
|
||||
return True, reason
|
||||
except Exception as e:
|
||||
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
|
||||
self.logger.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
|
||||
return False, reason
|
||||
|
||||
try:
|
||||
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
|
||||
version_info = result.stdout.strip()
|
||||
if version_info:
|
||||
self.logger.info(f"[MinerU] Detected version: {version_info}")
|
||||
else:
|
||||
self.logger.info("[MinerU] Detected MinerU, but version info is empty.")
|
||||
return True, reason
|
||||
except subprocess.CalledProcessError as e:
|
||||
self.logger.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
|
||||
except FileNotFoundError:
|
||||
self.logger.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
|
||||
except Exception as e:
|
||||
self.logger.error(f"[MinerU] Unexpected error during installation check: {e}")
|
||||
|
||||
# If executable check fails, try API check
|
||||
try:
|
||||
if self.mineru_api:
|
||||
# check openapi.json
|
||||
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
|
||||
if not openapi_exists:
|
||||
reason = "[MinerU] Failed to detect vaild MinerU API server"
|
||||
return openapi_exists, reason
|
||||
self.logger.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
|
||||
self.using_api = openapi_exists
|
||||
return openapi_exists, reason
|
||||
else:
|
||||
reason = "[MinerU] api not exists. Setting MINERU_SERVER_URL if your backend is vlm-http-client."
|
||||
self.logger.info(reason)
|
||||
return False, reason
|
||||
except Exception as e:
|
||||
reason = f"[MinerU] Unexpected error during api check: {e}"
|
||||
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
|
||||
return False, reason
|
||||
|
||||
def _run_mineru(
|
||||
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
|
||||
):
|
||||
if self.using_api:
|
||||
self._run_mineru_api(input_path, output_dir, options, callback)
|
||||
else:
|
||||
self._run_mineru_executable(input_path, output_dir, options, callback)
|
||||
|
||||
def _run_mineru_api(self, input_path: Path, output_dir: Path, options: MinerUParseOptions,
|
||||
callback: Optional[Callable] = None):
|
||||
output_zip_path = os.path.join(str(output_dir), "output.zip")
|
||||
) -> Path:
|
||||
return self._run_mineru_api(input_path, output_dir, options, callback)
|
||||
|
||||
def _run_mineru_api(
|
||||
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
|
||||
) -> Path:
|
||||
pdf_file_path = str(input_path)
|
||||
|
||||
if not os.path.exists(pdf_file_path):
|
||||
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
|
||||
|
||||
pdf_file_name = Path(pdf_file_path).stem.strip()
|
||||
output_path = os.path.join(str(output_dir), pdf_file_name, options.method)
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
|
||||
output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")
|
||||
|
||||
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
|
||||
|
||||
@ -309,9 +257,19 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
"end_page_id": 99999,
|
||||
}
|
||||
|
||||
if options.server_url:
|
||||
data["server_url"] = options.server_url
|
||||
elif self.mineru_server_url:
|
||||
data["server_url"] = self.mineru_server_url
|
||||
|
||||
print("--------------------------------", flush=True)
|
||||
print(f"{data=}", flush=True)
|
||||
print(f"{options=}", flush=True)
|
||||
print("--------------------------------", flush=True)
|
||||
|
||||
headers = {"Accept": "application/json"}
|
||||
try:
|
||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
|
||||
if callback:
|
||||
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
|
||||
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
|
||||
@ -333,65 +291,11 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
if callback:
|
||||
callback(0.40, f"[MinerU] Unzip to {output_path}...")
|
||||
else:
|
||||
self.logger.warning("[MinerU] not zip returned from api:%s " % response.headers.get("Content-Type"))
|
||||
self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
|
||||
except Exception as e:
|
||||
raise RuntimeError(f"[MinerU] api failed with exception {e}")
|
||||
self.logger.info("[MinerU] Api completed successfully.")
|
||||
|
||||
def _run_mineru_executable(
    self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
):
    """
    Run the MinerU command-line tool on one input and mirror its output to the logger.

    The command is `<mineru_path> -p <input_path> -o <output_dir> -m <method>`,
    extended with `-b <backend>` and `-l <lang>` when those options are set and
    with `-u <server_url>` only for the "vlm-http-client" backend.

    Child stdout lines are logged at INFO level and stderr lines at WARNING
    level, each prefixed with "[MinerU]". Blank lines are dropped.

    Args:
        input_path: File handed to MinerU via `-p`.
        output_dir: Directory handed to MinerU via `-o`.
        options: Supplies `method`, `backend`, `lang` and `server_url`.
        callback: Accepted for signature parity with the API runner; not used here.

    Raises:
        RuntimeError: the process exited with a non-zero status.
    """
    cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", options.method]
    if options.backend:
        cmd.extend(["-b", options.backend])
    if options.lang:
        cmd.extend(["-l", options.lang])
    if options.server_url and options.backend == "vlm-http-client":
        cmd.extend(["-u", options.server_url])

    self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")

    subprocess_kwargs = {
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
        "text": True,
        "encoding": "utf-8",
        "errors": "ignore",
        "bufsize": 1,
    }

    if platform.system() == "Windows":
        # Avoid flashing a console window for the child process.
        subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)

    process = subprocess.Popen(cmd, **subprocess_kwargs)
    stdout_queue, stderr_queue = Queue(), Queue()

    def enqueue_output(pipe, queue, prefix):
        # Reader thread body: forward every non-blank line until EOF.
        for line in iter(pipe.readline, ""):
            if line.strip():
                queue.put((prefix, line.strip()))
        pipe.close()

    readers = [
        threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True),
        threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True),
    ]
    for reader in readers:
        reader.start()

    def flush_queues():
        # Log everything currently queued without blocking.
        for q in (stdout_queue, stderr_queue):
            while True:
                try:
                    prefix, line = q.get_nowait()
                except Empty:
                    break
                if prefix == "STDOUT":
                    self.logger.info(f"[MinerU] {line}")
                else:
                    self.logger.warning(f"[MinerU] {line}")

    while process.poll() is None:
        flush_queues()
        time.sleep(0.1)

    # The process has exited, but the readers may still be delivering its
    # final lines (often the actual error message). Wait for them to reach
    # EOF, bounded in case a grandchild keeps the pipe open, then log the rest.
    for reader in readers:
        reader.join(timeout=5)
    flush_queues()

    return_code = process.wait()
    if return_code != 0:
        raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
    self.logger.info("[MinerU] Command completed successfully.")
|
||||
return Path(output_path)
|
||||
|
||||
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
|
||||
self.page_from = page_from
|
||||
@ -554,25 +458,6 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
|
||||
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
|
||||
dict[str, Any]]:
|
||||
candidates = []
|
||||
seen = set()
|
||||
|
||||
def add_candidate_path(p: Path):
|
||||
if p not in seen:
|
||||
seen.add(p)
|
||||
candidates.append(p)
|
||||
|
||||
if backend.startswith("vlm-"):
|
||||
add_candidate_path(output_dir / file_stem / "vlm")
|
||||
if method:
|
||||
add_candidate_path(output_dir / file_stem / method)
|
||||
add_candidate_path(output_dir / file_stem / "auto")
|
||||
else:
|
||||
if method:
|
||||
add_candidate_path(output_dir / file_stem / method)
|
||||
add_candidate_path(output_dir / file_stem / "vlm")
|
||||
add_candidate_path(output_dir / file_stem / "auto")
|
||||
|
||||
json_file = None
|
||||
subdir = None
|
||||
attempted = []
|
||||
@ -588,33 +473,28 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
safe_stem = _sanitize_filename(file_stem)
|
||||
allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
|
||||
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
|
||||
self.logger.info(f"[MinerU] Searching output candidates: {', '.join(str(c) for c in candidates)}")
|
||||
self.logger.info(f"[MinerU] Searching output in: {output_dir}")
|
||||
|
||||
for sub in candidates:
|
||||
jf = sub / f"{file_stem}_content_list.json"
|
||||
jf = output_dir / f"{file_stem}_content_list.json"
|
||||
self.logger.info(f"[MinerU] Trying original path: {jf}")
|
||||
attempted.append(jf)
|
||||
if jf.exists():
|
||||
subdir = sub
|
||||
subdir = output_dir
|
||||
json_file = jf
|
||||
break
|
||||
|
||||
# MinerU API sanitizes non-ASCII filenames inside the ZIP root and file names.
|
||||
alt = sub / f"{safe_stem}_content_list.json"
|
||||
else:
|
||||
alt = output_dir / f"{safe_stem}_content_list.json"
|
||||
self.logger.info(f"[MinerU] Trying sanitized filename: {alt}")
|
||||
attempted.append(alt)
|
||||
if alt.exists():
|
||||
subdir = sub
|
||||
subdir = output_dir
|
||||
json_file = alt
|
||||
break
|
||||
|
||||
nested_alt = sub / safe_stem / f"{safe_stem}_content_list.json"
|
||||
else:
|
||||
nested_alt = output_dir / safe_stem / f"{safe_stem}_content_list.json"
|
||||
self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
|
||||
attempted.append(nested_alt)
|
||||
if nested_alt.exists():
|
||||
subdir = nested_alt.parent
|
||||
json_file = nested_alt
|
||||
break
|
||||
|
||||
if not json_file:
|
||||
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
|
||||
@ -680,12 +560,12 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
temp_pdf = None
|
||||
created_tmp_dir = False
|
||||
|
||||
# Assuming the dict is defined as shown
|
||||
lang = kwargs.get('lang', 'English')
|
||||
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found
|
||||
mineru_method_raw_str = kwargs.get('parser_config', {}).get('mineru_parse_method', 'auto')
|
||||
enable_formula = kwargs.get('parser_config', {}).get('mineru_formula_enable', True)
|
||||
enable_table = kwargs.get('parser_config', {}).get('mineru_enable', True)
|
||||
parser_cfg = kwargs.get('parser_config', {})
|
||||
lang = parser_cfg.get('mineru_lang') or kwargs.get('lang', 'English')
|
||||
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Defaults to Chinese if not matched
|
||||
mineru_method_raw_str = parser_cfg.get('mineru_parse_method', 'auto')
|
||||
enable_formula = parser_cfg.get('mineru_formula_enable', True)
|
||||
enable_table = parser_cfg.get('mineru_table_enable', True)
|
||||
|
||||
# remove spaces, or mineru crash, and _read_output fail too
|
||||
file_path = Path(filepath)
|
||||
@ -718,7 +598,7 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
|
||||
created_tmp_dir = True
|
||||
|
||||
self.logger.info(f"[MinerU] Output directory: {out_dir}")
|
||||
self.logger.info(f"[MinerU] Output directory: {out_dir} backend={backend} api={self.mineru_api} server_url={server_url or self.mineru_server_url}")
|
||||
if callback:
|
||||
callback(0.15, f"[MinerU] Output directory: {out_dir}")
|
||||
|
||||
@ -735,8 +615,8 @@ class MinerUParser(RAGFlowPdfParser):
|
||||
formula_enable=enable_formula,
|
||||
table_enable=enable_table,
|
||||
)
|
||||
self._run_mineru(pdf, out_dir, options, callback=callback)
|
||||
outputs = self._read_output(out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
|
||||
final_out_dir = self._run_mineru(pdf, out_dir, options, callback=callback)
|
||||
outputs = self._read_output(final_out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
|
||||
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||
if callback:
|
||||
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
|
||||
|
||||
@ -201,64 +201,10 @@ function ensure_docling() {
|
||||
|| python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --extra-index-url https://pypi.org/simple --no-cache-dir "docling${DOCLING_PIN}"
|
||||
}
|
||||
|
||||
# ensure_mineru: make sure a working MinerU executable exists when USE_MINERU=true.
#
# Reads:    USE_MINERU, HF_ENDPOINT, MINERU_EXECUTABLE, MINERU_BACKEND
# Exports:  HUGGINGFACE_HUB_ENDPOINT, MINERU_EXECUTABLE
# Returns:  0 when MinerU is disabled, already present, or installed and working;
#           1 when installing the vlm extras fails or the installed executable
#           does not respond to `--help`.
function ensure_mineru() {
  # ":-" keeps this safe when USE_MINERU is unset and the caller runs with `set -u`.
  [[ "${USE_MINERU:-}" == "true" ]] || { echo "[mineru] disabled by USE_MINERU"; return 0; }

  export HUGGINGFACE_HUB_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"

  local default_prefix="/ragflow/uv_tools"
  local venv_dir="${default_prefix}/.venv"
  local exe="${MINERU_EXECUTABLE:-${venv_dir}/bin/mineru}"
  local mineru_backend="${MINERU_BACKEND:-pipeline}"
  local mineru_pkg="mineru[core]"

  # vlm-* backends need the extra "vlm" dependency group.
  if [[ "${mineru_backend}" == vlm-* ]]; then
    mineru_pkg="mineru[core,vlm]"
  fi

  if [[ -x "${exe}" ]]; then
    echo "[mineru] found: ${exe} (MINERU_BACKEND=${mineru_backend})"
    export MINERU_EXECUTABLE="${exe}"

    if [[ "${mineru_backend}" == vlm-* ]]; then
      if ! "${venv_dir}/bin/python3" -c "import importlib.util, sys; sys.exit(0 if importlib.util.find_spec('vllm') else 1)" >/dev/null 2>&1; then
        echo "[mineru] vllm not found for MINERU_BACKEND=${mineru_backend}, installing ${mineru_pkg} ..."
        (
          # `set -e` has no effect inside a subshell that is the left operand
          # of `||`, so failures are propagated with an explicit && chain.
          source "${venv_dir}/bin/activate" \
            && uv pip install -U "${mineru_pkg}" -i https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.org/simple \
            && deactivate
        ) || return 1
      fi
    fi
    return 0
  fi

  echo "[mineru] not found, bootstrapping with uv ... (MINERU_BACKEND=${mineru_backend}, pkg=${mineru_pkg})"

  # This subshell is not part of an && / || list, so `set -e` applies inside
  # it; a failure here is caught by the `--help` validation below.
  (
    set -e
    mkdir -p "${default_prefix}"
    cd "${default_prefix}"
    [[ -d "${venv_dir}" ]] || { echo "[mineru] creating venv at ${venv_dir} ..."; uv venv "${venv_dir}"; }

    echo "[mineru] installing ${mineru_pkg} into ${venv_dir} ..."
    source "${venv_dir}/bin/activate"
    uv pip install -U "${mineru_pkg}" -i https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.org/simple
    deactivate
  )
  export MINERU_EXECUTABLE="${exe}"
  if ! "${MINERU_EXECUTABLE}" --help >/dev/null 2>&1; then
    echo "[mineru] installation failed: ${MINERU_EXECUTABLE} not working" >&2
    return 1
  fi
  echo "[mineru] installed: ${MINERU_EXECUTABLE}"
}
|
||||
# -----------------------------------------------------------------------------
|
||||
# Start components based on flags
|
||||
# -----------------------------------------------------------------------------
|
||||
ensure_docling
|
||||
ensure_mineru
|
||||
|
||||
if [[ "${ENABLE_WEBSERVER}" -eq 1 ]]; then
|
||||
echo "Starting nginx..."
|
||||
|
||||
88
docs/faq.mdx
88
docs/faq.mdx
@ -493,66 +493,37 @@ See [here](./guides/agent/best_practices/accelerate_agent_question_answering.md)
|
||||
|
||||
### How to use MinerU to parse PDF documents?
|
||||
|
||||
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps:
|
||||
|
||||
1. Prepare MinerU
|
||||
|
||||
```bash
|
||||
# docker/.env
|
||||
...
|
||||
USE_MINERU=true
|
||||
...
|
||||
```
|
||||
|
||||
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables).
|
||||
|
||||
|
||||
2. Start RAGFlow with MinerU enabled:
|
||||
|
||||
- **Source deployment** – in the RAGFlow repo, continue to start the backend service:
|
||||
|
||||
```bash
|
||||
...
|
||||
source .venv/bin/activate
|
||||
export PYTHONPATH=$(pwd)
|
||||
bash docker/launch_backend_service.sh
|
||||
```
|
||||
|
||||
- **Docker deployment** – after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
|
||||
|
||||
```bash
|
||||
# in RAGFlow repo
|
||||
docker compose -f docker/docker-compose.yml restart
|
||||
```
|
||||
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow works only as a remote client to MinerU (>= 2.6.3) and does not install or execute MinerU locally. To use this feature:
|
||||
|
||||
1. Prepare a reachable MinerU API service (for example, the FastAPI server provided by MinerU).
|
||||
2. Configure RAGFlow with remote MinerU settings (environment variables or UI model provider):
|
||||
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
|
||||
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
|
||||
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
|
||||
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
|
||||
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
|
||||
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown (which supports PDF parsing), and select **MinerU** in **PDF parser**.
|
||||
4. If you use a custom ingestion pipeline instead, you must also complete the first two steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
|
||||
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
|
||||
|
||||
---
|
||||
|
||||
### How to configure MinerU-specific settings?
|
||||
|
||||
The table below summarizes the most frequently used MinerU environment variables:
|
||||
The table below summarizes the most frequently used MinerU environment variables for remote MinerU:
|
||||
|
||||
| Environment variable | Description | Default | Example |
|
||||
| ---------------------- | ---------------------------------- | ----------------------------------- | ----------------------------------------------------------------------------------------------- |
|
||||
| `MINERU_EXECUTABLE` | Path to the local MinerU executable | `mineru` | `MINERU_EXECUTABLE=/home/ragflow/uv_tools/.venv/bin/mineru` |
|
||||
| `MINERU_DELETE_OUTPUT` | Whether to delete MinerU output directory | `1` (do **not** keep the output directory) | `MINERU_DELETE_OUTPUT=0` |
|
||||
| `MINERU_APISERVER` | URL of the MinerU API service | _unset_ | `MINERU_APISERVER=http://your-mineru-server:8886` |
|
||||
| `MINERU_BACKEND` | MinerU parsing backend | `pipeline` | `MINERU_BACKEND=pipeline\|vlm-transformers\|vlm-vllm-engine\|vlm-mlx-engine\|vlm-vllm-async-engine\|vlm-lmdeploy-engine\|vlm-http-client` |
|
||||
| `MINERU_SERVER_URL` | URL of remote vLLM server (for `vlm-http-client`) | _unset_ | `MINERU_SERVER_URL=http://your-vllm-server-ip:30000` |
|
||||
| `MINERU_OUTPUT_DIR` | Directory for MinerU output files | System-defined temporary directory | `MINERU_OUTPUT_DIR=/home/ragflow/mineru/output` |
|
||||
| `MINERU_BACKEND` | MinerU parsing backend | `pipeline` | `MINERU_BACKEND=pipeline\|vlm-transformers\|vlm-vllm-engine\|vlm-http-client` |
|
||||
| `MINERU_SERVER_URL` | URL of remote vLLM server (only for `vlm-http-client` backend) | _unset_ | `MINERU_SERVER_URL=http://your-vllm-server-ip:30000` |
|
||||
| `MINERU_APISERVER` | URL of remote MinerU service used as the parser (instead of local MinerU) | _unset_ | `MINERU_APISERVER=http://your-mineru-server:port` |
|
||||
| `MINERU_DELETE_OUTPUT` | Whether to delete MinerU output directory when a temp dir is used | `1` (delete temp output) | `MINERU_DELETE_OUTPUT=0` |
|
||||
|
||||
1. Set `MINERU_EXECUTABLE` to the path to the MinerU executable if the default `mineru` is not on `PATH`.
|
||||
2. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's output. (Default: `1`, which deletes temporary output.)
|
||||
3. Set `MINERU_OUTPUT_DIR` to specify the output directory for MinerU; otherwise, a system temp directory is used.
|
||||
4. Set `MINERU_BACKEND` to specify a parsing backend:
|
||||
- `"pipeline"` (default): The traditional multimodel pipeline.
|
||||
- `"vlm-transformers"`: A vision-language model using HuggingFace Transformers.
|
||||
- `"vlm-vllm-engine"`: A vision-language model using a local vLLM engine (requires a local GPU).
|
||||
- `"vlm-http-client"`: A vision-language model via HTTP client to a remote vLLM server (RAGFlow only requires CPU).
|
||||
5. If using the `"vlm-http-client"` backend, you must also set `MINERU_SERVER_URL` to your vLLM server's URL.
|
||||
6. If configuring RAGFlow to call a *remote* MinerU service, set `MINERU_APISERVER` to the MinerU server's URL.
|
||||
1. Set `MINERU_APISERVER` to point RAGFlow to your MinerU API server.
|
||||
2. Set `MINERU_BACKEND` to specify a parsing backend.
|
||||
3. If using the `"vlm-http-client"` backend, set `MINERU_SERVER_URL` to your vLLM server's URL. MinerU API expects `backend=vlm-http-client` and `server_url=http://<server>:30000` in the request body.
|
||||
4. Set `MINERU_OUTPUT_DIR` to specify where RAGFlow stores MinerU API output; otherwise, a system temp directory is used.
|
||||
5. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's temp output (useful for debugging).
|
||||
|
||||
:::tip NOTE
|
||||
For information about other environment variables natively supported by MinerU, see [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description).
|
||||
@ -562,21 +533,16 @@ For information about other environment variables natively supported by MinerU,
|
||||
|
||||
### How to use MinerU with a vLLM server for document parsing?
|
||||
|
||||
RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server. With this configuration, RAGFlow will connect to your remote vLLM server as a client and use its powerful GPU resources for document parsing. This significantly improves performance for parsing complex documents while reducing the resources required on your RAGFlow server. To configure MinerU with a vLLM server:
|
||||
RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server while calling MinerU via HTTP. To configure:
|
||||
|
||||
1. Set up a vLLM server running MinerU:
|
||||
|
||||
```bash
|
||||
mineru-vllm-server --port 30000
|
||||
```
|
||||
|
||||
2. Configure the following environment variables in your **docker/.env** file (or your shell if running from source):
|
||||
|
||||
- `MINERU_EXECUTABLE=/home/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable)
|
||||
1. Ensure a MinerU API service is reachable (for example `http://mineru-host:8886`).
|
||||
2. Set up or point to a vLLM HTTP server (for example `http://vllm-host:30000`).
|
||||
3. Configure the following in your **docker/.env** file (or your shell if running from source):
|
||||
- `MINERU_APISERVER=http://mineru-host:8886`
|
||||
- `MINERU_BACKEND="vlm-http-client"`
|
||||
- `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"`
|
||||
|
||||
3. Complete the rest of the standard MinerU setup steps as described [here](#how-to-configure-mineru-specific-settings).
|
||||
- `MINERU_SERVER_URL="http://vllm-host:30000"`
|
||||
MinerU API calls expect `backend=vlm-http-client` and `server_url=http://<server>:30000` in the request body.
|
||||
4. Configure `MINERU_OUTPUT_DIR` / `MINERU_DELETE_OUTPUT` as desired to manage the returned zip/JSON before ingestion.
|
||||
|
||||
:::tip NOTE
|
||||
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
|
||||
|
||||
@ -40,56 +40,21 @@ The output of a PDF parser is `json`. In the PDF parser, you select the parsing
|
||||
- A third-party visual model from a specific model provider.
|
||||
|
||||
:::danger IMPORTANT
|
||||
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps:
|
||||
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a **remote client** for MinerU, calling the MinerU API to parse documents, reading the returned output files, and ingesting the parsed content. To use this feature:
|
||||
:::
|
||||
|
||||
1. Prepare MinerU:
|
||||
1. Prepare a reachable MinerU API service (FastAPI server).
|
||||
2. Configure RAGFlow with the remote MinerU settings (env or UI model provider):
|
||||
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
|
||||
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
|
||||
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
|
||||
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
|
||||
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
|
||||
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method that supports PDF parsing from the **Built-in** dropdown, and then select **MinerU** in **PDF parser**.
|
||||
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
|
||||
|
||||
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`):
|
||||
|
||||
```bash
|
||||
mkdir -p "$HOME/uv_tools"
|
||||
cd "$HOME/uv_tools"
|
||||
uv venv .venv
|
||||
source .venv/bin/activate
|
||||
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
|
||||
# or
|
||||
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
|
||||
```
|
||||
|
||||
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
|
||||
|
||||
```bash
|
||||
# docker/.env
|
||||
...
|
||||
USE_MINERU=true
|
||||
...
|
||||
```
|
||||
|
||||
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
|
||||
|
||||
2. Start RAGFlow with MinerU enabled:
|
||||
|
||||
- **Source deployment** – in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
|
||||
|
||||
```bash
|
||||
# in RAGFlow repo
|
||||
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
|
||||
export MINERU_DELETE_OUTPUT=0 # keep output directory
|
||||
export MINERU_BACKEND=pipeline # or another backend you prefer
|
||||
|
||||
source .venv/bin/activate
|
||||
export PYTHONPATH=$(pwd)
|
||||
bash docker/launch_backend_service.sh
|
||||
```
|
||||
|
||||
- **Docker deployment** – after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
|
||||
|
||||
```bash
|
||||
# in RAGFlow repo
|
||||
docker compose -f docker/docker-compose.yml restart
|
||||
```
|
||||
|
||||
3. Restart the ragflow-server.
|
||||
:::note
|
||||
All MinerU environment variables are optional. If set, RAGFlow will auto-provision a MinerU OCR model for the tenant on first use with these values. To avoid auto-provisioning, configure MinerU solely through the UI and leave the env vars unset.
|
||||
:::
|
||||
|
||||
:::caution WARNING
|
||||
|
||||
@ -33,65 +33,28 @@ RAGFlow isn't one-size-fits-all. It is built for flexibility and supports deeper
|
||||
|
||||
2. Select the option that works best with your scenario:
|
||||
|
||||
- DeepDoc: (Default) The default visual model performing OCR, TSR, and DLR tasks on PDFs, but can be time-consuming.
|
||||
- Naive: Skip OCR, TSR, and DLR tasks if *all* your PDFs are plain text.
|
||||
- [MinerU](https://github.com/opendatalab/MinerU): (Experimental) An open-source tool that converts PDF into machine-readable formats.
|
||||
- [Docling](https://github.com/docling-project/docling): (Experimental) An open-source document processing tool for gen AI.
|
||||
- A third-party visual model from a specific model provider.
|
||||
- DeepDoc: (Default) The default visual model, which performs OCR, TSR, and DLR tasks on PDFs. This can be time-consuming.
|
||||
- Naive: Skip OCR, TSR, and DLR tasks if _all_ your PDFs are plain text.
|
||||
- [MinerU](https://github.com/opendatalab/MinerU): (Experimental) An open-source tool that converts PDF into machine-readable formats.
|
||||
- [Docling](https://github.com/docling-project/docling): (Experimental) An open-source document processing tool for gen AI.
|
||||
- A third-party visual model from a specific model provider.
|
||||
|
||||
:::danger IMPORTANT
|
||||
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps:
|
||||
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a **remote client** for MinerU, calling the MinerU API to parse documents, reading the returned output files, and ingesting the parsed content. To use this feature:
|
||||
|
||||
1. Prepare MinerU:
|
||||
1. Prepare a reachable MinerU API service (FastAPI server).
|
||||
2. Configure RAGFlow with the remote MinerU settings (env or UI model provider):
|
||||
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
|
||||
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
|
||||
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
|
||||
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
|
||||
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
|
||||
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method that supports PDF parsing from the **Built-in** dropdown, and then select **MinerU** in **PDF parser**.
|
||||
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
|
||||
:::
|
||||
|
||||
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`):
|
||||
|
||||
```bash
|
||||
mkdir -p "$HOME/uv_tools"
|
||||
cd "$HOME/uv_tools"
|
||||
uv venv .venv
|
||||
source .venv/bin/activate
|
||||
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
|
||||
# or
|
||||
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
|
||||
```
|
||||
|
||||
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
|
||||
|
||||
```bash
|
||||
# docker/.env
|
||||
...
|
||||
USE_MINERU=true
|
||||
...
|
||||
```
|
||||
|
||||
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
|
||||
|
||||
2. Start RAGFlow with MinerU enabled:
|
||||
|
||||
- **Source deployment** – in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
|
||||
|
||||
```bash
|
||||
# in RAGFlow repo
|
||||
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
|
||||
export MINERU_DELETE_OUTPUT=0 # keep output directory
|
||||
export MINERU_BACKEND=pipeline # or another backend you prefer
|
||||
|
||||
source .venv/bin/activate
|
||||
export PYTHONPATH=$(pwd)
|
||||
bash docker/launch_backend_service.sh
|
||||
```
|
||||
|
||||
- **Docker deployment** – after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
|
||||
|
||||
```bash
|
||||
# in RAGFlow repo
|
||||
docker compose -f docker/docker-compose.yml restart
|
||||
```
|
||||
|
||||
3. Restart the ragflow-server.
|
||||
4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method that supports PDF parsing from the **Built-in** dropdown, and then select **MinerU** in **PDF parser**.
|
||||
5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
|
||||
:::note
|
||||
All MinerU environment variables are optional. When they are set, RAGFlow will auto-create a MinerU OCR model for a tenant on first use using these values. If you do not want this auto-provisioning, configure MinerU only through the UI and leave the env vars unset.
|
||||
:::
|
||||
|
||||
:::caution WARNING
|
||||
@ -107,4 +70,3 @@ Use a visual model to extract data if your PDFs contain formatted or image-based
|
||||
### Can I select a visual model to parse my DOCX files?
|
||||
|
||||
No, you cannot. This dropdown menu is for PDFs only. To use this feature, convert your DOCX files to PDF first.
|
||||
|
||||
|
||||
@ -280,6 +280,7 @@ class Parser(ProcessBase):
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
parse_method=conf.get("mineru_parse_method", "raw"),
|
||||
lang=conf.get("lang", "Chinese"),
|
||||
)
|
||||
bboxes = []
|
||||
for t, poss in lines:
|
||||
|
||||
@ -398,7 +398,7 @@ class JinaMultiVecEmbed(Base):
|
||||
|
||||
ress.append(chunk_emb)
|
||||
|
||||
token_count +=total_token_count_from_response(res)
|
||||
token_count += total_token_count_from_response(res)
|
||||
except Exception as _e:
|
||||
log_exception(_e, response)
|
||||
raise Exception(f"Error: {response}")
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Optional, Tuple
|
||||
from typing import Any, Optional
|
||||
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
|
||||
@ -25,7 +25,7 @@ class Base:
|
||||
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||
self.model_name = model_name
|
||||
|
||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
|
||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> tuple[Any, Any]:
|
||||
raise NotImplementedError("Please implement parse_pdf!")
|
||||
|
||||
|
||||
@ -56,21 +56,22 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||
self.mineru_backend = _resolve_config("mineru_backend", "MINERU_BACKEND", "pipeline")
|
||||
self.mineru_server_url = _resolve_config("mineru_server_url", "MINERU_SERVER_URL", "")
|
||||
self.mineru_delete_output = bool(int(_resolve_config("mineru_delete_output", "MINERU_DELETE_OUTPUT", 1)))
|
||||
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
|
||||
logging.info(f"Parsed MinerU config: {config}")
|
||||
logging.info(
|
||||
f"Parsed MinerU config: backend={self.mineru_backend} api={self.mineru_api} server_url={self.mineru_server_url} output_dir={self.mineru_output_dir} delete_output={self.mineru_delete_output}"
|
||||
)
|
||||
|
||||
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
|
||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
|
||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||
backend = backend or self.mineru_backend
|
||||
server_url = server_url or self.mineru_server_url
|
||||
return self.check_installation(backend=backend, server_url=server_url)
|
||||
|
||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs):
|
||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
|
||||
ok, reason = self.check_available()
|
||||
if not ok:
|
||||
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
||||
raise RuntimeError(f"MinerU server not accessible: {reason}")
|
||||
|
||||
sections, tables = MinerUParser.parse_pdf(
|
||||
self,
|
||||
|
||||
@ -69,7 +69,6 @@ from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
|
||||
from common.exceptions import TaskCanceledException
|
||||
from common import settings
|
||||
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
|
||||
from common.misc_utils import check_and_install_mineru
|
||||
|
||||
BATCH_SIZE = 64
|
||||
|
||||
@ -1169,7 +1168,6 @@ async def main():
|
||||
show_configs()
|
||||
settings.init_settings()
|
||||
settings.check_and_install_torch()
|
||||
check_and_install_mineru()
|
||||
logging.info(f'default embedding config: {settings.EMBEDDING_CFG}')
|
||||
settings.print_rag_settings()
|
||||
if sys.platform != "win32":
|
||||
|
||||
@ -118,6 +118,10 @@ export function ChunkMethodDialog({
|
||||
auto_questions: z.coerce.number().optional(),
|
||||
html4excel: z.boolean().optional(),
|
||||
toc_extraction: z.boolean().optional(),
|
||||
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
|
||||
mineru_formula_enable: z.boolean().optional(),
|
||||
mineru_table_enable: z.boolean().optional(),
|
||||
mineru_lang: z.string().optional(),
|
||||
// raptor: z
|
||||
// .object({
|
||||
// use_raptor: z.boolean().optional(),
|
||||
@ -166,6 +170,9 @@ export function ChunkMethodDialog({
|
||||
name: 'parser_id',
|
||||
control: form.control,
|
||||
});
|
||||
const isMineruSelected =
|
||||
selectedTag?.toLowerCase().includes('mineru') ||
|
||||
layoutRecognize?.toLowerCase?.()?.includes('mineru');
|
||||
|
||||
const isPdf = documentExtension === 'pdf';
|
||||
|
||||
@ -328,7 +335,7 @@ export function ChunkMethodDialog({
|
||||
className="space-y-3"
|
||||
>
|
||||
{showOne && (
|
||||
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
||||
<LayoutRecognizeFormField showMineruOptions={false} />
|
||||
)}
|
||||
{showMaxTokenNumber && (
|
||||
<>
|
||||
@ -345,9 +352,16 @@ export function ChunkMethodDialog({
|
||||
)}
|
||||
</FormContainer>
|
||||
<FormContainer
|
||||
show={showAutoKeywords(selectedTag) || showExcelToHtml}
|
||||
show={
|
||||
isMineruSelected ||
|
||||
showAutoKeywords(selectedTag) ||
|
||||
showExcelToHtml
|
||||
}
|
||||
className="space-y-3"
|
||||
>
|
||||
{isMineruSelected && (
|
||||
<LayoutRecognizeFormField showMineruOptions />
|
||||
)}
|
||||
{selectedTag === DocumentParserType.Naive && (
|
||||
<EnableTocToggle />
|
||||
)}
|
||||
|
||||
@ -18,6 +18,10 @@ export function useDefaultParserValues() {
|
||||
auto_questions: 0,
|
||||
html4excel: false,
|
||||
toc_extraction: false,
|
||||
mineru_parse_method: 'auto',
|
||||
mineru_formula_enable: true,
|
||||
mineru_table_enable: true,
|
||||
mineru_lang: 'English',
|
||||
// raptor: {
|
||||
// use_raptor: false,
|
||||
// prompt: t('knowledgeConfiguration.promptText'),
|
||||
|
||||
@ -5,6 +5,7 @@ import { cn } from '@/lib/utils';
|
||||
import { camelCase } from 'lodash';
|
||||
import { ReactNode, useMemo } from 'react';
|
||||
import { useFormContext } from 'react-hook-form';
|
||||
import { MinerUOptionsFormField } from './mineru-options-form-field';
|
||||
import { SelectWithSearch } from './originui/select-with-search';
|
||||
import {
|
||||
FormControl,
|
||||
@ -26,11 +27,13 @@ export function LayoutRecognizeFormField({
|
||||
horizontal = true,
|
||||
optionsWithoutLLM,
|
||||
label,
|
||||
showMineruOptions = true,
|
||||
}: {
|
||||
name?: string;
|
||||
horizontal?: boolean;
|
||||
optionsWithoutLLM?: { value: string; label: string }[];
|
||||
label?: ReactNode;
|
||||
showMineruOptions?: boolean;
|
||||
}) {
|
||||
const form = useFormContext();
|
||||
|
||||
@ -79,6 +82,7 @@ export function LayoutRecognizeFormField({
|
||||
name={name}
|
||||
render={({ field }) => {
|
||||
return (
|
||||
<>
|
||||
<FormItem className={'items-center space-y-0 '}>
|
||||
<div
|
||||
className={cn('flex', {
|
||||
@ -108,6 +112,8 @@ export function LayoutRecognizeFormField({
|
||||
<FormMessage />
|
||||
</div>
|
||||
</FormItem>
|
||||
{showMineruOptions && <MinerUOptionsFormField />}
|
||||
</>
|
||||
);
|
||||
}}
|
||||
/>
|
||||
|
||||
@ -7,10 +7,38 @@ import { useFormContext, useWatch } from 'react-hook-form';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
|
||||
const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']);
|
||||
const languageOptions = buildOptions([
|
||||
'English',
|
||||
'Chinese',
|
||||
'Traditional Chinese',
|
||||
'Russian',
|
||||
'Ukrainian',
|
||||
'Indonesian',
|
||||
'Spanish',
|
||||
'Vietnamese',
|
||||
'Japanese',
|
||||
'Korean',
|
||||
'Portuguese BR',
|
||||
'German',
|
||||
'French',
|
||||
'Italian',
|
||||
'Tamil',
|
||||
'Telugu',
|
||||
'Kannada',
|
||||
'Thai',
|
||||
'Greek',
|
||||
'Hindi',
|
||||
]);
|
||||
|
||||
export function MinerUOptionsFormField() {
|
||||
export function MinerUOptionsFormField({
|
||||
namePrefix = 'parser_config',
|
||||
}: {
|
||||
namePrefix?: string;
|
||||
}) {
|
||||
const form = useFormContext();
|
||||
const { t } = useTranslation();
|
||||
const buildName = (field: string) =>
|
||||
namePrefix ? `${namePrefix}.${field}` : field;
|
||||
|
||||
const layoutRecognize = useWatch({
|
||||
control: form.control,
|
||||
@ -33,7 +61,7 @@ export function MinerUOptionsFormField() {
|
||||
</div>
|
||||
|
||||
<RAGFlowFormItem
|
||||
name="parser_config.mineru_parse_method"
|
||||
name={buildName('mineru_parse_method')}
|
||||
label={t('knowledgeConfiguration.mineruParseMethod', 'Parse Method')}
|
||||
tooltip={t(
|
||||
'knowledgeConfiguration.mineruParseMethodTip',
|
||||
@ -52,7 +80,26 @@ export function MinerUOptionsFormField() {
|
||||
</RAGFlowFormItem>
|
||||
|
||||
<RAGFlowFormItem
|
||||
name="parser_config.mineru_formula_enable"
|
||||
name={buildName('mineru_lang')}
|
||||
label={t('knowledgeConfiguration.mineruLanguage', 'Language')}
|
||||
tooltip={t(
|
||||
'knowledgeConfiguration.mineruLanguageTip',
|
||||
'Preferred OCR language for MinerU.',
|
||||
)}
|
||||
horizontal={true}
|
||||
>
|
||||
{(field) => (
|
||||
<RAGFlowSelect
|
||||
value={field.value || 'English'}
|
||||
onChange={field.onChange}
|
||||
options={languageOptions}
|
||||
placeholder={t('common.selectPlaceholder', 'Select value')}
|
||||
/>
|
||||
)}
|
||||
</RAGFlowFormItem>
|
||||
|
||||
<RAGFlowFormItem
|
||||
name={buildName('mineru_formula_enable')}
|
||||
label={t(
|
||||
'knowledgeConfiguration.mineruFormulaEnable',
|
||||
'Formula Recognition',
|
||||
@ -73,7 +120,7 @@ export function MinerUOptionsFormField() {
|
||||
</RAGFlowFormItem>
|
||||
|
||||
<RAGFlowFormItem
|
||||
name="parser_config.mineru_table_enable"
|
||||
name={buildName('mineru_table_enable')}
|
||||
label={t(
|
||||
'knowledgeConfiguration.mineruTableEnable',
|
||||
'Table Recognition',
|
||||
|
||||
@ -34,8 +34,13 @@ export interface IDocumentInfo {
|
||||
export interface IParserConfig {
|
||||
delimiter?: string;
|
||||
html4excel?: boolean;
|
||||
layout_recognize?: boolean;
|
||||
pages: any[];
|
||||
layout_recognize?: string;
|
||||
pages?: any[];
|
||||
chunk_token_num?: number;
|
||||
auto_keywords?: number;
|
||||
auto_questions?: number;
|
||||
toc_extraction?: boolean;
|
||||
task_page_size?: number;
|
||||
raptor?: Raptor;
|
||||
graphrag?: GraphRag;
|
||||
}
|
||||
|
||||
@ -1,8 +1,13 @@
|
||||
export interface IChangeParserConfigRequestBody {
|
||||
pages: number[][];
|
||||
chunk_token_num: number;
|
||||
layout_recognize: boolean;
|
||||
task_page_size: number;
|
||||
pages?: number[][];
|
||||
chunk_token_num?: number;
|
||||
layout_recognize?: string;
|
||||
task_page_size?: number;
|
||||
delimiter?: string;
|
||||
auto_keywords?: number;
|
||||
auto_questions?: number;
|
||||
html4excel?: boolean;
|
||||
toc_extraction?: boolean;
|
||||
}
|
||||
|
||||
export interface IChangeParserRequestBody {
|
||||
|
||||
@ -7,7 +7,6 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
|
||||
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
|
||||
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
|
||||
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
|
||||
import { MinerUOptionsFormField } from '@/components/mineru-options-form-field';
|
||||
import {
|
||||
ConfigurationFormContainer,
|
||||
MainContainer,
|
||||
@ -19,7 +18,6 @@ export function NaiveConfiguration() {
|
||||
<MainContainer>
|
||||
<ConfigurationFormContainer>
|
||||
<LayoutRecognizeFormField></LayoutRecognizeFormField>
|
||||
<MinerUOptionsFormField></MinerUOptionsFormField>
|
||||
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
|
||||
<DelimiterFormField></DelimiterFormField>
|
||||
<ChildrenDelimiterForm />
|
||||
|
||||
@ -37,6 +37,7 @@ export const formSchema = z
|
||||
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
|
||||
mineru_formula_enable: z.boolean().optional(),
|
||||
mineru_table_enable: z.boolean().optional(),
|
||||
mineru_lang: z.string().optional(),
|
||||
raptor: z
|
||||
.object({
|
||||
use_raptor: z.boolean().optional(),
|
||||
|
||||
@ -75,6 +75,7 @@ export default function DatasetSettings() {
|
||||
mineru_parse_method: 'auto',
|
||||
mineru_formula_enable: true,
|
||||
mineru_table_enable: true,
|
||||
mineru_lang: 'English',
|
||||
raptor: {
|
||||
use_raptor: true,
|
||||
max_token: 256,
|
||||
|
||||
Reference in New Issue
Block a user