Feat: add mineru auto installer (#11649)

### What problem does this PR solve?

Feat: add mineru auto installer

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Billy Bao
2025-12-02 17:29:26 +08:00
committed by GitHub
parent ba6e2af5fd
commit c946858328
3 changed files with 157 additions and 21 deletions

View File

@ -23,6 +23,8 @@ import subprocess
import sys
import os
import logging
from pathlib import Path
from typing import Dict
def get_uuid():
return uuid.uuid1().hex
@ -106,3 +108,152 @@ def pip_install_torch():
logging.info("Installing pytorch")
pkg_names = ["torch>=2.5.0,<3.0.0"]
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names])
def parse_mineru_paths() -> Dict[str, Path]:
"""
Parse MinerU-related paths based on the MINERU_EXECUTABLE environment variable.
Expected layout (default convention):
MINERU_EXECUTABLE = /home/user/uv_tools/.venv/bin/mineru
From this path we derive:
- mineru_exec : full path to the mineru executable
- venv_dir : the virtual environment directory (.venv)
- tools_dir : the parent tools directory (e.g. uv_tools)
If MINERU_EXECUTABLE is not set, we fall back to the default layout:
$HOME/uv_tools/.venv/bin/mineru
Returns:
A dict with keys:
- "mineru_exec": Path
- "venv_dir": Path
- "tools_dir": Path
"""
mineru_exec_env = os.getenv("MINERU_EXECUTABLE")
if mineru_exec_env:
# Use the path from the environment variable
mineru_exec = Path(mineru_exec_env).expanduser().resolve()
venv_dir = mineru_exec.parent.parent
tools_dir = venv_dir.parent
else:
# Fall back to default convention: $HOME/uv_tools/.venv/bin/mineru
home = Path(os.path.expanduser("~"))
tools_dir = home / "uv_tools"
venv_dir = tools_dir / ".venv"
mineru_exec = venv_dir / "bin" / "mineru"
return {
"mineru_exec": mineru_exec,
"venv_dir": venv_dir,
"tools_dir": tools_dir,
}
@once
def install_mineru() -> None:
"""
Ensure MinerU is installed.
Behavior:
1. MinerU is enabled only when USE_MINERU is true/yes/1/y.
2. Resolve mineru_exec / venv_dir / tools_dir.
3. If mineru exists and works, log success and exit.
4. Otherwise:
- Create tools_dir
- Create venv if missing
- Install mineru[core], fallback to mineru[all]
- Validate with `--help`
5. Log installation success.
NOTE:
This function intentionally does NOT return the path.
Logging is used to indicate status.
"""
# Check if MinerU is enabled
use_mineru = os.getenv("USE_MINERU", "").strip().lower()
if use_mineru == "false":
logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru)
return
# Resolve expected paths
paths = parse_mineru_paths()
mineru_exec: Path = paths["mineru_exec"]
venv_dir: Path = paths["venv_dir"]
tools_dir: Path = paths["tools_dir"]
# Construct environment variables for installation/execution
env = os.environ.copy()
env["VIRTUAL_ENV"] = str(venv_dir)
env["PATH"] = str(venv_dir / "bin") + os.pathsep + env.get("PATH", "")
# Configure HuggingFace endpoint
env.setdefault("HUGGINGFACE_HUB_ENDPOINT", os.getenv("HF_ENDPOINT") or "https://hf-mirror.com")
# Helper: check whether mineru works
def mineru_works() -> bool:
try:
subprocess.check_call(
[str(mineru_exec), "--help"],
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
env=env,
)
return True
except Exception:
return False
# If MinerU is already installed and functional
if mineru_exec.is_file() and os.access(mineru_exec, os.X_OK) and mineru_works():
logging.info("MinerU already installed.")
os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
return
logging.info("MinerU not found. Installing into virtualenv: %s", venv_dir)
# Ensure parent directory exists
tools_dir.mkdir(parents=True, exist_ok=True)
# Create venv if missing
if not venv_dir.exists():
subprocess.check_call(
["uv", "venv", str(venv_dir)],
cwd=str(tools_dir),
env=env,
# stdout=subprocess.DEVNULL,
# stderr=subprocess.PIPE,
)
else:
logging.info("Virtual environment exists at %s. Reusing it.", venv_dir)
# Helper for pip install
def pip_install(pkg: str) -> None:
subprocess.check_call(
[
"uv", "pip", "install", "-U", pkg,
"-i", "https://mirrors.aliyun.com/pypi/simple",
"--extra-index-url", "https://pypi.org/simple",
],
cwd=str(tools_dir),
# stdout=subprocess.DEVNULL,
# stderr=subprocess.PIPE,
env=env,
)
# Install core version first; fallback to all
try:
logging.info("Installing mineru[core] ...")
pip_install("mineru[core]")
except subprocess.CalledProcessError:
logging.warning("mineru[core] installation failed. Installing mineru[all] ...")
pip_install("mineru[all]")
# Validate installation
if not mineru_works():
logging.error("MinerU installation failed: %s does not work.", mineru_exec)
raise RuntimeError(f"MinerU installation failed: {mineru_exec} is not functional")
os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
logging.info("MinerU installation completed successfully. Executable: %s", mineru_exec)

View File

@ -497,20 +497,6 @@ MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports
1. Prepare MinerU
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`):
```bash
mkdir -p "$HOME/uv_tools"
cd "$HOME/uv_tools"
uv venv .venv
source .venv/bin/activate
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
# or
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
```
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
```bash
# docker/.env
...
@ -518,18 +504,15 @@ MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables).
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
- **Source deployment** in the RAGFlow repo, continue to start the backend service:
```bash
# in RAGFlow repo
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
export MINERU_DELETE_OUTPUT=0 # keep output directory
export MINERU_BACKEND=pipeline # or another backend you prefer
...
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh

View File

@ -68,6 +68,7 @@ from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
from common.exceptions import TaskCanceledException
from common import settings
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
from common.misc_utils import install_mineru
BATCH_SIZE = 64
@ -1100,6 +1101,7 @@ async def main():
show_configs()
settings.init_settings()
settings.check_and_install_torch()
install_mineru()
logging.info(f'settings.EMBEDDING_CFG: {settings.EMBEDDING_CFG}')
settings.print_rag_settings()
if sys.platform != "win32":