def parse_mineru_paths() -> Dict[str, Path]:
    """
    Locate the MinerU executable together with its virtualenv and tools directory.

    The MINERU_EXECUTABLE environment variable is expected to follow the layout
    ``<tools_dir>/<venv_dir>/bin/mineru``, for example:
        MINERU_EXECUTABLE = /home/user/uv_tools/.venv/bin/mineru

    When the variable is set, the path is user-expanded and resolved, and the
    other two directories are derived by walking up from the executable.
    When it is unset or empty, the default convention is used instead:
        $HOME/uv_tools/.venv/bin/mineru

    Returns:
        A dict with keys:
            - "mineru_exec": Path to the mineru executable
            - "venv_dir":    Path to the virtual environment directory
            - "tools_dir":   Path to the directory that contains the venv
    """
    configured = os.getenv("MINERU_EXECUTABLE")

    if not configured:
        # Nothing configured: assume the conventional layout under the home directory.
        tools_root = Path(os.path.expanduser("~")) / "uv_tools"
        venv_root = tools_root / ".venv"
        executable = venv_root / "bin" / "mineru"
    else:
        # <tools>/<venv>/bin/mineru -> two levels up is the venv, three is the tools dir.
        executable = Path(configured).expanduser().resolve()
        venv_root = executable.parent.parent
        tools_root = venv_root.parent

    return dict(mineru_exec=executable, venv_dir=venv_root, tools_dir=tools_root)
@once
def install_mineru() -> None:
    """
    Ensure MinerU is installed and usable, installing it on demand.

    Behavior:
        1. MinerU is enabled only when USE_MINERU is true/yes/1/y
           (case-insensitive). Any other value, or no value, skips installation.
        2. Resolve mineru_exec / venv_dir / tools_dir via parse_mineru_paths().
        3. If the executable exists, is executable and `--help` succeeds,
           export MINERU_EXECUTABLE and return.
        4. Otherwise:
            - Create tools_dir
            - Create the venv with `uv venv` if it is missing
            - Install mineru[core], falling back to mineru[all]
            - Validate with `--help`
        5. Export MINERU_EXECUTABLE and log installation success.

    NOTE:
        This function intentionally does NOT return the path.
        Logging is used to indicate status; the resolved executable is
        published through os.environ["MINERU_EXECUTABLE"].

    Raises:
        RuntimeError: mineru still fails `--help` after installation.
        subprocess.CalledProcessError: `uv venv` or the mineru[all] fallback
            install exits with a non-zero status.
        OSError: tools_dir cannot be created or `uv` cannot be launched.
    """
    # Opt-in switch: only explicit truthy values enable the (slow, networked) install.
    enabled_values = ("1", "true", "yes", "y")
    use_mineru = os.getenv("USE_MINERU", "").strip().lower()
    if use_mineru not in enabled_values:
        logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru)
        return

    # Resolve expected paths
    paths = parse_mineru_paths()
    mineru_exec: Path = paths["mineru_exec"]
    venv_dir: Path = paths["venv_dir"]
    tools_dir: Path = paths["tools_dir"]

    # Child processes run as if the venv were activated: uv targets VIRTUAL_ENV,
    # and the venv's bin directory takes precedence on PATH.
    env = os.environ.copy()
    env["VIRTUAL_ENV"] = str(venv_dir)
    env["PATH"] = str(venv_dir / "bin") + os.pathsep + env.get("PATH", "")

    # Default the HuggingFace endpoint for child processes to HF_ENDPOINT, or to
    # a mirror when HF_ENDPOINT is unset/empty. An existing value is kept.
    # NOTE(review): confirm MinerU actually reads HUGGINGFACE_HUB_ENDPOINT.
    env.setdefault("HUGGINGFACE_HUB_ENDPOINT", os.getenv("HF_ENDPOINT") or "https://hf-mirror.com")

    def mineru_works() -> bool:
        """Return True when `mineru --help` can be launched and exits with status 0."""
        try:
            # Discard output instead of piping it: nothing reads the pipe, so a
            # chatty child could block forever on a full pipe buffer.
            subprocess.check_call(
                [str(mineru_exec), "--help"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
                env=env,
            )
        except (OSError, ValueError, subprocess.SubprocessError) as exc:
            logging.debug("MinerU check failed for %s: %s", mineru_exec, exc)
            return False
        return True

    # If MinerU is already installed and functional
    if mineru_exec.is_file() and os.access(mineru_exec, os.X_OK) and mineru_works():
        logging.info("MinerU already installed.")
        os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
        return

    logging.info("MinerU not found. Installing into virtualenv: %s", venv_dir)

    # Ensure parent directory exists
    tools_dir.mkdir(parents=True, exist_ok=True)

    # Create venv if missing
    if not venv_dir.exists():
        subprocess.check_call(
            ["uv", "venv", str(venv_dir)],
            cwd=str(tools_dir),
            env=env,
        )
    else:
        logging.info("Virtual environment exists at %s. Reusing it.", venv_dir)

    def pip_install(pkg: str) -> None:
        """Install or upgrade `pkg` into the venv with `uv pip install`."""
        subprocess.check_call(
            [
                "uv", "pip", "install", "-U", pkg,
                "-i", "https://mirrors.aliyun.com/pypi/simple",
                "--extra-index-url", "https://pypi.org/simple",
            ],
            cwd=str(tools_dir),
            env=env,
        )

    # Install core version first; fallback to all
    try:
        logging.info("Installing mineru[core] ...")
        pip_install("mineru[core]")
    except subprocess.CalledProcessError:
        logging.warning("mineru[core] installation failed. Installing mineru[all] ...")
        pip_install("mineru[all]")

    # Validate installation
    if not mineru_works():
        logging.error("MinerU installation failed: %s does not work.", mineru_exec)
        raise RuntimeError(f"MinerU installation failed: {mineru_exec} is not functional")

    os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
    logging.info("MinerU installation completed successfully. Executable: %s", mineru_exec)
``` - Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation. + Enabling `USE_MINERU=true` sets up MinerU automatically (including setting the MinerU executable path and related environment variables); no manual installation is required. + 2. Start RAGFlow with MinerU enabled: - - **Source deployment** – in the RAGFlow repo, export the key MinerU-related variables and start the backend service: + - **Source deployment** – in the RAGFlow repo, start the backend service as usual: ```bash - # in RAGFlow repo - export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru" - export MINERU_DELETE_OUTPUT=0 # keep output directory - export MINERU_BACKEND=pipeline # or another backend you prefer - + ... source .venv/bin/activate export PYTHONPATH=$(pwd) bash docker/launch_backend_service.sh diff --git a/rag/svr/task_executor.py b/rag/svr/task_executor.py index 714b886eb..a91421007 100644 --- a/rag/svr/task_executor.py +++ b/rag/svr/task_executor.py @@ -68,6 +68,7 @@ from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc from common.exceptions import TaskCanceledException from common import settings from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME +from common.misc_utils import install_mineru BATCH_SIZE = 64 @@ -1100,6 +1101,7 @@ async def main(): show_configs() settings.init_settings() settings.check_and_install_torch() + install_mineru() logging.info(f'settings.EMBEDDING_CFG: {settings.EMBEDDING_CFG}') settings.print_rag_settings() if sys.platform != "win32":