Refa: only support MinerU-API now (#11977)

### What problem does this PR solve?

Only support MinerU-API now, still need to complete frontend for
pipeline to allow the configuration of MinerU options.

### Type of change

- [x] Refactoring
This commit is contained in:
Yongteng Lei
2025-12-17 12:58:48 +08:00
committed by GitHub
parent 5e05f43c3d
commit 03f9be7cbb
19 changed files with 273 additions and 624 deletions

View File

@ -23,8 +23,6 @@ import subprocess
import sys
import os
import logging
from pathlib import Path
from typing import Dict
def get_uuid():
return uuid.uuid1().hex
@ -108,152 +106,3 @@ def pip_install_torch():
logging.info("Installing pytorch")
pkg_names = ["torch>=2.5.0,<3.0.0"]
subprocess.check_call([sys.executable, "-m", "pip", "install", *pkg_names])
def parse_mineru_paths() -> Dict[str, Path]:
"""
Parse MinerU-related paths based on the MINERU_EXECUTABLE environment variable.
Expected layout (default convention):
MINERU_EXECUTABLE = /home/user/uv_tools/.venv/bin/mineru
From this path we derive:
- mineru_exec : full path to the mineru executable
- venv_dir : the virtual environment directory (.venv)
- tools_dir : the parent tools directory (e.g. uv_tools)
If MINERU_EXECUTABLE is not set, we fall back to the default layout:
$HOME/uv_tools/.venv/bin/mineru
Returns:
A dict with keys:
- "mineru_exec": Path
- "venv_dir": Path
- "tools_dir": Path
"""
mineru_exec_env = os.getenv("MINERU_EXECUTABLE")
if mineru_exec_env:
# Use the path from the environment variable
mineru_exec = Path(mineru_exec_env).expanduser().resolve()
venv_dir = mineru_exec.parent.parent
tools_dir = venv_dir.parent
else:
# Fall back to default convention: $HOME/uv_tools/.venv/bin/mineru
home = Path(os.path.expanduser("~"))
tools_dir = home / "uv_tools"
venv_dir = tools_dir / ".venv"
mineru_exec = venv_dir / "bin" / "mineru"
return {
"mineru_exec": mineru_exec,
"venv_dir": venv_dir,
"tools_dir": tools_dir,
}
@once
def check_and_install_mineru() -> None:
"""
Ensure MinerU is installed.
Behavior:
1. MinerU is enabled only when USE_MINERU is true/yes/1/y.
2. Resolve mineru_exec / venv_dir / tools_dir.
3. If mineru exists and works, log success and exit.
4. Otherwise:
- Create tools_dir
- Create venv if missing
- Install mineru[core], fallback to mineru[all]
- Validate with `--help`
5. Log installation success.
NOTE:
This function intentionally does NOT return the path.
Logging is used to indicate status.
"""
# Check if MinerU is enabled
use_mineru = os.getenv("USE_MINERU", "false").strip().lower()
if use_mineru != "true":
logging.info("USE_MINERU=%r. Skipping MinerU installation.", use_mineru)
return
# Resolve expected paths
paths = parse_mineru_paths()
mineru_exec: Path = paths["mineru_exec"]
venv_dir: Path = paths["venv_dir"]
tools_dir: Path = paths["tools_dir"]
# Construct environment variables for installation/execution
env = os.environ.copy()
env["VIRTUAL_ENV"] = str(venv_dir)
env["PATH"] = str(venv_dir / "bin") + os.pathsep + env.get("PATH", "")
# Configure HuggingFace endpoint
env.setdefault("HUGGINGFACE_HUB_ENDPOINT", os.getenv("HF_ENDPOINT") or "https://hf-mirror.com")
# Helper: check whether mineru works
def mineru_works() -> bool:
try:
subprocess.check_call(
[str(mineru_exec), "--help"],
stdout=subprocess.DEVNULL,
stderr=subprocess.PIPE,
env=env,
)
return True
except Exception:
return False
# If MinerU is already installed and functional
if mineru_exec.is_file() and os.access(mineru_exec, os.X_OK) and mineru_works():
logging.info("MinerU already installed.")
os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
return
logging.info("MinerU not found. Installing into virtualenv: %s", venv_dir)
# Ensure parent directory exists
tools_dir.mkdir(parents=True, exist_ok=True)
# Create venv if missing
if not venv_dir.exists():
subprocess.check_call(
["uv", "venv", str(venv_dir)],
cwd=str(tools_dir),
env=env,
# stdout=subprocess.DEVNULL,
# stderr=subprocess.PIPE,
)
else:
logging.info("Virtual environment exists at %s. Reusing it.", venv_dir)
# Helper for pip install
def pip_install(pkg: str) -> None:
subprocess.check_call(
[
"uv", "pip", "install", "-U", pkg,
"-i", "https://mirrors.aliyun.com/pypi/simple",
"--extra-index-url", "https://pypi.org/simple",
],
cwd=str(tools_dir),
# stdout=subprocess.DEVNULL,
# stderr=subprocess.PIPE,
env=env,
)
# Install core version first; fallback to all
try:
logging.info("Installing mineru[core] ...")
pip_install("mineru[core]")
except subprocess.CalledProcessError:
logging.warning("mineru[core] installation failed. Installing mineru[all] ...")
pip_install("mineru[all]")
# Validate installation
if not mineru_works():
logging.error("MinerU installation failed: %s does not work.", mineru_exec)
raise RuntimeError(f"MinerU installation failed: {mineru_exec} is not functional")
os.environ["MINERU_EXECUTABLE"] = str(mineru_exec)
logging.info("MinerU installation completed successfully. Executable: %s", mineru_exec)

View File

@ -16,19 +16,15 @@
import json
import logging
import os
import platform
import re
import subprocess
import sys
import tempfile
import threading
import time
import zipfile
from dataclasses import dataclass
from io import BytesIO
from os import PathLike
from pathlib import Path
from queue import Empty, Queue
from typing import Any, Callable, Optional
import numpy as np
@ -137,10 +133,8 @@ class MinerUParseOptions:
class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path)
self.mineru_api = mineru_api.rstrip("/")
self.mineru_server_url = mineru_server_url.rstrip("/")
self.using_api = False
self.outlines = []
self.logger = logging.getLogger(self.__class__.__name__)
@ -189,105 +183,59 @@ class MinerUParser(RAGFlowPdfParser):
def check_installation(self, backend: str = "pipeline", server_url: Optional[str] = None) -> tuple[bool, str]:
reason = ""
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine"]
valid_backends = ["pipeline", "vlm-http-client", "vlm-transformers", "vlm-vllm-engine", "vlm-mlx-engine", "vlm-vllm-async-engine", "vlm-lmdeploy-engine"]
if backend not in valid_backends:
reason = "[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
reason = f"[MinerU] Invalid backend '{backend}'. Valid backends are: {valid_backends}"
self.logger.warning(reason)
return False, reason
subprocess_kwargs = {
"capture_output": True,
"text": True,
"check": True,
"encoding": "utf-8",
"errors": "ignore",
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
if server_url is None:
server_url = self.mineru_server_url
if backend == "vlm-http-client" and server_url:
try:
server_accessible = self._is_http_endpoint_valid(server_url + "/openapi.json")
self.logger.info(f"[MinerU] vlm-http-client server check: {server_accessible}")
if server_accessible:
self.using_api = False # We are using http client, not API
return True, reason
else:
reason = f"[MinerU] vlm-http-client server not accessible: {server_url}"
self.logger.warning(f"[MinerU] vlm-http-client server not accessible: {server_url}")
return False, reason
except Exception as e:
self.logger.warning(f"[MinerU] vlm-http-client server check failed: {e}")
try:
response = requests.get(server_url, timeout=5)
self.logger.info(
f"[MinerU] vlm-http-client server connection check: success with status {response.status_code}")
self.using_api = False
return True, reason
except Exception as e:
reason = f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}"
self.logger.warning(f"[MinerU] vlm-http-client server connection check failed: {server_url}: {e}")
return False, reason
if not self.mineru_api:
reason = "[MinerU] MINERU_APISERVER not configured."
self.logger.warning(reason)
return False, reason
api_openapi = f"{self.mineru_api}/openapi.json"
try:
result = subprocess.run([str(self.mineru_path), "--version"], **subprocess_kwargs)
version_info = result.stdout.strip()
if version_info:
self.logger.info(f"[MinerU] Detected version: {version_info}")
else:
self.logger.info("[MinerU] Detected MinerU, but version info is empty.")
return True, reason
except subprocess.CalledProcessError as e:
self.logger.warning(f"[MinerU] Execution failed (exit code {e.returncode}).")
except FileNotFoundError:
self.logger.warning("[MinerU] MinerU not found. Please install it via: pip install -U 'mineru[core]'")
except Exception as e:
self.logger.error(f"[MinerU] Unexpected error during installation check: {e}")
# If executable check fails, try API check
try:
if self.mineru_api:
# check openapi.json
openapi_exists = self._is_http_endpoint_valid(self.mineru_api + "/openapi.json")
if not openapi_exists:
reason = "[MinerU] Failed to detect vaild MinerU API server"
return openapi_exists, reason
self.logger.info(f"[MinerU] Detected {self.mineru_api}/openapi.json: {openapi_exists}")
self.using_api = openapi_exists
return openapi_exists, reason
else:
reason = "[MinerU] api not exists. Setting MINERU_SERVER_URL if your backend is vlm-http-client."
self.logger.info(reason)
api_ok = self._is_http_endpoint_valid(api_openapi)
self.logger.info(f"[MinerU] API openapi.json reachable={api_ok} url={api_openapi}")
if not api_ok:
reason = f"[MinerU] MinerU API not accessible: {api_openapi}"
return False, reason
except Exception as e:
reason = f"[MinerU] Unexpected error during api check: {e}"
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
return False, reason
except Exception as exc:
reason = f"[MinerU] MinerU API check failed: {exc}"
self.logger.warning(reason)
return False, reason
if backend == "vlm-http-client":
resolved_server = server_url or self.mineru_server_url
if not resolved_server:
reason = "[MinerU] MINERU_SERVER_URL required for vlm-http-client backend."
self.logger.warning(reason)
return False, reason
try:
server_ok = self._is_http_endpoint_valid(resolved_server)
self.logger.info(f"[MinerU] vlm-http-client server check reachable={server_ok} url={resolved_server}")
except Exception as exc:
self.logger.warning(f"[MinerU] vlm-http-client server probe failed: {resolved_server}: {exc}")
return True, reason
def _run_mineru(
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
):
if self.using_api:
self._run_mineru_api(input_path, output_dir, options, callback)
else:
self._run_mineru_executable(input_path, output_dir, options, callback)
def _run_mineru_api(self, input_path: Path, output_dir: Path, options: MinerUParseOptions,
callback: Optional[Callable] = None):
output_zip_path = os.path.join(str(output_dir), "output.zip")
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
return self._run_mineru_api(input_path, output_dir, options, callback)
def _run_mineru_api(
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
) -> Path:
pdf_file_path = str(input_path)
if not os.path.exists(pdf_file_path):
raise RuntimeError(f"[MinerU] PDF file not exists: {pdf_file_path}")
pdf_file_name = Path(pdf_file_path).stem.strip()
output_path = os.path.join(str(output_dir), pdf_file_name, options.method)
os.makedirs(output_path, exist_ok=True)
output_path = tempfile.mkdtemp(prefix=f"{pdf_file_name}_{options.method}_", dir=str(output_dir))
output_zip_path = os.path.join(str(output_dir), f"{Path(output_path).name}.zip")
files = {"files": (pdf_file_name + ".pdf", open(pdf_file_path, "rb"), "application/pdf")}
@ -309,9 +257,19 @@ class MinerUParser(RAGFlowPdfParser):
"end_page_id": 99999,
}
if options.server_url:
data["server_url"] = options.server_url
elif self.mineru_server_url:
data["server_url"] = self.mineru_server_url
print("--------------------------------", flush=True)
print(f"{data=}", flush=True)
print(f"{options=}", flush=True)
print("--------------------------------", flush=True)
headers = {"Accept": "application/json"}
try:
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse")
self.logger.info(f"[MinerU] invoke api: {self.mineru_api}/file_parse backend={options.backend} server_url={data.get('server_url')}")
if callback:
callback(0.20, f"[MinerU] invoke api: {self.mineru_api}/file_parse")
response = requests.post(url=f"{self.mineru_api}/file_parse", files=files, data=data, headers=headers,
@ -333,65 +291,11 @@ class MinerUParser(RAGFlowPdfParser):
if callback:
callback(0.40, f"[MinerU] Unzip to {output_path}...")
else:
self.logger.warning("[MinerU] not zip returned from api%s " % response.headers.get("Content-Type"))
self.logger.warning(f"[MinerU] not zip returned from api: {response.headers.get('Content-Type')}")
except Exception as e:
raise RuntimeError(f"[MinerU] api failed with exception {e}")
self.logger.info("[MinerU] Api completed successfully.")
def _run_mineru_executable(
self, input_path: Path, output_dir: Path, options: MinerUParseOptions, callback: Optional[Callable] = None
):
cmd = [str(self.mineru_path), "-p", str(input_path), "-o", str(output_dir), "-m", options.method]
if options.backend:
cmd.extend(["-b", options.backend])
if options.lang:
cmd.extend(["-l", options.lang])
if options.server_url and options.backend == "vlm-http-client":
cmd.extend(["-u", options.server_url])
self.logger.info(f"[MinerU] Running command: {' '.join(cmd)}")
subprocess_kwargs = {
"stdout": subprocess.PIPE,
"stderr": subprocess.PIPE,
"text": True,
"encoding": "utf-8",
"errors": "ignore",
"bufsize": 1,
}
if platform.system() == "Windows":
subprocess_kwargs["creationflags"] = getattr(subprocess, "CREATE_NO_WINDOW", 0)
process = subprocess.Popen(cmd, **subprocess_kwargs)
stdout_queue, stderr_queue = Queue(), Queue()
def enqueue_output(pipe, queue, prefix):
for line in iter(pipe.readline, ""):
if line.strip():
queue.put((prefix, line.strip()))
pipe.close()
threading.Thread(target=enqueue_output, args=(process.stdout, stdout_queue, "STDOUT"), daemon=True).start()
threading.Thread(target=enqueue_output, args=(process.stderr, stderr_queue, "STDERR"), daemon=True).start()
while process.poll() is None:
for q in (stdout_queue, stderr_queue):
try:
while True:
prefix, line = q.get_nowait()
if prefix == "STDOUT":
self.logger.info(f"[MinerU] {line}")
else:
self.logger.warning(f"[MinerU] {line}")
except Empty:
pass
time.sleep(0.1)
return_code = process.wait()
if return_code != 0:
raise RuntimeError(f"[MinerU] Process failed with exit code {return_code}")
self.logger.info("[MinerU] Command completed successfully.")
return Path(output_path)
def __images__(self, fnm, zoomin: int = 1, page_from=0, page_to=600, callback=None):
self.page_from = page_from
@ -554,25 +458,6 @@ class MinerUParser(RAGFlowPdfParser):
def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[
dict[str, Any]]:
candidates = []
seen = set()
def add_candidate_path(p: Path):
if p not in seen:
seen.add(p)
candidates.append(p)
if backend.startswith("vlm-"):
add_candidate_path(output_dir / file_stem / "vlm")
if method:
add_candidate_path(output_dir / file_stem / method)
add_candidate_path(output_dir / file_stem / "auto")
else:
if method:
add_candidate_path(output_dir / file_stem / method)
add_candidate_path(output_dir / file_stem / "vlm")
add_candidate_path(output_dir / file_stem / "auto")
json_file = None
subdir = None
attempted = []
@ -588,33 +473,28 @@ class MinerUParser(RAGFlowPdfParser):
safe_stem = _sanitize_filename(file_stem)
allowed_names = {f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json"}
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
self.logger.info(f"[MinerU] Searching output candidates: {', '.join(str(c) for c in candidates)}")
self.logger.info(f"[MinerU] Searching output in: {output_dir}")
for sub in candidates:
jf = sub / f"{file_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying original path: {jf}")
attempted.append(jf)
if jf.exists():
subdir = sub
json_file = jf
break
# MinerU API sanitizes non-ASCII filenames inside the ZIP root and file names.
alt = sub / f"{safe_stem}_content_list.json"
jf = output_dir / f"{file_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying original path: {jf}")
attempted.append(jf)
if jf.exists():
subdir = output_dir
json_file = jf
else:
alt = output_dir / f"{safe_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying sanitized filename: {alt}")
attempted.append(alt)
if alt.exists():
subdir = sub
subdir = output_dir
json_file = alt
break
nested_alt = sub / safe_stem / f"{safe_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
attempted.append(nested_alt)
if nested_alt.exists():
subdir = nested_alt.parent
json_file = nested_alt
break
else:
nested_alt = output_dir / safe_stem / f"{safe_stem}_content_list.json"
self.logger.info(f"[MinerU] Trying sanitized nested path: {nested_alt}")
attempted.append(nested_alt)
if nested_alt.exists():
subdir = nested_alt.parent
json_file = nested_alt
if not json_file:
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")
@ -680,12 +560,12 @@ class MinerUParser(RAGFlowPdfParser):
temp_pdf = None
created_tmp_dir = False
# Assuming the dict is defined as shown
lang = kwargs.get('lang', 'English')
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Returns 'ch' if lang not found
mineru_method_raw_str = kwargs.get('parser_config', {}).get('mineru_parse_method', 'auto')
enable_formula = kwargs.get('parser_config', {}).get('mineru_formula_enable', True)
enable_table = kwargs.get('parser_config', {}).get('mineru_enable', True)
parser_cfg = kwargs.get('parser_config', {})
lang = parser_cfg.get('mineru_lang') or kwargs.get('lang', 'English')
mineru_lang_code = LANGUAGE_TO_MINERU_MAP.get(lang, 'ch') # Defaults to Chinese if not matched
mineru_method_raw_str = parser_cfg.get('mineru_parse_method', 'auto')
enable_formula = parser_cfg.get('mineru_formula_enable', True)
enable_table = parser_cfg.get('mineru_table_enable', True)
# remove spaces, or mineru crash, and _read_output fail too
file_path = Path(filepath)
@ -718,7 +598,7 @@ class MinerUParser(RAGFlowPdfParser):
out_dir = Path(tempfile.mkdtemp(prefix="mineru_pdf_"))
created_tmp_dir = True
self.logger.info(f"[MinerU] Output directory: {out_dir}")
self.logger.info(f"[MinerU] Output directory: {out_dir} backend={backend} api={self.mineru_api} server_url={server_url or self.mineru_server_url}")
if callback:
callback(0.15, f"[MinerU] Output directory: {out_dir}")
@ -735,8 +615,8 @@ class MinerUParser(RAGFlowPdfParser):
formula_enable=enable_formula,
table_enable=enable_table,
)
self._run_mineru(pdf, out_dir, options, callback=callback)
outputs = self._read_output(out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
final_out_dir = self._run_mineru(pdf, out_dir, options, callback=callback)
outputs = self._read_output(final_out_dir, pdf.stem, method=mineru_method_raw_str, backend=backend)
self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")

View File

@ -201,64 +201,10 @@ function ensure_docling() {
|| python3 -m pip install -i https://pypi.tuna.tsinghua.edu.cn/simple --extra-index-url https://pypi.org/simple --no-cache-dir "docling${DOCLING_PIN}"
}
function ensure_mineru() {
[[ "${USE_MINERU}" == "true" ]] || { echo "[mineru] disabled by USE_MINERU"; return 0; }
export HUGGINGFACE_HUB_ENDPOINT="${HF_ENDPOINT:-https://hf-mirror.com}"
local default_prefix="/ragflow/uv_tools"
local venv_dir="${default_prefix}/.venv"
local exe="${MINERU_EXECUTABLE:-${venv_dir}/bin/mineru}"
local mineru_backend="${MINERU_BACKEND:-pipeline}"
local mineru_pkg="mineru[core]"
if [[ "${mineru_backend}" == vlm-* ]]; then
mineru_pkg="mineru[core,vlm]"
fi
if [[ -x "${exe}" ]]; then
echo "[mineru] found: ${exe} (MINERU_BACKEND=${mineru_backend})"
export MINERU_EXECUTABLE="${exe}"
if [[ "${mineru_backend}" == vlm-* ]]; then
if ! "${venv_dir}/bin/python3" -c "import importlib.util, sys; sys.exit(0 if importlib.util.find_spec('vllm') else 1)" >/dev/null 2>&1; then
echo "[mineru] vllm not found for MINERU_BACKEND=${mineru_backend}, installing ${mineru_pkg} ..."
(
set -e
source "${venv_dir}/bin/activate"
uv pip install -U "${mineru_pkg}" -i https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.org/simple
deactivate
) || return 1
fi
fi
return 0
fi
echo "[mineru] not found, bootstrapping with uv ... (MINERU_BACKEND=${mineru_backend}, pkg=${mineru_pkg})"
(
set -e
mkdir -p "${default_prefix}"
cd "${default_prefix}"
[[ -d "${venv_dir}" ]] || { echo "[mineru] creating venv at ${venv_dir} ..."; uv venv "${venv_dir}"; }
echo "[mineru] installing ${mineru_pkg} into ${venv_dir} ..."
source "${venv_dir}/bin/activate"
uv pip install -U "${mineru_pkg}" -i https://mirrors.aliyun.com/pypi/simple --extra-index-url https://pypi.org/simple
deactivate
)
export MINERU_EXECUTABLE="${exe}"
if ! "${MINERU_EXECUTABLE}" --help >/dev/null 2>&1; then
echo "[mineru] installation failed: ${MINERU_EXECUTABLE} not working" >&2
return 1
fi
echo "[mineru] installed: ${MINERU_EXECUTABLE}"
}
# -----------------------------------------------------------------------------
# Start components based on flags
# -----------------------------------------------------------------------------
ensure_docling
ensure_mineru
if [[ "${ENABLE_WEBSERVER}" -eq 1 ]]; then
echo "Starting nginx..."

View File

@ -493,66 +493,37 @@ See [here](./guides/agent/best_practices/accelerate_agent_question_answering.md)
### How to use MinerU to parse PDF documents?
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps:
1. Prepare MinerU
```bash
# docker/.env
...
USE_MINERU=true
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables).
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, continue to start the backend service:
```bash
...
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh
```
- **Docker deployment** after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
```bash
# in RAGFlow repo
docker compose -f docker/docker-compose.yml restart
```
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow works only as a remote client to MinerU (>= 2.6.3) and does not install or execute MinerU locally. To use this feature:
1. Prepare a reachable MinerU API service (for example, the FastAPI server provided by MinerU).
2. Configure RAGFlow with remote MinerU settings (environment variables or UI model provider):
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown (which supports PDF parsing), and select **MinerU** in **PDF parser**.
4. If you use a custom ingestion pipeline instead, you must also complete the first two steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
---
### How to configure MinerU-specific settings?
The table below summarizes the most frequently used MinerU environment variables:
The table below summarizes the most frequently used MinerU environment variables for remote MinerU:
| Environment variable | Description | Default | Example |
| ---------------------- | ---------------------------------- | ----------------------------------- | ----------------------------------------------------------------------------------------------- |
| `MINERU_EXECUTABLE` | Path to the local MinerU executable | `mineru` | `MINERU_EXECUTABLE=/home/ragflow/uv_tools/.venv/bin/mineru` |
| `MINERU_DELETE_OUTPUT` | Whether to delete MinerU output directory | `1` (do **not** keep the output directory) | `MINERU_DELETE_OUTPUT=0` |
| `MINERU_APISERVER` | URL of the MinerU API service | _unset_ | `MINERU_APISERVER=http://your-mineru-server:8886` |
| `MINERU_BACKEND` | MinerU parsing backend | `pipeline` | `MINERU_BACKEND=pipeline\|vlm-transformers\|vlm-vllm-engine\|vlm-mlx-engine\|vlm-vllm-async-engine\|vlm-http-client` |
| `MINERU_SERVER_URL` | URL of remote vLLM server (for `vlm-http-client`) | _unset_ | `MINERU_SERVER_URL=http://your-vllm-server-ip:30000` |
| `MINERU_OUTPUT_DIR` | Directory for MinerU output files | System-defined temporary directory | `MINERU_OUTPUT_DIR=/home/ragflow/mineru/output` |
| `MINERU_BACKEND` | MinerU parsing backend | `pipeline` | `MINERU_BACKEND=pipeline\|vlm-transformers\|vlm-vllm-engine\|vlm-http-client` |
| `MINERU_SERVER_URL` | URL of remote vLLM server (only for `vlm-http-client` backend) | _unset_ | `MINERU_SERVER_URL=http://your-vllm-server-ip:30000` |
| `MINERU_APISERVER` | URL of remote MinerU service used as the parser (instead of local MinerU) | _unset_ | `MINERU_APISERVER=http://your-mineru-server:port` |
| `MINERU_DELETE_OUTPUT` | Whether to delete MinerU output directory when a temp dir is used | `1` (delete temp output) | `MINERU_DELETE_OUTPUT=0` |
1. Set `MINERU_EXECUTABLE` to the path to the MinerU executable if the default `mineru` is not on `PATH`.
2. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's output. (Default: `1`, which deletes temporary output.)
3. Set `MINERU_OUTPUT_DIR` to specify the output directory for MinerU; otherwise, a system temp directory is used.
4. Set `MINERU_BACKEND` to specify a parsing backend:
- `"pipeline"` (default): The traditional multimodel pipeline.
- `"vlm-transformers"`: A vision-language model using HuggingFace Transformers.
- `"vlm-vllm-engine"`: A vision-language model using a local vLLM engine (requires a local GPU).
- `"vlm-http-client"`: A vision-language model via HTTP client to a remote vLLM server (RAGFlow only requires CPU).
5. If using the `"vlm-http-client"` backend, you must also set `MINERU_SERVER_URL` to your vLLM server's URL.
6. If configuring RAGFlow to call a *remote* MinerU service, set `MINERU_APISERVER` to the MinerU server's URL.
1. Set `MINERU_APISERVER` to point RAGFlow to your MinerU API server.
2. Set `MINERU_BACKEND` to specify a parsing backend.
3. If using the `"vlm-http-client"` backend, set `MINERU_SERVER_URL` to your vLLM server's URL. MinerU API expects `backend=vlm-http-client` and `server_url=http://<server>:30000` in the request body.
4. Set `MINERU_OUTPUT_DIR` to specify where RAGFlow stores MinerU API output; otherwise, a system temp directory is used.
5. Set `MINERU_DELETE_OUTPUT` to `0` to keep MinerU's temp output (useful for debugging).
:::tip NOTE
For information about other environment variables natively supported by MinerU, see [here](https://opendatalab.github.io/MinerU/usage/cli_tools/#environment-variables-description).
@ -562,21 +533,16 @@ For information about other environment variables natively supported by MinerU,
### How to use MinerU with a vLLM server for document parsing?
RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server. With this configuration, RAGFlow will connect to your remote vLLM server as a client and use its powerful GPU resources for document parsing. This significantly improves performance for parsing complex documents while reducing the resources required on your RAGFlow server. To configure MinerU with a vLLM server:
RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate document parsing tasks to a remote vLLM server while calling MinerU via HTTP. To configure:
1. Set up a vLLM server running MinerU:
```bash
mineru-vllm-server --port 30000
```
2. Configure the following environment variables in your **docker/.env** file (or your shell if running from source):
- `MINERU_EXECUTABLE=/home/ragflow/uv_tools/.venv/bin/mineru` (or the path to your MinerU executable)
1. Ensure a MinerU API service is reachable (for example `http://mineru-host:8886`).
2. Set up or point to a vLLM HTTP server (for example `http://vllm-host:30000`).
3. Configure the following in your **docker/.env** file (or your shell if running from source):
- `MINERU_APISERVER=http://mineru-host:8886`
- `MINERU_BACKEND="vlm-http-client"`
- `MINERU_SERVER_URL="http://your-vllm-server-ip:30000"`
3. Complete the rest of the standard MinerU setup steps as described [here](#how-to-configure-mineru-specific-settings).
- `MINERU_SERVER_URL="http://vllm-host:30000"`
MinerU API calls expect `backend=vlm-http-client` and `server_url=http://<server>:30000` in the request body.
4. Configure `MINERU_OUTPUT_DIR` / `MINERU_DELETE_OUTPUT` as desired to manage the returned zip/JSON before ingestion.
:::tip NOTE
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.

View File

@ -40,56 +40,21 @@ The output of a PDF parser is `json`. In the PDF parser, you select the parsing
- A third-party visual model from a specific model provider.
:::danger IMPORTANT
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps:
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a **remote client** for MinerU, calling the MinerU API to parse documents, reading the returned output files, and ingesting the parsed content. To use this feature:
:::
1. Prepare MinerU:
1. Prepare a reachable MinerU API service (FastAPI server).
2. Configure RAGFlow with the remote MinerU settings (env or UI model provider):
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`, `vlm-lmdeploy-engine`).
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**.
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`):
```bash
mkdir -p "$HOME/uv_tools"
cd "$HOME/uv_tools"
uv venv .venv
source .venv/bin/activate
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
# or
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
```
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
```bash
# docker/.env
...
USE_MINERU=true
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
```bash
# in RAGFlow repo
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
export MINERU_DELETE_OUTPUT=0 # keep output directory
export MINERU_BACKEND=pipeline # or another backend you prefer
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh
```
- **Docker deployment** after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
```bash
# in RAGFlow repo
docker compose -f docker/docker-compose.yml restart
```
3. Restart the ragflow-server.
:::note
All MinerU environment variables are optional. If set, RAGFlow will auto-provision a MinerU OCR model for the tenant on first use with these values. To avoid auto-provisioning, configure MinerU solely through the UI and leave the env vars unset.
:::
:::caution WARNING

View File

@ -16,13 +16,13 @@ RAGFlow isn't one-size-fits-all. It is built for flexibility and supports deeper
## Prerequisites
- The PDF parser dropdown menu appears only when you select a chunking method compatible with PDFs, including:
- **General**
- **Manual**
- **Paper**
- **Book**
- **Laws**
- **Presentation**
- **One**
- **General**
- **Manual**
- **Paper**
- **Book**
- **Laws**
- **Presentation**
- **One**
- To use a third-party visual model for parsing PDFs, ensure you have set a default VLM under **Set default models** on the **Model providers** page.
## Quickstart
@ -33,65 +33,28 @@ RAGFlow isn't one-size-fits-all. It is built for flexibility and supports deeper
2. Select the option that works best with your scenario:
- DeepDoc: (Default) The default visual model performing OCR, TSR, and DLR tasks on PDFs, but can be time-consuming.
- Naive: Skip OCR, TSR, and DLR tasks if *all* your PDFs are plain text.
- [MinerU](https://github.com/opendatalab/MinerU): (Experimental) An open-source tool that converts PDF into machine-readable formats.
- [Docling](https://github.com/docling-project/docling): (Experimental) An open-source document processing tool for gen AI.
- A third-party visual model from a specific model provider.
- DeepDoc: (Default) The default visual model performing OCR, TSR, and DLR tasks on PDFs, but can be time-consuming.
- Naive: Skip OCR, TSR, and DLR tasks if _all_ your PDFs are plain text.
- [MinerU](https://github.com/opendatalab/MinerU): (Experimental) An open-source tool that converts PDF into machine-readable formats.
- [Docling](https://github.com/docling-project/docling): (Experimental) An open-source document processing tool for gen AI.
- A third-party visual model from a specific model provider.
:::danger IMPORTANT
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a client for MinerU, calling it to parse documents, reading the output files, and ingesting the parsed content. To use this feature, follow these steps:
MinerU PDF document parsing is available starting from v0.22.0. RAGFlow supports MinerU (>= 2.6.3) as an optional PDF parser with multiple backends. RAGFlow acts only as a **remote client** for MinerU, calling the MinerU API to parse documents, reading the returned output files, and ingesting the parsed content. To use this feature:
1. Prepare MinerU:
1. Prepare a reachable MinerU API service (FastAPI server).
2. Configure RAGFlow with the remote MinerU settings (env or UI model provider):
- `MINERU_APISERVER`: MinerU API endpoint, for example `http://mineru-host:8886`.
- `MINERU_BACKEND`: MinerU backend, defaults to `pipeline` (supports `vlm-http-client`, `vlm-transformers`, `vlm-vllm-engine`, `vlm-mlx-engine`, `vlm-vllm-async-engine`).
- `MINERU_SERVER_URL`: (optional) For `vlm-http-client`, the downstream vLLM HTTP server, for example `http://vllm-host:30000`.
- `MINERU_OUTPUT_DIR`: (optional) Local directory to store MinerU API outputs (zip/JSON) before ingestion.
- `MINERU_DELETE_OUTPUT`: Whether to delete temporary output when a temp dir is used (`1` deletes temp outputs; set `0` to keep).
3. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**.
4. If you use a custom ingestion pipeline instead, provide the same MinerU settings and select **MinerU** in the **Parsing method** section of the **Parser** component.
:::
- **If you deploy RAGFlow from source**, install MinerU into an isolated virtual environment (recommended path: `$HOME/uv_tools`):
```bash
mkdir -p "$HOME/uv_tools"
cd "$HOME/uv_tools"
uv venv .venv
source .venv/bin/activate
uv pip install -U "mineru[core]" -i https://mirrors.aliyun.com/pypi/simple
# or
# uv pip install -U "mineru[all]" -i https://mirrors.aliyun.com/pypi/simple
```
- **If you deploy RAGFlow with Docker**, you usually only need to turn on MinerU support in `docker/.env`:
```bash
# docker/.env
...
USE_MINERU=true
...
```
Enabling `USE_MINERU=true` will internally perform the same setup as the manual configuration (including setting the MinerU executable path and related environment variables). You only need the manual installation above if you are running from source or want full control over the MinerU installation.
2. Start RAGFlow with MinerU enabled:
- **Source deployment** in the RAGFlow repo, export the key MinerU-related variables and start the backend service:
```bash
# in RAGFlow repo
export MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
export MINERU_DELETE_OUTPUT=0 # keep output directory
export MINERU_BACKEND=pipeline # or another backend you prefer
source .venv/bin/activate
export PYTHONPATH=$(pwd)
bash docker/launch_backend_service.sh
```
- **Docker deployment** after setting `USE_MINERU=true`, restart the containers so that the new settings take effect:
```bash
# in RAGFlow repo
docker compose -f docker/docker-compose.yml restart
```
3. Restart the ragflow-server.
4. In the web UI, navigate to the **Configuration** page of your dataset. Click **Built-in** in the **Ingestion pipeline** section, select a chunking method from the **Built-in** dropdown, which supports PDF parsing, and select **MinerU** in **PDF parser**.
5. If you use a custom ingestion pipeline instead, you must also complete the first three steps before selecting **MinerU** in the **Parsing method** section of the **Parser** component.
:::note
All MinerU environment variables are optional. When they are set, RAGFlow will auto-create a MinerU OCR model for a tenant on first use using these values. If you do not want this auto-provisioning, configure MinerU only through the UI and leave the env vars unset.
:::
:::caution WARNING
@ -107,4 +70,3 @@ Use a visual model to extract data if your PDFs contain formatted or image-based
### Can I select a visual model to parse my DOCX files?
No, you cannot. This dropdown menu is for PDFs only. To use this feature, convert your DOCX files to PDF first.

View File

@ -280,6 +280,7 @@ class Parser(ProcessBase):
binary=blob,
callback=self.callback,
parse_method=conf.get("mineru_parse_method", "raw"),
lang=conf.get("lang", "Chinese"),
)
bboxes = []
for t, poss in lines:

View File

@ -398,7 +398,7 @@ class JinaMultiVecEmbed(Base):
ress.append(chunk_emb)
token_count +=total_token_count_from_response(res)
token_count += total_token_count_from_response(res)
except Exception as _e:
log_exception(_e, response)
raise Exception(f"Error: {response}")

View File

@ -16,7 +16,7 @@
import json
import logging
import os
from typing import Any, Optional, Tuple
from typing import Any, Optional
from deepdoc.parser.mineru_parser import MinerUParser
@ -25,7 +25,7 @@ class Base:
def __init__(self, key: str | dict, model_name: str, **kwargs):
self.model_name = model_name
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> tuple[Any, Any]:
raise NotImplementedError("Please implement parse_pdf!")
@ -56,21 +56,22 @@ class MinerUOcrModel(Base, MinerUParser):
self.mineru_backend = _resolve_config("mineru_backend", "MINERU_BACKEND", "pipeline")
self.mineru_server_url = _resolve_config("mineru_server_url", "MINERU_SERVER_URL", "")
self.mineru_delete_output = bool(int(_resolve_config("mineru_delete_output", "MINERU_DELETE_OUTPUT", 1)))
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
logging.info(f"Parsed MinerU config: {config}")
logging.info(
f"Parsed MinerU config: backend={self.mineru_backend} api={self.mineru_api} server_url={self.mineru_server_url} output_dir={self.mineru_output_dir} delete_output={self.mineru_delete_output}"
)
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
backend = backend or self.mineru_backend
server_url = server_url or self.mineru_server_url
return self.check_installation(backend=backend, server_url=server_url)
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs):
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
ok, reason = self.check_available()
if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
raise RuntimeError(f"MinerU server not accessible: {reason}")
sections, tables = MinerUParser.parse_pdf(
self,

View File

@ -69,7 +69,6 @@ from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
from common.exceptions import TaskCanceledException
from common import settings
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
from common.misc_utils import check_and_install_mineru
BATCH_SIZE = 64
@ -1169,7 +1168,6 @@ async def main():
show_configs()
settings.init_settings()
settings.check_and_install_torch()
check_and_install_mineru()
logging.info(f'default embedding config: {settings.EMBEDDING_CFG}')
settings.print_rag_settings()
if sys.platform != "win32":

View File

@ -118,6 +118,10 @@ export function ChunkMethodDialog({
auto_questions: z.coerce.number().optional(),
html4excel: z.boolean().optional(),
toc_extraction: z.boolean().optional(),
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
mineru_lang: z.string().optional(),
// raptor: z
// .object({
// use_raptor: z.boolean().optional(),
@ -166,6 +170,9 @@ export function ChunkMethodDialog({
name: 'parser_id',
control: form.control,
});
const isMineruSelected =
selectedTag?.toLowerCase().includes('mineru') ||
layoutRecognize?.toLowerCase?.()?.includes('mineru');
const isPdf = documentExtension === 'pdf';
@ -328,7 +335,7 @@ export function ChunkMethodDialog({
className="space-y-3"
>
{showOne && (
<LayoutRecognizeFormField></LayoutRecognizeFormField>
<LayoutRecognizeFormField showMineruOptions={false} />
)}
{showMaxTokenNumber && (
<>
@ -345,9 +352,16 @@ export function ChunkMethodDialog({
)}
</FormContainer>
<FormContainer
show={showAutoKeywords(selectedTag) || showExcelToHtml}
show={
isMineruSelected ||
showAutoKeywords(selectedTag) ||
showExcelToHtml
}
className="space-y-3"
>
{isMineruSelected && (
<LayoutRecognizeFormField showMineruOptions />
)}
{selectedTag === DocumentParserType.Naive && (
<EnableTocToggle />
)}

View File

@ -18,6 +18,10 @@ export function useDefaultParserValues() {
auto_questions: 0,
html4excel: false,
toc_extraction: false,
mineru_parse_method: 'auto',
mineru_formula_enable: true,
mineru_table_enable: true,
mineru_lang: 'English',
// raptor: {
// use_raptor: false,
// prompt: t('knowledgeConfiguration.promptText'),

View File

@ -5,6 +5,7 @@ import { cn } from '@/lib/utils';
import { camelCase } from 'lodash';
import { ReactNode, useMemo } from 'react';
import { useFormContext } from 'react-hook-form';
import { MinerUOptionsFormField } from './mineru-options-form-field';
import { SelectWithSearch } from './originui/select-with-search';
import {
FormControl,
@ -26,11 +27,13 @@ export function LayoutRecognizeFormField({
horizontal = true,
optionsWithoutLLM,
label,
showMineruOptions = true,
}: {
name?: string;
horizontal?: boolean;
optionsWithoutLLM?: { value: string; label: string }[];
label?: ReactNode;
showMineruOptions?: boolean;
}) {
const form = useFormContext();
@ -79,35 +82,38 @@ export function LayoutRecognizeFormField({
name={name}
render={({ field }) => {
return (
<FormItem className={'items-center space-y-0 '}>
<div
className={cn('flex', {
'flex-col ': !horizontal,
'items-center': horizontal,
})}
>
<FormLabel
tooltip={t('layoutRecognizeTip')}
className={cn('text-sm text-text-secondary whitespace-wrap', {
['w-1/4']: horizontal,
<>
<FormItem className={'items-center space-y-0 '}>
<div
className={cn('flex', {
'flex-col ': !horizontal,
'items-center': horizontal,
})}
>
{label || t('layoutRecognize')}
</FormLabel>
<div className={horizontal ? 'w-3/4' : 'w-full'}>
<FormControl>
<SelectWithSearch
{...field}
options={options}
></SelectWithSearch>
</FormControl>
<FormLabel
tooltip={t('layoutRecognizeTip')}
className={cn('text-sm text-text-secondary whitespace-wrap', {
['w-1/4']: horizontal,
})}
>
{label || t('layoutRecognize')}
</FormLabel>
<div className={horizontal ? 'w-3/4' : 'w-full'}>
<FormControl>
<SelectWithSearch
{...field}
options={options}
></SelectWithSearch>
</FormControl>
</div>
</div>
</div>
<div className="flex pt-1">
<div className={horizontal ? 'w-1/4' : 'w-full'}></div>
<FormMessage />
</div>
</FormItem>
<div className="flex pt-1">
<div className={horizontal ? 'w-1/4' : 'w-full'}></div>
<FormMessage />
</div>
</FormItem>
{showMineruOptions && <MinerUOptionsFormField />}
</>
);
}}
/>

View File

@ -7,10 +7,38 @@ import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
const parseMethodOptions = buildOptions(['auto', 'txt', 'ocr']);
const languageOptions = buildOptions([
'English',
'Chinese',
'Traditional Chinese',
'Russian',
'Ukrainian',
'Indonesian',
'Spanish',
'Vietnamese',
'Japanese',
'Korean',
'Portuguese BR',
'German',
'French',
'Italian',
'Tamil',
'Telugu',
'Kannada',
'Thai',
'Greek',
'Hindi',
]);
export function MinerUOptionsFormField() {
export function MinerUOptionsFormField({
namePrefix = 'parser_config',
}: {
namePrefix?: string;
}) {
const form = useFormContext();
const { t } = useTranslation();
const buildName = (field: string) =>
namePrefix ? `${namePrefix}.${field}` : field;
const layoutRecognize = useWatch({
control: form.control,
@ -33,7 +61,7 @@ export function MinerUOptionsFormField() {
</div>
<RAGFlowFormItem
name="parser_config.mineru_parse_method"
name={buildName('mineru_parse_method')}
label={t('knowledgeConfiguration.mineruParseMethod', 'Parse Method')}
tooltip={t(
'knowledgeConfiguration.mineruParseMethodTip',
@ -52,7 +80,26 @@ export function MinerUOptionsFormField() {
</RAGFlowFormItem>
<RAGFlowFormItem
name="parser_config.mineru_formula_enable"
name={buildName('mineru_lang')}
label={t('knowledgeConfiguration.mineruLanguage', 'Language')}
tooltip={t(
'knowledgeConfiguration.mineruLanguageTip',
'Preferred OCR language for MinerU.',
)}
horizontal={true}
>
{(field) => (
<RAGFlowSelect
value={field.value || 'English'}
onChange={field.onChange}
options={languageOptions}
placeholder={t('common.selectPlaceholder', 'Select value')}
/>
)}
</RAGFlowFormItem>
<RAGFlowFormItem
name={buildName('mineru_formula_enable')}
label={t(
'knowledgeConfiguration.mineruFormulaEnable',
'Formula Recognition',
@ -73,7 +120,7 @@ export function MinerUOptionsFormField() {
</RAGFlowFormItem>
<RAGFlowFormItem
name="parser_config.mineru_table_enable"
name={buildName('mineru_table_enable')}
label={t(
'knowledgeConfiguration.mineruTableEnable',
'Table Recognition',

View File

@ -34,8 +34,13 @@ export interface IDocumentInfo {
export interface IParserConfig {
delimiter?: string;
html4excel?: boolean;
layout_recognize?: boolean;
pages: any[];
layout_recognize?: string;
pages?: any[];
chunk_token_num?: number;
auto_keywords?: number;
auto_questions?: number;
toc_extraction?: boolean;
task_page_size?: number;
raptor?: Raptor;
graphrag?: GraphRag;
}

View File

@ -1,8 +1,13 @@
export interface IChangeParserConfigRequestBody {
pages: number[][];
chunk_token_num: number;
layout_recognize: boolean;
task_page_size: number;
pages?: number[][];
chunk_token_num?: number;
layout_recognize?: string;
task_page_size?: number;
delimiter?: string;
auto_keywords?: number;
auto_questions?: number;
html4excel?: boolean;
toc_extraction?: boolean;
}
export interface IChangeParserRequestBody {

View File

@ -7,7 +7,6 @@ import { DelimiterFormField } from '@/components/delimiter-form-field';
import { ExcelToHtmlFormField } from '@/components/excel-to-html-form-field';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field';
import { MaxTokenNumberFormField } from '@/components/max-token-number-from-field';
import { MinerUOptionsFormField } from '@/components/mineru-options-form-field';
import {
ConfigurationFormContainer,
MainContainer,
@ -19,7 +18,6 @@ export function NaiveConfiguration() {
<MainContainer>
<ConfigurationFormContainer>
<LayoutRecognizeFormField></LayoutRecognizeFormField>
<MinerUOptionsFormField></MinerUOptionsFormField>
<MaxTokenNumberFormField initialValue={512}></MaxTokenNumberFormField>
<DelimiterFormField></DelimiterFormField>
<ChildrenDelimiterForm />

View File

@ -37,6 +37,7 @@ export const formSchema = z
mineru_parse_method: z.enum(['auto', 'txt', 'ocr']).optional(),
mineru_formula_enable: z.boolean().optional(),
mineru_table_enable: z.boolean().optional(),
mineru_lang: z.string().optional(),
raptor: z
.object({
use_raptor: z.boolean().optional(),

View File

@ -75,6 +75,7 @@ export default function DatasetSettings() {
mineru_parse_method: 'auto',
mineru_formula_enable: true,
mineru_table_enable: true,
mineru_lang: 'English',
raptor: {
use_raptor: true,
max_token: 256,