Refa: treat MinerU as an OCR model 2 (#11905)

### What problem does this PR solve?

Treat MinerU as an OCR model (part 2 of the refactoring). #11903

### Type of change

- [x] Refactoring
This commit is contained in:
Yongteng Lei
2025-12-11 17:33:12 +08:00
committed by GitHub
parent bd0eff2954
commit e9710b7aa9
7 changed files with 50 additions and 39 deletions

View File

@ -192,6 +192,9 @@ async def add_llm():
elif factory == "OpenRouter": elif factory == "OpenRouter":
api_key = apikey_json(["api_key", "provider_order"]) api_key = apikey_json(["api_key", "provider_order"])
elif factory == "MinerU":
api_key = apikey_json(["api_key", "provider_order"])
llm = { llm = {
"tenant_id": current_user.id, "tenant_id": current_user.id,
"llm_factory": factory, "llm_factory": factory,

View File

@ -16,6 +16,7 @@
import os import os
import json import json
import logging import logging
from peewee import IntegrityError
from langfuse import Langfuse from langfuse import Langfuse
from common import settings from common import settings
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType
@ -274,21 +275,28 @@ class TenantLLMService(CommonService):
used_names = {item.llm_name for item in saved_mineru_models} used_names = {item.llm_name for item in saved_mineru_models}
idx = 1 idx = 1
base_name = "mineru-from-env" base_name = "mineru-from-env"
candidate = f"{base_name}-{idx}" while True:
while candidate in used_names:
idx += 1
candidate = f"{base_name}-{idx}" candidate = f"{base_name}-{idx}"
if candidate in used_names:
idx += 1
continue
cls.save( try:
tenant_id=tenant_id, cls.save(
llm_factory="MinerU", tenant_id=tenant_id,
llm_name=candidate, llm_factory="MinerU",
model_type=LLMType.OCR.value, llm_name=candidate,
api_key=json.dumps(cfg), model_type=LLMType.OCR.value,
api_base="", api_key=json.dumps(cfg),
max_tokens=0, api_base="",
) max_tokens=0,
return candidate )
return candidate
except IntegrityError:
logging.warning("MinerU env model %s already exists for tenant %s, retry with next name", candidate, tenant_id)
used_names.add(candidate)
idx += 1
continue
@classmethod @classmethod
@DB.connection_context() @DB.connection_context()

View File

@ -54,7 +54,7 @@ class MinerUContentType(StrEnum):
class MinerUParser(RAGFlowPdfParser): class MinerUParser(RAGFlowPdfParser):
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""): def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
self.mineru_path = Path(mineru_path) self.mineru_path = Path(mineru_path)
self.mineru_api = mineru_api.rstrip("/") self.mineru_api = mineru_api.rstrip("/")
self.mineru_server_url = mineru_server_url.rstrip("/") self.mineru_server_url = mineru_server_url.rstrip("/")
@ -176,7 +176,9 @@ class MinerUParser(RAGFlowPdfParser):
self.using_api = openapi_exists self.using_api = openapi_exists
return openapi_exists, reason return openapi_exists, reason
else: else:
self.logger.info("[MinerU] api not exists.") reason = "[MinerU] api not exists. Setting MINERU_SERVER_URL if your backend is vlm-http-client."
self.logger.info(reason)
return False, reason
except Exception as e: except Exception as e:
reason = f"[MinerU] Unexpected error during api check: {e}" reason = f"[MinerU] Unexpected error during api check: {e}"
self.logger.error(f"[MinerU] Unexpected error during api check: {e}") self.logger.error(f"[MinerU] Unexpected error during api check: {e}")

View File

@ -236,8 +236,9 @@ USE_DOCLING=false
# Enable Mineru # Enable Mineru
USE_MINERU=false USE_MINERU=false
MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru" MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
MINERU_DELETE_OUTPUT=0 # keep output directory # Uncomment the lines below to automatically add MinerU to the model providers whenever possible.
MINERU_BACKEND=pipeline # or another backend you prefer # MINERU_DELETE_OUTPUT=0 # keep output directory
# MINERU_BACKEND=pipeline # or another backend you prefer

View File

@ -68,7 +68,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
from api.db.services.tenant_llm_service import TenantLLMService from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id) env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value) candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR)
if candidates: if candidates:
mineru_llm_name = candidates[0].llm_name mineru_llm_name = candidates[0].llm_name
elif env_name: elif env_name:
@ -78,7 +78,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
if mineru_llm_name: if mineru_llm_name:
try: try:
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=mineru_llm_name, lang=lang) ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=mineru_llm_name, lang=lang)
pdf_parser = ocr_model.mdl pdf_parser = ocr_model.mdl
sections, tables = pdf_parser.parse_pdf( sections, tables = pdf_parser.parse_pdf(
filepath=filename, filepath=filename,
@ -711,8 +711,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
layout_recognizer = layout_recognizer_raw layout_recognizer = layout_recognizer_raw
if isinstance(layout_recognizer_raw, str): if isinstance(layout_recognizer_raw, str):
lowered = layout_recognizer_raw.lower() lowered = layout_recognizer_raw.lower()
if lowered.startswith("mineru@"): if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.split("@", 1)[1] parser_model_name = layout_recognizer_raw.split("@", 1)[0]
layout_recognizer = "MinerU" layout_recognizer = "MinerU"
if parser_config.get("analyze_hyperlink", False) and is_root: if parser_config.get("analyze_hyperlink", False) and is_root:

View File

@ -240,10 +240,7 @@ class Parser(ProcessBase):
parse_method = parse_method or "" parse_method = parse_method or ""
if isinstance(raw_parse_method, str): if isinstance(raw_parse_method, str):
lowered = raw_parse_method.lower() lowered = raw_parse_method.lower()
if lowered.startswith("mineru@"): if lowered.endswith("@mineru"):
parser_model_name = raw_parse_method.split("@", 1)[1]
parse_method = "MinerU"
elif lowered.endswith("@mineru"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0] parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "MinerU" parse_method = "MinerU"

View File

@ -22,7 +22,7 @@ from deepdoc.parser.mineru_parser import MinerUParser
class Base: class Base:
def __init__(self, key: str, model_name: str, **kwargs): def __init__(self, key: str | dict, model_name: str, **kwargs):
self.model_name = model_name self.model_name = model_name
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]: def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
@ -32,23 +32,23 @@ class Base:
class MinerUOcrModel(Base, MinerUParser): class MinerUOcrModel(Base, MinerUParser):
_FACTORY_NAME = "MinerU" _FACTORY_NAME = "MinerU"
def __init__(self, key: str, model_name: str, **kwargs): def __init__(self, key: str | dict, model_name: str, **kwargs):
Base.__init__(self, key, model_name, **kwargs) Base.__init__(self, key, model_name, **kwargs)
cfg = {} config = {}
if key: if key:
try: try:
cfg = json.loads(key) config = json.loads(key)
except Exception: except Exception:
cfg = {} config = {}
config = config["api_key"]
self.mineru_api = cfg.get("MINERU_APISERVER", os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")) self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", ""))
self.mineru_output_dir = cfg.get("MINERU_OUTPUT_DIR", os.environ.get("MINERU_OUTPUT_DIR", "")) self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", ""))
self.mineru_backend = cfg.get("MINERU_BACKEND", os.environ.get("MINERU_BACKEND", "pipeline")) self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline"))
self.mineru_server_url = cfg.get("MINERU_SERVER_URL", os.environ.get("MINERU_SERVER_URL", "")) self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", ""))
self.mineru_delete_output = bool(int(cfg.get("MINERU_DELETE_OUTPUT", os.environ.get("MINERU_DELETE_OUTPUT", 1)))) self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru") self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
logging.info(f"Parsered MinerU config: {cfg}") logging.info(f"Parsed MinerU config: {config}")
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url) MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)