Refa: treat MinerU as an OCR model 2 (#11905)

### What problem does this PR solve?

Treat MinerU as an OCR model (part 2). #11903

### Type of change

- [x] Refactoring
This commit is contained in:
Yongteng Lei
2025-12-11 17:33:12 +08:00
committed by GitHub
parent bd0eff2954
commit e9710b7aa9
7 changed files with 50 additions and 39 deletions

View File

@ -68,7 +68,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR)
if candidates:
mineru_llm_name = candidates[0].llm_name
elif env_name:
@ -78,7 +78,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
if mineru_llm_name:
try:
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=mineru_llm_name, lang=lang)
ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=mineru_llm_name, lang=lang)
pdf_parser = ocr_model.mdl
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
@ -711,8 +711,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
layout_recognizer = layout_recognizer_raw
if isinstance(layout_recognizer_raw, str):
lowered = layout_recognizer_raw.lower()
if lowered.startswith("mineru@"):
parser_model_name = layout_recognizer_raw.split("@", 1)[1]
if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.split("@", 1)[0]
layout_recognizer = "MinerU"
if parser_config.get("analyze_hyperlink", False) and is_root:

View File

@ -240,10 +240,7 @@ class Parser(ProcessBase):
parse_method = parse_method or ""
if isinstance(raw_parse_method, str):
lowered = raw_parse_method.lower()
if lowered.startswith("mineru@"):
parser_model_name = raw_parse_method.split("@", 1)[1]
parse_method = "MinerU"
elif lowered.endswith("@mineru"):
if lowered.endswith("@mineru"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "MinerU"
@ -853,4 +850,4 @@ class Parser(ProcessBase):
for t in tasks:
t.cancel()
await asyncio.gather(*tasks, return_exceptions=True)
raise
raise

View File

@ -22,7 +22,7 @@ from deepdoc.parser.mineru_parser import MinerUParser
class Base:
def __init__(self, key: str, model_name: str, **kwargs):
def __init__(self, key: str | dict, model_name: str, **kwargs):
self.model_name = model_name
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
@ -32,23 +32,23 @@ class Base:
class MinerUOcrModel(Base, MinerUParser):
_FACTORY_NAME = "MinerU"
def __init__(self, key: str, model_name: str, **kwargs):
def __init__(self, key: str | dict, model_name: str, **kwargs):
Base.__init__(self, key, model_name, **kwargs)
cfg = {}
config = {}
if key:
try:
cfg = json.loads(key)
config = json.loads(key)
except Exception:
cfg = {}
self.mineru_api = cfg.get("MINERU_APISERVER", os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987"))
self.mineru_output_dir = cfg.get("MINERU_OUTPUT_DIR", os.environ.get("MINERU_OUTPUT_DIR", ""))
self.mineru_backend = cfg.get("MINERU_BACKEND", os.environ.get("MINERU_BACKEND", "pipeline"))
self.mineru_server_url = cfg.get("MINERU_SERVER_URL", os.environ.get("MINERU_SERVER_URL", ""))
self.mineru_delete_output = bool(int(cfg.get("MINERU_DELETE_OUTPUT", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
config = {}
config = config["api_key"]
self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", ""))
self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", ""))
self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline"))
self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", ""))
self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
logging.info(f"Parsered MinerU config: {cfg}")
logging.info(f"Parsed MinerU config: {config}")
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)