Refa: treat MinerU as an OCR model (#11849)

### What problem does this PR solve?

 Treat MinerU as an OCR model.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
This commit is contained in:
Yongteng Lei
2025-12-09 18:54:14 +08:00
committed by GitHub
parent 30377319d8
commit a94b3b9df2
9 changed files with 283 additions and 43 deletions

View File

@ -31,7 +31,6 @@ from common import settings
from common.constants import LLMType
from common.misc_utils import get_uuid
from deepdoc.parser import ExcelParser
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.app.naive import Docx
@ -235,25 +234,55 @@ class Parser(ProcessBase):
conf = self._param.setups["pdf"]
self.set_output("output_format", conf["output_format"])
if conf.get("parse_method").lower() == "deepdoc":
raw_parse_method = conf.get("parse_method", "")
parser_model_name = None
parse_method = raw_parse_method
parse_method = parse_method or ""
if isinstance(raw_parse_method, str):
lowered = raw_parse_method.lower()
if lowered.startswith("mineru@"):
parser_model_name = raw_parse_method.split("@", 1)[1]
parse_method = "MinerU"
elif lowered.endswith("@mineru"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "MinerU"
if parse_method.lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
elif conf.get("parse_method").lower() == "plain_text":
elif parse_method.lower() == "plain_text":
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
elif conf.get("parse_method").lower() == "mineru":
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
ok, reason = pdf_parser.check_installation()
if not ok:
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
elif parse_method.lower() == "mineru":
def resolve_mineru_llm_name():
configured = parser_model_name or conf.get("mineru_llm_name")
if configured:
return configured
tenant_id = self._canvas._tenant_id
if not tenant_id:
return None
from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
if candidates:
return candidates[0].llm_name
return env_name
parser_model_name = resolve_mineru_llm_name()
if not parser_model_name:
raise RuntimeError("MinerU model not configured. Please add MinerU in Model Providers or set MINERU_* env.")
tenant_id = self._canvas._tenant_id
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=parser_model_name, lang=conf.get("lang", "Chinese"))
pdf_parser = ocr_model.mdl
lines, _ = pdf_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=conf.get("mineru_parse_method", "raw"),
)
bboxes = []
for t, poss in lines:
@ -263,7 +292,7 @@ class Parser(ProcessBase):
"text": t,
}
bboxes.append(box)
elif conf.get("parse_method").lower() == "tcadp parser":
elif parse_method.lower() == "tcadp parser":
# ADP is a document parsing tool using Tencent Cloud API
table_result_type = conf.get("table_result_type", "1")
markdown_image_response_type = conf.get("markdown_image_response_type", "1")