mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-26 00:46:52 +08:00
Refa: treat MinerU as an OCR model (#11849)
### What problem does this PR solve?

Treat MinerU as an OCR model.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Refactoring
This commit is contained in:
@ -34,7 +34,6 @@ from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, ext
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser.figure_parser import VisionFigureParser,vision_figure_parser_docx_wrapper,vision_figure_parser_pdf_wrapper
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
from deepdoc.parser.docling_parser import DoclingParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, tokenize_chunks, tokenize_chunks_with_images, tokenize_table, attach_media_context
|
||||
@ -58,27 +57,42 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
|
||||
|
||||
def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    """Parse a PDF through a tenant-configured MinerU OCR model.

    The parser is obtained via ``LLMBundle(tenant_id, LLMType.OCR, ...)``.
    ``LLMBundle``, ``LLMType`` and ``logging`` are expected to be available at
    module level (their imports are not visible in this excerpt).

    Args:
        filename: Path of the PDF, forwarded as ``filepath``.
        binary: Optional raw PDF content, forwarded unchanged.
        from_page, to_page, pdf_cls: Accepted for signature parity with the
            other ``by_*`` parsers; not used here.
        lang: Language passed to ``LLMBundle``.
        callback: Optional progress callback; called as
            ``callback(-1, "MinerU not found.")`` when no result is produced.
        **kwargs: Recognised keys are ``parse_method`` (default ``"raw"``),
            ``mineru_llm_name`` and ``tenant_id``.

    Returns:
        ``(sections, tables, pdf_parser)`` on success, otherwise
        ``(None, None, None)``.
    """
    parse_method = kwargs.get("parse_method", "raw")
    mineru_llm_name = kwargs.get("mineru_llm_name")
    tenant_id = kwargs.get("tenant_id")

    # Without a tenant there is nothing to look up; fall through to the
    # "not found" path at the bottom.
    if tenant_id:
        if not mineru_llm_name:
            # No model name supplied by the caller: try to discover one for
            # this tenant. Any failure here is tolerated on purpose.
            try:
                from api.db.services.tenant_llm_service import TenantLLMService

                env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
                candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
                if candidates:
                    mineru_llm_name = candidates[0].llm_name
                elif env_name:
                    mineru_llm_name = env_name
            except Exception as e:  # best-effort fallback
                logging.warning(f"fallback to env mineru: {e}")

        if mineru_llm_name:
            try:
                ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=mineru_llm_name, lang=lang)
                pdf_parser = ocr_model.mdl
                sections, tables = pdf_parser.parse_pdf(
                    filepath=filename,
                    binary=binary,
                    callback=callback,
                    parse_method=parse_method,
                )
                return sections, tables, pdf_parser
            except Exception as e:
                # Parsing errors are logged and reported to the caller through
                # the common failure path below rather than propagated.
                logging.error(f"Failed to parse pdf via LLMBundle MinerU ({mineru_llm_name}): {e}")

    if callback:
        callback(-1, "MinerU not found.")
    return None, None, None
|
||||
|
||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls = None ,**kwargs):
|
||||
pdf_parser = DoclingParser()
|
||||
@ -692,7 +706,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
|
||||
layout_recognizer_raw = parser_config.get("layout_recognize", "DeepDOC")
|
||||
parser_model_name = None
|
||||
layout_recognizer = layout_recognizer_raw
|
||||
if isinstance(layout_recognizer_raw, str):
|
||||
lowered = layout_recognizer_raw.lower()
|
||||
if lowered.startswith("mineru@"):
|
||||
parser_model_name = layout_recognizer_raw.split("@", 1)[1]
|
||||
layout_recognizer = "MinerU"
|
||||
|
||||
if parser_config.get("analyze_hyperlink", False) and is_root:
|
||||
urls = extract_links_from_pdf(binary)
|
||||
|
||||
@ -711,6 +733,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
lang = lang,
|
||||
callback = callback,
|
||||
layout_recognizer = layout_recognizer,
|
||||
mineru_llm_name = parser_model_name,
|
||||
**kwargs
|
||||
)
|
||||
|
||||
|
||||
@ -31,7 +31,6 @@ from common import settings
|
||||
from common.constants import LLMType
|
||||
from common.misc_utils import get_uuid
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from rag.app.naive import Docx
|
||||
@ -235,25 +234,55 @@ class Parser(ProcessBase):
|
||||
conf = self._param.setups["pdf"]
|
||||
self.set_output("output_format", conf["output_format"])
|
||||
|
||||
if conf.get("parse_method").lower() == "deepdoc":
|
||||
raw_parse_method = conf.get("parse_method", "")
|
||||
parser_model_name = None
|
||||
parse_method = raw_parse_method
|
||||
parse_method = parse_method or ""
|
||||
if isinstance(raw_parse_method, str):
|
||||
lowered = raw_parse_method.lower()
|
||||
if lowered.startswith("mineru@"):
|
||||
parser_model_name = raw_parse_method.split("@", 1)[1]
|
||||
parse_method = "MinerU"
|
||||
elif lowered.endswith("@mineru"):
|
||||
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
||||
parse_method = "MinerU"
|
||||
|
||||
if parse_method.lower() == "deepdoc":
|
||||
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
|
||||
elif conf.get("parse_method").lower() == "plain_text":
|
||||
elif parse_method.lower() == "plain_text":
|
||||
lines, _ = PlainParser()(blob)
|
||||
bboxes = [{"text": t} for t, _ in lines]
|
||||
elif conf.get("parse_method").lower() == "mineru":
|
||||
mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
mineru_api = os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987")
|
||||
pdf_parser = MinerUParser(mineru_path=mineru_executable, mineru_api=mineru_api)
|
||||
ok, reason = pdf_parser.check_installation()
|
||||
if not ok:
|
||||
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
||||
elif parse_method.lower() == "mineru":
|
||||
def resolve_mineru_llm_name():
|
||||
configured = parser_model_name or conf.get("mineru_llm_name")
|
||||
if configured:
|
||||
return configured
|
||||
|
||||
tenant_id = self._canvas._tenant_id
|
||||
if not tenant_id:
|
||||
return None
|
||||
|
||||
from api.db.services.tenant_llm_service import TenantLLMService
|
||||
|
||||
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
|
||||
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
|
||||
if candidates:
|
||||
return candidates[0].llm_name
|
||||
return env_name
|
||||
|
||||
parser_model_name = resolve_mineru_llm_name()
|
||||
if not parser_model_name:
|
||||
raise RuntimeError("MinerU model not configured. Please add MinerU in Model Providers or set MINERU_* env.")
|
||||
|
||||
tenant_id = self._canvas._tenant_id
|
||||
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=parser_model_name, lang=conf.get("lang", "Chinese"))
|
||||
pdf_parser = ocr_model.mdl
|
||||
|
||||
lines, _ = pdf_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||
parse_method=conf.get("mineru_parse_method", "raw"),
|
||||
)
|
||||
bboxes = []
|
||||
for t, poss in lines:
|
||||
@ -263,7 +292,7 @@ class Parser(ProcessBase):
|
||||
"text": t,
|
||||
}
|
||||
bboxes.append(box)
|
||||
elif conf.get("parse_method").lower() == "tcadp parser":
|
||||
elif parse_method.lower() == "tcadp parser":
|
||||
# ADP is a document parsing tool using Tencent Cloud API
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
markdown_image_response_type = conf.get("markdown_image_response_type", "1")
|
||||
|
||||
@ -121,6 +121,7 @@ EmbeddingModel = globals().get("EmbeddingModel", {})
|
||||
RerankModel = globals().get("RerankModel", {})
|
||||
Seq2txtModel = globals().get("Seq2txtModel", {})
|
||||
TTSModel = globals().get("TTSModel", {})
|
||||
OcrModel = globals().get("OcrModel", {})
|
||||
|
||||
|
||||
MODULE_MAPPING = {
|
||||
@ -130,6 +131,7 @@ MODULE_MAPPING = {
|
||||
"rerank_model": RerankModel,
|
||||
"sequence2txt_model": Seq2txtModel,
|
||||
"tts_model": TTSModel,
|
||||
"ocr_model": OcrModel,
|
||||
}
|
||||
|
||||
package_name = __name__
|
||||
@ -171,4 +173,5 @@ __all__ = [
|
||||
"RerankModel",
|
||||
"Seq2txtModel",
|
||||
"TTSModel",
|
||||
"OcrModel",
|
||||
]
|
||||
|
||||
76
rag/llm/ocr_model.py
Normal file
76
rag/llm/ocr_model.py
Normal file
@ -0,0 +1,76 @@
|
||||
#
|
||||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Optional, Tuple
|
||||
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
|
||||
|
||||
class Base:
    """Minimal interface shared by OCR model wrappers.

    Concrete models are expected to override :meth:`parse_pdf`.
    """

    def __init__(self, key: str, model_name: str, **kwargs):
        """Remember the model name.

        ``key`` and any extra keyword arguments are accepted so that all
        models share one constructor signature; the base class does not
        store them.
        """
        self.model_name = model_name

    def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
        """Parse a PDF document; the base implementation always raises.

        Raises:
            NotImplementedError: Always, until a subclass overrides this.
        """
        raise NotImplementedError("Please implement parse_pdf!")
|
||||
|
||||
|
||||
class MinerUOcrModel(Base, MinerUParser):
    """Expose a MinerU parser as an OCR model.

    Settings are taken from the JSON object carried in ``key``; any setting
    missing there falls back to the environment variable of the same name and
    then to a built-in default:

    * ``MINERU_APISERVER``     (default ``http://host.docker.internal:9987``)
    * ``MINERU_OUTPUT_DIR``    (default ``""``)
    * ``MINERU_BACKEND``       (default ``"pipeline"``)
    * ``MINERU_SERVER_URL``    (default ``""``)
    * ``MINERU_DELETE_OUTPUT`` (default ``1``, i.e. true)

    ``MINERU_EXECUTABLE`` is read from the environment only
    (default ``"mineru"``).
    """

    _FACTORY_NAME = "MinerU"

    def __init__(self, key: str, model_name: str, **kwargs):
        """Build the model from a JSON-encoded configuration string.

        Args:
            key: JSON object with ``MINERU_*`` settings. Empty, unparsable or
                non-object values are treated as "no configuration".
            model_name: Name stored on the instance by ``Base``.
            **kwargs: Forwarded to ``Base.__init__``.
        """
        Base.__init__(self, key, model_name, **kwargs)
        cfg = self._load_config(key)

        self.mineru_api = cfg.get("MINERU_APISERVER", os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987"))
        self.mineru_output_dir = cfg.get("MINERU_OUTPUT_DIR", os.environ.get("MINERU_OUTPUT_DIR", ""))
        self.mineru_backend = cfg.get("MINERU_BACKEND", os.environ.get("MINERU_BACKEND", "pipeline"))
        self.mineru_server_url = cfg.get("MINERU_SERVER_URL", os.environ.get("MINERU_SERVER_URL", ""))
        self.mineru_delete_output = self._as_bool(cfg.get("MINERU_DELETE_OUTPUT", os.environ.get("MINERU_DELETE_OUTPUT", 1)))
        self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")

        logging.info(f"Parsered MinerU config: {cfg}")

        # Both bases are initialised explicitly; their constructors take
        # different arguments, so a single super().__init__() chain is not used.
        MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)

    @staticmethod
    def _load_config(key: Optional[str]) -> dict:
        """Decode ``key`` as a JSON object, returning ``{}`` when that fails."""
        if not key:
            return {}
        try:
            cfg = json.loads(key)
        except Exception:
            # Best effort: a key that is not JSON simply carries no settings.
            return {}
        # json.loads happily returns ints, lists, None, ...; only a mapping
        # can be queried with .get() below.
        return cfg if isinstance(cfg, dict) else {}

    @staticmethod
    def _as_bool(value: Any, default: bool = True) -> bool:
        """Interpret a config/env flag as a boolean.

        Integer-like values keep their ``bool(int(value))`` meaning; the words
        true/yes/on and false/no/off are accepted as well. Anything else
        yields ``default`` instead of raising.
        """
        if isinstance(value, bool):
            return value
        try:
            return bool(int(value))
        except (TypeError, ValueError):
            pass
        text = str(value).strip().lower()
        if text in ("true", "yes", "on"):
            return True
        if text in ("false", "no", "off"):
            return False
        return default

    def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
        """Run ``check_installation`` with the given or configured backend/server URL.

        Args:
            backend: Overrides ``self.mineru_backend`` when truthy.
            server_url: Overrides ``self.mineru_server_url`` when truthy.

        Returns:
            The result of ``check_installation``; ``parse_pdf`` unpacks it as
            ``(ok, reason)``.
        """
        backend = backend or self.mineru_backend
        server_url = server_url or self.mineru_server_url
        return self.check_installation(backend=backend, server_url=server_url)

    def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
        """Parse a PDF with MinerU using this instance's configuration.

        Args:
            filepath: Path of the PDF.
            binary: Optional raw PDF content.
            callback: Optional progress callback forwarded to the parser.
            parse_method: Forwarded to ``MinerUParser.parse_pdf``.
            **kwargs: Accepted for call-site compatibility and ignored; output
                directory, backend, server URL and output deletion always come
                from the instance settings.

        Returns:
            ``(sections, tables)`` as produced by ``MinerUParser.parse_pdf``.

        Raises:
            RuntimeError: If the availability check fails.
        """
        ok, reason = self.check_available()
        if not ok:
            raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")

        sections, tables = MinerUParser.parse_pdf(
            self,
            filepath=filepath,
            binary=binary,
            callback=callback,
            output_dir=self.mineru_output_dir,
            backend=self.mineru_backend,
            server_url=self.mineru_server_url,
            delete_output=self.mineru_delete_output,
            parse_method=parse_method,
        )
        return sections, tables
|
||||
Reference in New Issue
Block a user