mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-20 04:39:00 +08:00
Refa: treat MinerU as an OCR model 2 (#11905)
### What problem does this PR solve?

Treat MinerU as an OCR model 2. #11903

### Type of change

- [x] Refactoring
This commit is contained in:
@ -192,6 +192,9 @@ async def add_llm():
|
|||||||
elif factory == "OpenRouter":
|
elif factory == "OpenRouter":
|
||||||
api_key = apikey_json(["api_key", "provider_order"])
|
api_key = apikey_json(["api_key", "provider_order"])
|
||||||
|
|
||||||
|
elif factory == "MinerU":
|
||||||
|
api_key = apikey_json(["api_key", "provider_order"])
|
||||||
|
|
||||||
llm = {
|
llm = {
|
||||||
"tenant_id": current_user.id,
|
"tenant_id": current_user.id,
|
||||||
"llm_factory": factory,
|
"llm_factory": factory,
|
||||||
|
|||||||
@ -16,6 +16,7 @@
|
|||||||
import os
|
import os
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
from peewee import IntegrityError
|
||||||
from langfuse import Langfuse
|
from langfuse import Langfuse
|
||||||
from common import settings
|
from common import settings
|
||||||
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType
|
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType
|
||||||
@ -274,11 +275,13 @@ class TenantLLMService(CommonService):
|
|||||||
used_names = {item.llm_name for item in saved_mineru_models}
|
used_names = {item.llm_name for item in saved_mineru_models}
|
||||||
idx = 1
|
idx = 1
|
||||||
base_name = "mineru-from-env"
|
base_name = "mineru-from-env"
|
||||||
|
while True:
|
||||||
candidate = f"{base_name}-{idx}"
|
candidate = f"{base_name}-{idx}"
|
||||||
while candidate in used_names:
|
if candidate in used_names:
|
||||||
idx += 1
|
idx += 1
|
||||||
candidate = f"{base_name}-{idx}"
|
continue
|
||||||
|
|
||||||
|
try:
|
||||||
cls.save(
|
cls.save(
|
||||||
tenant_id=tenant_id,
|
tenant_id=tenant_id,
|
||||||
llm_factory="MinerU",
|
llm_factory="MinerU",
|
||||||
@ -289,6 +292,11 @@ class TenantLLMService(CommonService):
|
|||||||
max_tokens=0,
|
max_tokens=0,
|
||||||
)
|
)
|
||||||
return candidate
|
return candidate
|
||||||
|
except IntegrityError:
|
||||||
|
logging.warning("MinerU env model %s already exists for tenant %s, retry with next name", candidate, tenant_id)
|
||||||
|
used_names.add(candidate)
|
||||||
|
idx += 1
|
||||||
|
continue
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
@DB.connection_context()
|
@DB.connection_context()
|
||||||
|
|||||||
@ -54,7 +54,7 @@ class MinerUContentType(StrEnum):
|
|||||||
|
|
||||||
|
|
||||||
class MinerUParser(RAGFlowPdfParser):
|
class MinerUParser(RAGFlowPdfParser):
|
||||||
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "http://host.docker.internal:9987", mineru_server_url: str = ""):
|
def __init__(self, mineru_path: str = "mineru", mineru_api: str = "", mineru_server_url: str = ""):
|
||||||
self.mineru_path = Path(mineru_path)
|
self.mineru_path = Path(mineru_path)
|
||||||
self.mineru_api = mineru_api.rstrip("/")
|
self.mineru_api = mineru_api.rstrip("/")
|
||||||
self.mineru_server_url = mineru_server_url.rstrip("/")
|
self.mineru_server_url = mineru_server_url.rstrip("/")
|
||||||
@ -176,7 +176,9 @@ class MinerUParser(RAGFlowPdfParser):
|
|||||||
self.using_api = openapi_exists
|
self.using_api = openapi_exists
|
||||||
return openapi_exists, reason
|
return openapi_exists, reason
|
||||||
else:
|
else:
|
||||||
self.logger.info("[MinerU] api not exists.")
|
reason = "[MinerU] api not exists. Setting MINERU_SERVER_URL if your backend is vlm-http-client."
|
||||||
|
self.logger.info(reason)
|
||||||
|
return False, reason
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
reason = f"[MinerU] Unexpected error during api check: {e}"
|
reason = f"[MinerU] Unexpected error during api check: {e}"
|
||||||
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
|
self.logger.error(f"[MinerU] Unexpected error during api check: {e}")
|
||||||
|
|||||||
@ -236,8 +236,9 @@ USE_DOCLING=false
|
|||||||
# Enable Mineru
|
# Enable Mineru
|
||||||
USE_MINERU=false
|
USE_MINERU=false
|
||||||
MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
|
MINERU_EXECUTABLE="$HOME/uv_tools/.venv/bin/mineru"
|
||||||
MINERU_DELETE_OUTPUT=0 # keep output directory
|
# Uncommenting these lines will automatically add MinerU to the model provider whenever possible.
|
||||||
MINERU_BACKEND=pipeline # or another backend you prefer
|
# MINERU_DELETE_OUTPUT=0 # keep output directory
|
||||||
|
# MINERU_BACKEND=pipeline # or another backend you prefer
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@ -68,7 +68,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
|||||||
from api.db.services.tenant_llm_service import TenantLLMService
|
from api.db.services.tenant_llm_service import TenantLLMService
|
||||||
|
|
||||||
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
|
env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
|
||||||
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
|
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR)
|
||||||
if candidates:
|
if candidates:
|
||||||
mineru_llm_name = candidates[0].llm_name
|
mineru_llm_name = candidates[0].llm_name
|
||||||
elif env_name:
|
elif env_name:
|
||||||
@ -78,7 +78,7 @@ def by_mineru(filename, binary=None, from_page=0, to_page=100000, lang="Chinese"
|
|||||||
|
|
||||||
if mineru_llm_name:
|
if mineru_llm_name:
|
||||||
try:
|
try:
|
||||||
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=mineru_llm_name, lang=lang)
|
ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=mineru_llm_name, lang=lang)
|
||||||
pdf_parser = ocr_model.mdl
|
pdf_parser = ocr_model.mdl
|
||||||
sections, tables = pdf_parser.parse_pdf(
|
sections, tables = pdf_parser.parse_pdf(
|
||||||
filepath=filename,
|
filepath=filename,
|
||||||
@ -711,8 +711,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
|||||||
layout_recognizer = layout_recognizer_raw
|
layout_recognizer = layout_recognizer_raw
|
||||||
if isinstance(layout_recognizer_raw, str):
|
if isinstance(layout_recognizer_raw, str):
|
||||||
lowered = layout_recognizer_raw.lower()
|
lowered = layout_recognizer_raw.lower()
|
||||||
if lowered.startswith("mineru@"):
|
if lowered.endswith("@mineru"):
|
||||||
parser_model_name = layout_recognizer_raw.split("@", 1)[1]
|
parser_model_name = layout_recognizer_raw.split("@", 1)[0]
|
||||||
layout_recognizer = "MinerU"
|
layout_recognizer = "MinerU"
|
||||||
|
|
||||||
if parser_config.get("analyze_hyperlink", False) and is_root:
|
if parser_config.get("analyze_hyperlink", False) and is_root:
|
||||||
|
|||||||
@ -240,10 +240,7 @@ class Parser(ProcessBase):
|
|||||||
parse_method = parse_method or ""
|
parse_method = parse_method or ""
|
||||||
if isinstance(raw_parse_method, str):
|
if isinstance(raw_parse_method, str):
|
||||||
lowered = raw_parse_method.lower()
|
lowered = raw_parse_method.lower()
|
||||||
if lowered.startswith("mineru@"):
|
if lowered.endswith("@mineru"):
|
||||||
parser_model_name = raw_parse_method.split("@", 1)[1]
|
|
||||||
parse_method = "MinerU"
|
|
||||||
elif lowered.endswith("@mineru"):
|
|
||||||
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
||||||
parse_method = "MinerU"
|
parse_method = "MinerU"
|
||||||
|
|
||||||
|
|||||||
@ -22,7 +22,7 @@ from deepdoc.parser.mineru_parser import MinerUParser
|
|||||||
|
|
||||||
|
|
||||||
class Base:
|
class Base:
|
||||||
def __init__(self, key: str, model_name: str, **kwargs):
|
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||||
self.model_name = model_name
|
self.model_name = model_name
|
||||||
|
|
||||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
|
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
|
||||||
@ -32,23 +32,23 @@ class Base:
|
|||||||
class MinerUOcrModel(Base, MinerUParser):
|
class MinerUOcrModel(Base, MinerUParser):
|
||||||
_FACTORY_NAME = "MinerU"
|
_FACTORY_NAME = "MinerU"
|
||||||
|
|
||||||
def __init__(self, key: str, model_name: str, **kwargs):
|
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||||
Base.__init__(self, key, model_name, **kwargs)
|
Base.__init__(self, key, model_name, **kwargs)
|
||||||
cfg = {}
|
config = {}
|
||||||
if key:
|
if key:
|
||||||
try:
|
try:
|
||||||
cfg = json.loads(key)
|
config = json.loads(key)
|
||||||
except Exception:
|
except Exception:
|
||||||
cfg = {}
|
config = {}
|
||||||
|
config = config["api_key"]
|
||||||
self.mineru_api = cfg.get("MINERU_APISERVER", os.environ.get("MINERU_APISERVER", "http://host.docker.internal:9987"))
|
self.mineru_api = config.get("mineru_apiserver", os.environ.get("MINERU_APISERVER", ""))
|
||||||
self.mineru_output_dir = cfg.get("MINERU_OUTPUT_DIR", os.environ.get("MINERU_OUTPUT_DIR", ""))
|
self.mineru_output_dir = config.get("mineru_output_dir", os.environ.get("MINERU_OUTPUT_DIR", ""))
|
||||||
self.mineru_backend = cfg.get("MINERU_BACKEND", os.environ.get("MINERU_BACKEND", "pipeline"))
|
self.mineru_backend = config.get("mineru_backend", os.environ.get("MINERU_BACKEND", "pipeline"))
|
||||||
self.mineru_server_url = cfg.get("MINERU_SERVER_URL", os.environ.get("MINERU_SERVER_URL", ""))
|
self.mineru_server_url = config.get("mineru_server_url", os.environ.get("MINERU_SERVER_URL", ""))
|
||||||
self.mineru_delete_output = bool(int(cfg.get("MINERU_DELETE_OUTPUT", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
|
self.mineru_delete_output = bool(int(config.get("mineru_delete_output", os.environ.get("MINERU_DELETE_OUTPUT", 1))))
|
||||||
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||||
|
|
||||||
logging.info(f"Parsered MinerU config: {cfg}")
|
logging.info(f"Parsed MinerU config: {config}")
|
||||||
|
|
||||||
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user