mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-19 20:16:49 +08:00
Refa: only support MinerU-API now (#11977)
### What problem does this PR solve? Only support MinerU-API now, still need to complete frontend for pipeline to allow the configuration of MinerU options. ### Type of change - [x] Refactoring
This commit is contained in:
@ -16,7 +16,7 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Optional, Tuple
|
||||
from typing import Any, Optional
|
||||
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
|
||||
@ -25,7 +25,7 @@ class Base:
|
||||
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||
self.model_name = model_name
|
||||
|
||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
|
||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> tuple[Any, Any]:
|
||||
raise NotImplementedError("Please implement parse_pdf!")
|
||||
|
||||
|
||||
@ -56,21 +56,22 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||
self.mineru_backend = _resolve_config("mineru_backend", "MINERU_BACKEND", "pipeline")
|
||||
self.mineru_server_url = _resolve_config("mineru_server_url", "MINERU_SERVER_URL", "")
|
||||
self.mineru_delete_output = bool(int(_resolve_config("mineru_delete_output", "MINERU_DELETE_OUTPUT", 1)))
|
||||
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
|
||||
logging.info(f"Parsed MinerU config: {config}")
|
||||
logging.info(
|
||||
f"Parsed MinerU config: backend={self.mineru_backend} api={self.mineru_api} server_url={self.mineru_server_url} output_dir={self.mineru_output_dir} delete_output={self.mineru_delete_output}"
|
||||
)
|
||||
|
||||
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
|
||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
|
||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||
backend = backend or self.mineru_backend
|
||||
server_url = server_url or self.mineru_server_url
|
||||
return self.check_installation(backend=backend, server_url=server_url)
|
||||
|
||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs):
|
||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
|
||||
ok, reason = self.check_available()
|
||||
if not ok:
|
||||
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
||||
raise RuntimeError(f"MinerU server not accessible: {reason}")
|
||||
|
||||
sections, tables = MinerUParser.parse_pdf(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user