mirror of
https://github.com/infiniflow/ragflow.git
synced 2025-12-26 08:56:47 +08:00
Refa: only support MinerU-API now (#11977)
### What problem does this PR solve?

Only MinerU-API is supported now; the frontend for the pipeline still needs to be completed to allow the configuration of MinerU options.

### Type of change

- [x] Refactoring
This commit is contained in:
@ -280,6 +280,7 @@ class Parser(ProcessBase):
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
parse_method=conf.get("mineru_parse_method", "raw"),
|
||||
lang=conf.get("lang", "Chinese"),
|
||||
)
|
||||
bboxes = []
|
||||
for t, poss in lines:
|
||||
|
||||
@ -398,7 +398,7 @@ class JinaMultiVecEmbed(Base):
|
||||
|
||||
ress.append(chunk_emb)
|
||||
|
||||
token_count +=total_token_count_from_response(res)
|
||||
token_count += total_token_count_from_response(res)
|
||||
except Exception as _e:
|
||||
log_exception(_e, response)
|
||||
raise Exception(f"Error: {response}")
|
||||
|
||||
@ -16,7 +16,7 @@
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import Any, Optional, Tuple
|
||||
from typing import Any, Optional
|
||||
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
|
||||
@ -25,7 +25,7 @@ class Base:
|
||||
def __init__(self, key: str | dict, model_name: str, **kwargs):
|
||||
self.model_name = model_name
|
||||
|
||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> Tuple[Any, Any]:
|
||||
def parse_pdf(self, filepath: str, binary=None, **kwargs) -> tuple[Any, Any]:
|
||||
raise NotImplementedError("Please implement parse_pdf!")
|
||||
|
||||
|
||||
@ -56,21 +56,22 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||
self.mineru_backend = _resolve_config("mineru_backend", "MINERU_BACKEND", "pipeline")
|
||||
self.mineru_server_url = _resolve_config("mineru_server_url", "MINERU_SERVER_URL", "")
|
||||
self.mineru_delete_output = bool(int(_resolve_config("mineru_delete_output", "MINERU_DELETE_OUTPUT", 1)))
|
||||
self.mineru_executable = os.environ.get("MINERU_EXECUTABLE", "mineru")
|
||||
|
||||
logging.info(f"Parsed MinerU config: {config}")
|
||||
logging.info(
|
||||
f"Parsed MinerU config: backend={self.mineru_backend} api={self.mineru_api} server_url={self.mineru_server_url} output_dir={self.mineru_output_dir} delete_output={self.mineru_delete_output}"
|
||||
)
|
||||
|
||||
MinerUParser.__init__(self, mineru_path=self.mineru_executable, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
|
||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> Tuple[bool, str]:
|
||||
def check_available(self, backend: Optional[str] = None, server_url: Optional[str] = None) -> tuple[bool, str]:
|
||||
backend = backend or self.mineru_backend
|
||||
server_url = server_url or self.mineru_server_url
|
||||
return self.check_installation(backend=backend, server_url=server_url)
|
||||
|
||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw",**kwargs):
|
||||
def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
|
||||
ok, reason = self.check_available()
|
||||
if not ok:
|
||||
raise RuntimeError(f"MinerU not found or server not accessible: {reason}. Please install it via: pip install -U 'mineru[core]'.")
|
||||
raise RuntimeError(f"MinerU server not accessible: {reason}")
|
||||
|
||||
sections, tables = MinerUParser.parse_pdf(
|
||||
self,
|
||||
|
||||
@ -69,7 +69,6 @@ from common.signal_utils import start_tracemalloc_and_snapshot, stop_tracemalloc
|
||||
from common.exceptions import TaskCanceledException
|
||||
from common import settings
|
||||
from common.constants import PAGERANK_FLD, TAG_FLD, SVR_CONSUMER_GROUP_NAME
|
||||
from common.misc_utils import check_and_install_mineru
|
||||
|
||||
BATCH_SIZE = 64
|
||||
|
||||
@ -1169,7 +1168,6 @@ async def main():
|
||||
show_configs()
|
||||
settings.init_settings()
|
||||
settings.check_and_install_torch()
|
||||
check_and_install_mineru()
|
||||
logging.info(f'default embedding config: {settings.EMBEDDING_CFG}')
|
||||
settings.print_rag_settings()
|
||||
if sys.platform != "win32":
|
||||
|
||||
Reference in New Issue
Block a user