From 9d0309aedce5bbc508cf2e1542da844660f81ec8 Mon Sep 17 00:00:00 2001
From: Yongteng Lei
Date: Mon, 1 Dec 2025 12:17:43 +0800
Subject: [PATCH] Fix: [MinerU] Missing output file (#11623)

### What problem does this PR solve?

Add fallbacks for the MinerU output path. #11613, #11620.
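A minimal standalone sketch of the lookup order that the rewritten `_read_output` applies may help review; the `find_content_list` helper below is illustrative only, not code from this patch:

```python
from pathlib import Path
from typing import Optional


def find_content_list(output_dir: Path, file_stem: str, method: str, backend: str) -> Optional[Path]:
    # Preferred layout first, then the other layouts MinerU is known to emit.
    if backend.startswith("vlm-"):
        order = ["vlm", method, "auto"]
    else:
        order = [method, "vlm", "auto"]

    seen = set()
    for sub in order:
        if not sub or sub in seen:  # skip an empty method and duplicates (e.g. method == "auto")
            continue
        seen.add(sub)
        candidate = output_dir / file_stem / sub / f"{file_stem}_content_list.json"
        if candidate.exists():
            return candidate
    return None  # the caller raises FileNotFoundError listing every candidate tried
```

With `backend="pipeline"` and `method="auto"`, for example, the candidates collapse to `auto` followed by `vlm`.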
"auto", server_url: Optional[str] = None, delete_output: bool = True, - parse_method: str = "raw" + parse_method: str = "raw", ) -> tuple: import shutil @@ -570,7 +593,7 @@ class MinerUParser(RAGFlowPdfParser): self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.") if callback: callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") - + return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs) finally: if temp_pdf and temp_pdf.exists(): diff --git a/rag/llm/chat_model.py b/rag/llm/chat_model.py index 9fbc88348..726aecd8b 100644 --- a/rag/llm/chat_model.py +++ b/rag/llm/chat_model.py @@ -33,9 +33,9 @@ from openai.lib.azure import AzureOpenAI from strenum import StrEnum from zhipuai import ZhipuAI +from common.token_utils import num_tokens_from_string, total_token_count_from_response from rag.llm import FACTORY_DEFAULT_BASE_URL, LITELLM_PROVIDER_PREFIX, SupportedLiteLLMProvider from rag.nlp import is_chinese, is_english -from common.token_utils import num_tokens_from_string, total_token_count_from_response # Error message constants @@ -66,7 +66,7 @@ LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to class Base(ABC): def __init__(self, key, model_name, base_url, **kwargs): - timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600)) + timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600)) self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout) self.model_name = model_name # Configure retry parameters @@ -127,7 +127,7 @@ class Base(ABC): "tool_choice", "logprobs", "top_logprobs", - "extra_headers" + "extra_headers", } gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf} @@ -1213,7 +1213,7 @@ class GoogleChat(Base): # Build GenerateContentConfig try: - from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part + from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig except ImportError as e: logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0") raise @@ -1242,14 +1242,14 @@ class GoogleChat(Base): role = "model" if item["role"] == "assistant" else item["role"] content = Content( role=role, - parts=[Part(text=item["content"])] + parts=[Part(text=item["content"])], ) contents.append(content) response = self.client.models.generate_content( model=self.model_name, contents=contents, - config=config + config=config, ) ans = response.text @@ -1299,7 +1299,7 @@ class GoogleChat(Base): # Build GenerateContentConfig try: - from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part + from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig except ImportError as e: logging.error(f"[GoogleChat] Failed to import google-genai: {e}. 
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
@@ -1326,7 +1326,7 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)
 
@@ -1334,7 +1334,7 @@ class GoogleChat(Base):
         for chunk in self.client.models.generate_content_stream(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         ):
             text = chunk.text
             ans = text
@@ -1406,7 +1406,7 @@ class LiteLLMBase(ABC):
     ]
 
     def __init__(self, key, model_name, base_url=None, **kwargs):
-        self.timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        self.timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.provider = kwargs.get("provider", "")
         self.prefix = LITELLM_PROVIDER_PREFIX.get(self.provider, "")
         self.model_name = f"{self.prefix}{model_name}"
@@ -1625,6 +1625,7 @@ class LiteLLMBase(ABC):
 
         if self.provider == SupportedLiteLLMProvider.OpenRouter:
             if self.provider_order:
+
                 def _to_order_list(x):
                     if x is None:
                         return []
@@ -1633,6 +1634,7 @@
                     if isinstance(x, (list, tuple)):
                         return [str(s).strip() for s in x if str(s).strip()]
                     return []
+
                 extra_body = {}
                 provider_cfg = {}
                 provider_order = _to_order_list(self.provider_order)
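Deployment note, not part of the diff above: `Base.__init__` and `LiteLLMBase.__init__` now read `LLM_TIMEOUT_SECONDS` instead of `LM_TIMEOUT_SECONDS`, so environments that still export the old name will silently fall back to the 600-second default. A minimal sketch of a migration shim, assuming both names must be honored for a transition period (this shim is not in the PR):

```python
import os

# Hypothetical shim: prefer the new variable name, fall back to the
# legacy one, then to the 600-second default used by the code above.
timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", os.environ.get("LM_TIMEOUT_SECONDS", 600)))
```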