Fix: [MinerU] Missing output file (#11623)

### What problem does this PR solve?

Add fallbacks for the MinerU output path: instead of requiring `<file_stem>/<method>/<file_stem>_content_list.json` to exist, try the `vlm`, selected-method, and `auto` subdirectories in turn, and report every path tried when all of them are missing. #11613, #11620.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Commit 9d0309aedc (parent 9a8ce9d3e2), authored by Yongteng Lei on 2025-12-01 12:17:43 +08:00, committed by GitHub.
2 changed files with 48 additions and 23 deletions.

The first changed file is the MinerU PDF parser:

```diff
@@ -190,7 +190,7 @@ class MinerUParser(RAGFlowPdfParser):
         self._run_mineru_executable(input_path, output_dir, method, backend, lang, server_url, callback)

     def _run_mineru_api(self, input_path: Path, output_dir: Path, method: str = "auto", backend: str = "pipeline", lang: Optional[str] = None, callback: Optional[Callable] = None):
-        OUTPUT_ZIP_PATH = os.path.join(str(output_dir), "output.zip")
+        output_zip_path = os.path.join(str(output_dir), "output.zip")
         pdf_file_path = str(input_path)
```
```diff
@@ -230,16 +230,16 @@ class MinerUParser(RAGFlowPdfParser):
         response.raise_for_status()

         if response.headers.get("Content-Type") == "application/zip":
-            self.logger.info(f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+            self.logger.info(f"[MinerU] zip file returned, saving to {output_zip_path}...")
             if callback:
-                callback(0.30, f"[MinerU] zip file returned, saving to {OUTPUT_ZIP_PATH}...")
+                callback(0.30, f"[MinerU] zip file returned, saving to {output_zip_path}...")
-            with open(OUTPUT_ZIP_PATH, "wb") as f:
+            with open(output_zip_path, "wb") as f:
                 f.write(response.content)

             self.logger.info(f"[MinerU] Unzip to {output_path}...")
-            self._extract_zip_no_root(OUTPUT_ZIP_PATH, output_path, pdf_file_name + "/")
+            self._extract_zip_no_root(output_zip_path, output_path, pdf_file_name + "/")
             if callback:
                 callback(0.40, f"[MinerU] Unzip to {output_path}...")
```
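`_extract_zip_no_root` itself is outside the hunk. A minimal sketch of what such a helper plausibly does, assuming the API's zip nests everything under the `pdf_file_name + "/"` root passed above (the actual implementation in the repo may differ):

```python
import zipfile
from pathlib import Path


def extract_zip_no_root(zip_path: str, output_path: Path, root_prefix: str) -> None:
    """Extract a zip archive, dropping a leading root folder from every member name."""
    with zipfile.ZipFile(zip_path) as zf:
        for member in zf.namelist():
            # "<pdf_file_name>/layout.json" -> "layout.json"
            target = member[len(root_prefix):] if member.startswith(root_prefix) else member
            if not target:  # the root directory entry itself
                continue
            dest = output_path / target
            if member.endswith("/"):
                dest.mkdir(parents=True, exist_ok=True)
            else:
                dest.parent.mkdir(parents=True, exist_ok=True)
                dest.write_bytes(zf.read(member))
```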
```diff
@@ -459,13 +459,36 @@ class MinerUParser(RAGFlowPdfParser):
         return poss

     def _read_output(self, output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> list[dict[str, Any]]:
-        subdir = output_dir / file_stem / method
-        if backend.startswith("vlm-"):
-            subdir = output_dir / file_stem / "vlm"
-        json_file = subdir / f"{file_stem}_content_list.json"
-        if not json_file.exists():
-            raise FileNotFoundError(f"[MinerU] Missing output file: {json_file}")
+        candidates = []
+        seen = set()
+
+        def add_candidate_path(p: Path):
+            if p not in seen:
+                seen.add(p)
+                candidates.append(p)
+
+        if backend.startswith("vlm-"):
+            add_candidate_path(output_dir / file_stem / "vlm")
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "auto")
+        else:
+            if method:
+                add_candidate_path(output_dir / file_stem / method)
+            add_candidate_path(output_dir / file_stem / "vlm")
+            add_candidate_path(output_dir / file_stem / "auto")
+
+        json_file = None
+        subdir = None
+        for sub in candidates:
+            jf = sub / f"{file_stem}_content_list.json"
+            if jf.exists():
+                subdir = sub
+                json_file = jf
+                break
+
+        if not json_file:
+            raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}")

         with open(json_file, "r", encoding="utf-8") as f:
             data = json.load(f)
```
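Pulled out as a standalone sketch, the new lookup logic reads like this (names mirror the diff; the only simplification is returning the found path instead of keeping `subdir`/`json_file` locals):

```python
from pathlib import Path


def find_content_list(output_dir: Path, file_stem: str, method: str = "auto", backend: str = "pipeline") -> Path:
    """Return the first existing <stem>_content_list.json among candidate MinerU output dirs."""
    candidates: list[Path] = []
    seen: set[Path] = set()

    def add(p: Path) -> None:
        if p not in seen:
            seen.add(p)
            candidates.append(p)

    base = output_dir / file_stem
    if backend.startswith("vlm-"):
        # vlm backends write to .../<stem>/vlm first; fall back to the method dir, then "auto".
        add(base / "vlm")
        if method:
            add(base / method)
        add(base / "auto")
    else:
        if method:
            add(base / method)
        add(base / "vlm")
        add(base / "auto")

    for sub in candidates:
        jf = sub / f"{file_stem}_content_list.json"
        if jf.exists():
            return jf
    raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(c / (file_stem + '_content_list.json')) for c in candidates)}")
```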
```diff
@@ -520,7 +543,7 @@ class MinerUParser(RAGFlowPdfParser):
         method: str = "auto",
         server_url: Optional[str] = None,
         delete_output: bool = True,
-        parse_method: str = "raw"
+        parse_method: str = "raw",
     ) -> tuple:
         import shutil
```
```diff
@@ -570,7 +593,7 @@ class MinerUParser(RAGFlowPdfParser):
             self.logger.info(f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             if callback:
                 callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
             return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
         finally:
             if temp_pdf and temp_pdf.exists():
```

The second changed file is the LLM chat-model module, which picks up the rename of `LM_TIMEOUT_SECONDS` to `LLM_TIMEOUT_SECONDS` plus import-ordering and trailing-comma formatting fixes:

```diff
@@ -33,9 +33,9 @@ from openai.lib.azure import AzureOpenAI
 from strenum import StrEnum
 from zhipuai import ZhipuAI

+from common.token_utils import num_tokens_from_string, total_token_count_from_response
 from rag.llm import FACTORY_DEFAULT_BASE_URL, LITELLM_PROVIDER_PREFIX, SupportedLiteLLMProvider
 from rag.nlp import is_chinese, is_english
-from common.token_utils import num_tokens_from_string, total_token_count_from_response

 # Error message constants
```
```diff
@@ -66,7 +66,7 @@ LENGTH_NOTIFICATION_EN = "...\nThe answer is truncated by your chosen LLM due to
 class Base(ABC):
     def __init__(self, key, model_name, base_url, **kwargs):
-        timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.client = OpenAI(api_key=key, base_url=base_url, timeout=timeout)
         self.model_name = model_name
         # Configure retry parameters
```
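Note that the rename is silent for deployments still exporting the old variable: with `LM_TIMEOUT_SECONDS` set and `LLM_TIMEOUT_SECONDS` unset, the timeout falls back to the 600-second default. A hedged sketch of a compatibility shim, not part of this PR:

```python
import os


def llm_timeout(default: int = 600) -> int:
    """Prefer the new LLM_TIMEOUT_SECONDS, falling back to the legacy LM_TIMEOUT_SECONDS."""
    value = os.environ.get("LLM_TIMEOUT_SECONDS") or os.environ.get("LM_TIMEOUT_SECONDS")
    return int(value) if value else default
```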
```diff
@@ -127,7 +127,7 @@ class Base(ABC):
             "tool_choice",
             "logprobs",
             "top_logprobs",
-            "extra_headers"
+            "extra_headers",
         }
         gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
```
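The trailing-comma line sits inside the `allowed_conf` whitelist that the dict comprehension below it applies: any generation option the OpenAI client would reject is silently dropped. For example (abbreviated set, illustrative values):

```python
allowed_conf = {"temperature", "top_p", "max_tokens", "tool_choice", "extra_headers"}  # abbreviated
gen_conf = {"temperature": 0.2, "penalty": 1.1, "extra_headers": {"X-Trace": "1"}}
gen_conf = {k: v for k, v in gen_conf.items() if k in allowed_conf}
assert gen_conf == {"temperature": 0.2, "extra_headers": {"X-Trace": "1"}}  # unknown "penalty" is dropped
```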
```diff
@@ -1213,7 +1213,7 @@ class GoogleChat(Base):
         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
```diff
@@ -1242,14 +1242,14 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)

         response = self.client.models.generate_content(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         )
         ans = response.text
```
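For reference, a minimal standalone sketch of the `google-genai` call shape used here (the model name is illustrative; the client reads `GOOGLE_API_KEY` from the environment):

```python
from google import genai
from google.genai.types import Content, GenerateContentConfig, Part

client = genai.Client()  # picks up GOOGLE_API_KEY from the environment
contents = [Content(role="user", parts=[Part(text="Summarize RAG in one sentence.")])]
config = GenerateContentConfig(temperature=0.2)
response = client.models.generate_content(
    model="gemini-2.0-flash",  # illustrative model name
    contents=contents,
    config=config,
)
print(response.text)
```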
```diff
@@ -1299,7 +1299,7 @@ class GoogleChat(Base):
         # Build GenerateContentConfig
         try:
-            from google.genai.types import GenerateContentConfig, ThinkingConfig, Content, Part
+            from google.genai.types import Content, GenerateContentConfig, Part, ThinkingConfig
         except ImportError as e:
             logging.error(f"[GoogleChat] Failed to import google-genai: {e}. Please install: pip install google-genai>=1.41.0")
             raise
```
```diff
@@ -1326,7 +1326,7 @@ class GoogleChat(Base):
             role = "model" if item["role"] == "assistant" else item["role"]
             content = Content(
                 role=role,
-                parts=[Part(text=item["content"])]
+                parts=[Part(text=item["content"])],
             )
             contents.append(content)
```
```diff
@@ -1334,7 +1334,7 @@ class GoogleChat(Base):
         for chunk in self.client.models.generate_content_stream(
             model=self.model_name,
             contents=contents,
-            config=config
+            config=config,
         ):
             text = chunk.text
             ans = text
```
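The streaming variant is the same call through `generate_content_stream`; reusing `client`, `contents`, and `config` from the sketch above:

```python
for chunk in client.models.generate_content_stream(
    model="gemini-2.0-flash",  # illustrative model name
    contents=contents,
    config=config,
):
    if chunk.text:  # chunks without text parts yield None
        print(chunk.text, end="", flush=True)
```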
```diff
@@ -1406,7 +1406,7 @@ class LiteLLMBase(ABC):
     ]

     def __init__(self, key, model_name, base_url=None, **kwargs):
-        self.timeout = int(os.environ.get("LM_TIMEOUT_SECONDS", 600))
+        self.timeout = int(os.environ.get("LLM_TIMEOUT_SECONDS", 600))
         self.provider = kwargs.get("provider", "")
         self.prefix = LITELLM_PROVIDER_PREFIX.get(self.provider, "")
         self.model_name = f"{self.prefix}{model_name}"
```
```diff
@@ -1625,6 +1625,7 @@ class LiteLLMBase(ABC):
         if self.provider == SupportedLiteLLMProvider.OpenRouter:
             if self.provider_order:
+
                 def _to_order_list(x):
                     if x is None:
                         return []
@@ -1633,6 +1634,7 @@ class LiteLLMBase(ABC):
                     if isinstance(x, (list, tuple)):
                         return [str(s).strip() for s in x if str(s).strip()]
                     return []
+
                 extra_body = {}
                 provider_cfg = {}
                 provider_order = _to_order_list(self.provider_order)
```
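The middle branch of `_to_order_list` falls in the gap between the two hunks (old lines 1628-1632) and is not shown. A plausible reconstruction of the whole helper, with the string branch an assumption (comma-separated provider names):

```python
def _to_order_list(x):
    """Normalize an OpenRouter provider-order setting to a list of provider names."""
    if x is None:
        return []
    if isinstance(x, str):  # assumption: comma-separated string, e.g. "openai,anthropic"
        return [s.strip() for s in x.split(",") if s.strip()]
    if isinstance(x, (list, tuple)):
        return [str(s).strip() for s in x if str(s).strip()]
    return []
```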